diff --git a/projects/hipblaslt/CHANGELOG.md b/projects/hipblaslt/CHANGELOG.md index 91d8f63c19e..2a033488540 100644 --- a/projects/hipblaslt/CHANGELOG.md +++ b/projects/hipblaslt/CHANGELOG.md @@ -12,6 +12,7 @@ Full documentation for hipBLASLt is available at [rocm.docs.amd.com/projects/hip * Replaced `install.sh` with an invoke-based task runner (`tasks.py`) to support cross-platform builds including Windows (ROCm 7.0+). * gtest and msgpack-cxx are now fetched automatically via CMake FetchContent if not found on the system. +* Greatly improved MXFP4 GEMM performance when using `HIPBLASLT_MATMUL_MATRIX_SCALE_BLK32_UE8M0_32_8_EXT`. ## hipBLASLt 1.2.2 for ROCm 7.2.1 diff --git a/projects/hipblaslt/clients/common/include/hipblaslt_datatype2string.hpp b/projects/hipblaslt/clients/common/include/hipblaslt_datatype2string.hpp index ec846a19e81..51cf32e62de 100644 --- a/projects/hipblaslt/clients/common/include/hipblaslt_datatype2string.hpp +++ b/projects/hipblaslt/clients/common/include/hipblaslt_datatype2string.hpp @@ -167,25 +167,21 @@ inline std::vector preTileSizeForScaleB(hipblaslt_scaling_format s) } } -// Compute scale buffer size accounting for padding by preSwizzleScalesGFX950. +// Compute scale buffer size with padding for block-scaled MX formats. // dataRow, dataCol are the raw data matrix dimensions (A_row/A_col or B_row/B_col). -// When pre-swizzle is active, the output may be larger than the unpadded size -// because rows are padded to a multiple of 32 and cols to a multiple of 8. +// Scale dimensions are padded to ensure kernels that process data in 32-element (M/N) +// or 256-element (K) blocks always have valid scale entries: +// scaleRows = ceil(dataRow / blockSize) rounded up to multiple of 8 +// scaleCols = dataCol rounded up to multiple of 32 +// When pre-swizzle is active, additional layout requirements may apply but are +// already satisfied by the rounding above. 
inline size_t scaleBufferSize(int64_t dataRow, int64_t dataCol, hipblaslt_scaling_format s) { auto bs = blockSize(s); - size_t scaleRows = dataRow / bs; - size_t scaleCols = dataCol; + size_t scaleRows = ((dataRow + bs - 1) / bs + 7) / 8 * 8; + size_t scaleCols = ((dataCol + 31) / 32) * 32; - auto preSwizzle = preSwizzleSizeForScale(s); - if(preSwizzle.empty()) - return scaleRows * scaleCols; - - // preSwizzleScalesGFX950 is called with {scaleCols, scaleRows}. - // It pads numRows (=scaleCols) to multiple of 32, numCols (=scaleRows) to multiple of 8. - size_t paddedNumRows = ((scaleCols + 31) / 32) * 32; - size_t paddedNumCols = ((scaleRows + 7) / 8) * 8; - return paddedNumRows * paddedNumCols; + return scaleRows * scaleCols; } inline hipblaslt_internal_ostream& operator<<(hipblaslt_internal_ostream& os, diff --git a/projects/hipblaslt/clients/common/include/norm.hpp b/projects/hipblaslt/clients/common/include/norm.hpp index 65e9329e632..966e331a067 100644 --- a/projects/hipblaslt/clients/common/include/norm.hpp +++ b/projects/hipblaslt/clients/common/include/norm.hpp @@ -33,6 +33,7 @@ #include "utility.hpp" #include #include +#include #include #include #include @@ -557,45 +558,60 @@ bool norm_check(double norm_error) return false; } -// TODO: norm_check determines the required norm solely based on -// To (type). This might cause tests to fail when the input -// matrices are MX types (F4/F8/F6). A better way is -// needed to determine the required norm for MX types. 
-bool norm_check(double norm_error, hipDataType type) +// TODO: tune norm tolerance for MX FP6 and FP8 types +double norm_tolerance(hipDataType type) { switch(type) { case HIP_R_32F: - return norm_error < 0.00001; + return 0.00001; case HIP_R_64F: - return norm_error < 0.000000000001; + return 0.000000000001; case HIP_R_16F: - return norm_error < 0.01; + return 0.01; case HIP_R_16BF: - return norm_error < 0.1; + return 0.1; case HIP_R_8F_E4M3_FNUZ: case HIP_R_8F_E4M3: - return norm_error < 0.125; + return 0.125; case HIP_R_8F_E5M2_FNUZ: case HIP_R_8F_E5M2: - return norm_error < 0.25; + return 0.25; case HIP_R_32I: - return norm_error < 0.0001; + return 0.0001; case HIP_R_8I: - return norm_error < 0.01; - // TODO: find a suitable rnom value for f6 and f4 + return 0.01; + case HIP_R_4F_E2M1: + return 0.3; case HIP_R_6F_E2M3: case HIP_R_6F_E3M2: - case HIP_R_4F_E2M1: - return norm_error < 0.5; + return 0.5; default: - return false; + return 0.0; } } -bool norm_check(double norm_error, hipDataType type, hipblasComputeType_t compute_type) +bool norm_check(double norm_error, hipDataType type) { - if(compute_type == HIPBLAS_COMPUTE_32F_FAST_16BF && type == HIP_R_32F) - return norm_error < 0.5; - return norm_check(norm_error, type); + double tol = norm_tolerance(type); + return tol > 0.0 && norm_error < tol; +} + +bool norm_check(double norm_error, + hipDataType outputType, + hipblasComputeType_t compute_type, + hipDataType inputTypeA = static_cast(-1), + hipDataType inputTypeB = static_cast(-1)) +{ + double tol = norm_tolerance(outputType); + + if(compute_type == HIPBLAS_COMPUTE_32F_FAST_16BF && outputType == HIP_R_32F) + tol = std::max(tol, 0.5); + + if(static_cast(inputTypeA) >= 0) + tol = std::max(tol, norm_tolerance(inputTypeA)); + if(static_cast(inputTypeB) >= 0) + tol = std::max(tol, norm_tolerance(inputTypeB)); + + return tol > 0.0 && norm_error < tol; } diff --git a/projects/hipblaslt/clients/common/include/testing_matmul.hpp 
b/projects/hipblaslt/clients/common/include/testing_matmul.hpp index 8526952ade3..5a93d9cf130 100644 --- a/projects/hipblaslt/clients/common/include/testing_matmul.hpp +++ b/projects/hipblaslt/clients/common/include/testing_matmul.hpp @@ -79,6 +79,15 @@ extern "C" __global__ void flush_icache() :); } +// Convert element count to byte count, accounting for sub-byte packing. +// FP4 (4-bit) packs 2 elements per byte (integer division, so an odd count rounds down); all other types use realDataTypeSize. +size_t elementsToBytes(size_t numElements, hipDataType dtype) +{ + if(static_cast(dtype) == HIP_R_4F_E2M1) + return numElements / 2; + return numElements * realDataTypeSize(dtype); +} + bool isSwizzleSupported(hipDataType datatype) { switch(datatype) @@ -1235,7 +1244,8 @@ void check(hipStream_t stream, hipblaslt_error += norm_error; if(arg.norm_check_assert) { - CHECK_SUCCESS(norm_check(norm_error, To, arg.compute_type)); + CHECK_SUCCESS( + norm_check(norm_error, To, arg.compute_type, arg.a_type, arg.b_type)); } if(batchMode != HIPBLASLT_BATCH_MODE_POINTER_ARRAY) { @@ -1269,7 +1279,8 @@ void check(hipStream_t stream, hipblaslt_error += norm_error; if(arg.norm_check_assert) { - CHECK_SUCCESS(norm_check(norm_error, Taux, arg.compute_type)); + CHECK_SUCCESS( + norm_check(norm_error, Taux, arg.compute_type, arg.a_type, arg.b_type)); } } if(arg.gradient && arg.bias_vector) @@ -2248,8 +2259,8 @@ void testing_matmul_with_bias(const Arguments& arg, } else if(isBlockScaling(arg.scaleA)) { - // For MX format, use uin8_t for the scale (E8M0) - dScaleA.emplace_back(HIP_R_8U, size_scaleAVec[i] * block_count, HMM); + // For MX format, use uint8_t for the scale (E8M0), allocate for all batches + dScaleA.emplace_back(HIP_R_8U, size_scaleAVec[i] * num_batches[i] * block_count, HMM); CHECK_DEVICE_ALLOCATION(hipGetLastError()); } if(arg.scaleB == hipblaslt_scaling_format::Scalar @@ -2260,8 +2271,8 @@ void testing_matmul_with_bias(const Arguments& arg, } else if(isBlockScaling(arg.scaleB)) { - // For MX format, use uin8_t for the scale 
(E8M0) - dScaleB.emplace_back(HIP_R_8U, size_scaleBVec[i] * block_count, HMM); + // For MX format, use uint8_t for the scale (E8M0), allocate for all batches + dScaleB.emplace_back(HIP_R_8U, size_scaleBVec[i] * num_batches[i] * block_count, HMM); CHECK_DEVICE_ALLOCATION(hipGetLastError()); } if(arg.scaleC) @@ -2312,7 +2323,7 @@ void testing_matmul_with_bias(const Arguments& arg, } else if(isBlockScaling(arg.scaleA)) { - hScaleA.emplace_back(HIP_R_8U, size_scaleAVec[i]); + hScaleA.emplace_back(HIP_R_8U, size_scaleAVec[i] * num_batches[i]); } if(arg.scaleB == hipblaslt_scaling_format::Scalar || arg.scaleB == hipblaslt_scaling_format::Vector) @@ -2321,7 +2332,7 @@ void testing_matmul_with_bias(const Arguments& arg, } else if(isBlockScaling(arg.scaleB)) { - hScaleB.emplace_back(HIP_R_8U, size_scaleBVec[i]); + hScaleB.emplace_back(HIP_R_8U, size_scaleBVec[i] * num_batches[i]); } if(arg.scaleC) hScaleC.emplace_back(Talpha, 1); @@ -2507,23 +2518,34 @@ void testing_matmul_with_bias(const Arguments& arg, // (consists of data part and scale part) // preTile for A: {tileK, tileM} - swap from preTileSizeForScaleA which returns {tileM, tileK} auto preTileATmp = preTileSizeForScaleA(arg.scaleA); - auto preTileA = (preTileATmp.size() == 2) - ? std::vector{preTileATmp[1], preTileATmp[0]} - : std::vector{}; - refA.emplace_back(generateMXInput(TiA, - scaleDataType(arg.scaleA), - hA[i].buf(), - hScaleA[i].buf(), - A_row[i], - A_col[i], - lda[i], - transA == HIPBLAS_OP_T, - preSwizzleSizeForScale(arg.scaleA), - preTileA, - blockSize(arg.scaleA), - 1, - true, - hipblaslt_initialization2string(arg.initialization))); + auto preTileA = (preTileATmp.size() == 2) ? std::vector{preTileATmp[1], preTileATmp[0]} : std::vector{}; + // Compute batch strides in bytes for data and scale buffers. + size_t dataBatchBytesA = (num_batches[i] > 1) ? elementsToBytes(stride_a[i], TiA) : 0; + size_t scaleBatchBytesA = (num_batches[i] > 1) ? 
size_scaleAVec[i] : 0; + // Generate MX data for each batch and collect reference floats + std::vector refAAll; + refAAll.reserve(static_cast(A_row[i]) * A_col[i] * num_batches[i]); + for(int64_t b = 0; b < num_batches[i]; b++) + { + auto* dataPtr = reinterpret_cast(hA[i].buf()) + b * dataBatchBytesA; + auto* scalePtr = reinterpret_cast(hScaleA[i].buf()) + b * scaleBatchBytesA; + auto batchRef = generateMXInput(TiA, + scaleDataType(arg.scaleA), + dataPtr, + scalePtr, + A_row[i], + A_col[i], + lda[i], + transA == HIPBLAS_OP_T, + preSwizzleSizeForScale(arg.scaleA), + preTileA, + blockSize(arg.scaleA), + 1, + true, + hipblaslt_initialization2string(arg.initialization)); + refAAll.insert(refAAll.end(), batchRef.begin(), batchRef.end()); + } + refA.emplace_back(std::move(refAAll)); // Copy data and scale to device buffers CHECK_HIP_ERROR(synchronize(dA[i], hA[i], block_count)); CHECK_HIP_ERROR(synchronize(dScaleA[i], hScaleA[i], block_count)); @@ -2609,20 +2631,33 @@ void testing_matmul_with_bias(const Arguments& arg, // input data (consists of data part and scale part) // preTile for B: {tileK, tileN} auto preTileB = preTileSizeForScaleB(arg.scaleB); - refB.emplace_back(generateMXInput(TiB, - scaleDataType(arg.scaleB), - hB[i].buf(), - hScaleB[i].buf(), - B_row[i], - B_col[i], - ldb[i], - transB == HIPBLAS_OP_T, - preSwizzleSizeForScale(arg.scaleB), - preTileB, - 1, - blockSize(arg.scaleB), - false, - hipblaslt_initialization2string(arg.initialization))); + // Compute batch strides in bytes for data and scale buffers. + size_t dataBatchBytesB = (num_batches[i] > 1) ? elementsToBytes(stride_b[i], TiB) : 0; + size_t scaleBatchBytesB = (num_batches[i] > 1) ? 
size_scaleBVec[i] : 0; + // Generate MX data for each batch and collect reference floats + std::vector refBAll; + refBAll.reserve(static_cast(B_row[i]) * B_col[i] * num_batches[i]); + for(int64_t b = 0; b < num_batches[i]; b++) + { + auto* dataPtr = reinterpret_cast(hB[i].buf()) + b * dataBatchBytesB; + auto* scalePtr = reinterpret_cast(hScaleB[i].buf()) + b * scaleBatchBytesB; + auto batchRef = generateMXInput(TiB, + scaleDataType(arg.scaleB), + dataPtr, + scalePtr, + B_row[i], + B_col[i], + ldb[i], + transB == HIPBLAS_OP_T, + preSwizzleSizeForScale(arg.scaleB), + preTileB, + 1, + blockSize(arg.scaleB), + false, + hipblaslt_initialization2string(arg.initialization)); + refBAll.insert(refBAll.end(), batchRef.begin(), batchRef.end()); + } + refB.emplace_back(std::move(refBAll)); // Copy data and scale to device buffers CHECK_HIP_ERROR(synchronize(dB[i], hB[i], block_count)); CHECK_HIP_ERROR(synchronize(dScaleB[i], hScaleB[i], block_count)); @@ -4595,7 +4630,7 @@ void testing_matmul_with_bias(const Arguments& arg, lda[gemmIdx], isScaleBMXFormat ? reinterpret_cast(refB[gemmIdx].data()) - + stride_a[gemmIdx] * batchIdx * realDataTypeSize(HIP_R_32F) + + stride_b[gemmIdx] * batchIdx * realDataTypeSize(HIP_R_32F) : hB[gemmIdx].as() + stride_b[gemmIdx] * batchIdx * realDataTypeSize(TiB), ldb[gemmIdx], diff --git a/projects/hipblaslt/clients/common/src/mxDataGen.cpp b/projects/hipblaslt/clients/common/src/mxDataGen.cpp index fede1dbd701..4b6ee6a7bff 100644 --- a/projects/hipblaslt/clients/common/src/mxDataGen.cpp +++ b/projects/hipblaslt/clients/common/src/mxDataGen.cpp @@ -342,6 +342,7 @@ std::vector generateMXInput(hipDataType dataType, opt.min = initMethod == "uniform_01" ? 0. : (initMethod == "hpl" ? -.5 : min_val); opt.max = initMethod == "uniform_01" ? 1. : (initMethod == "hpl" ? 
.5 : max_val); opt.blockScaling = scaleBlockRowSize * scaleBlockColSize; + opt.forceDenorm = false; // Map string initMethod to DataInitMode if(initMethod == "Sequential") diff --git a/projects/hipblaslt/clients/tests/data/matmul_gtest.yaml b/projects/hipblaslt/clients/tests/data/matmul_gtest.yaml index 0bba4cfe96c..9a2883a36e3 100755 --- a/projects/hipblaslt/clients/tests/data/matmul_gtest.yaml +++ b/projects/hipblaslt/clients/tests/data/matmul_gtest.yaml @@ -2531,8 +2531,6 @@ Tests: - { a_type: f8_r, b_type: f4_r, c_type: f16_r, d_type: f16_r, compute_type: c_f32_r, scaleA: 3, scaleB: 3, scale_type: f32_r} - { a_type: f4_r, b_type: f4_r, c_type: f16_r, d_type: f16_r, compute_type: c_f32_r, scaleA: 3, scaleB: 3, scale_type: f32_r} - { a_type: f6_r, b_type: f6_r, c_type: f16_r, d_type: f16_r, compute_type: c_f32_r, scaleA: 3, scaleB: 3, scale_type: f32_r} - - { a_type: f4_r, b_type: f4_r, c_type: bf16_r, d_type: bf16_r, compute_type: c_f32_r, scaleA: 1001, scaleB: 1001, scale_type: f32_r} - - { a_type: f4_r, b_type: f4_r, c_type: bf16_r, d_type: bf16_r, compute_type: c_f32_r, scaleA: 1001, scaleB: 1001, scale_type: f32_r, swizzle_a: true} M: [3072] N: [3072] K: [16384] @@ -2546,24 +2544,6 @@ Tests: requested_solution_num: 10 gpu_arch: '950' -- name: matmul_mx_large_k_small_m_n - category: nightly - function: - matmul: - - { a_type: f4_r, b_type: f4_r, c_type: bf16_r, d_type: bf16_r, compute_type: c_f32_r, scaleA: 1001, scaleB: 1001, scale_type: f32_r, swizzle_a: true} - M: [128] - N: [128] - K: [222464] - transA: T - transB: N - alpha: 1.0 - beta: 0.0 - initialization: trig_float - unit_check: 0 - norm_check: 1 - requested_solution_num: -1 - gpu_arch: '950' - - name: matmul_mx_solution_index category: pre_checkin function: @@ -2605,31 +2585,6 @@ Tests: requested_solution_num: 3 gpu_arch: '950' -# MX datatypes with pre-swizzle for swizzle tile {32, 8} (scaleA/B: 1001 = Block_32_UE8M0_32_8_EXT) -# Tests the preSwizzleScale functionality -# Uses 16x16x128 MI 
instruction (subTileK = 128/32 = 4) -# TODO: Re-enable f8xf4 and f6xf6 -- name: matmul_mx_preswizzle_32x8_large - category: pre_checkin - function: - matmul: -# - { a_type: f8_r, b_type: f4_r, c_type: f16_r, d_type: f16_r, compute_type: c_f32_r, scaleA: 1001, scaleB: 1001, scale_type: f32_r} - - { a_type: f4_r, b_type: f4_r, c_type: bf16_r, d_type: bf16_r, compute_type: c_f32_r, scaleA: 1001, scaleB: 1001, scale_type: f32_r} -# - { a_type: f6_r, b_type: f6_r, c_type: f16_r, d_type: f16_r, compute_type: c_f32_r, scaleA: 1001, scaleB: 1001, scale_type: f32_r} - M: [4096] - N: [4352] - K: [16384] - transA: T - transB: N - alpha: 1.0 - beta: 0.0 - initialization: trig_float - unit_check: 0 - norm_check: 1 - algo_method: [2] - solution_index: 0x82880808 # Workgroup tile size = 256X256x256 - gpu_arch: '950' - - name: matmul_relu_clamp_useE category: pre_checkin function: @@ -2681,23 +2636,241 @@ Tests: beta: [ 0.0, 1.0 ] gpu_arch: '90a' -# This is for testing MX FP4 kernel using Tensile -- name: matmul_tensile_fp4 +# Subtile MXFP4 tests (gfx950) +# All use preswizzled scales (scaleA/B: 1001 = Block_32_UE8M0_32_8_EXT) +# Build: ./install.sh -dck -a gfx950 # NOTE(review): CHANGELOG states install.sh was replaced by the invoke-based tasks.py; confirm and update this command +# Run: ./hipblaslt-test --gtest_filter="*subtile_fp4*" + +# Basic correctness — BF16 + F16 output, batched +- name: matmul_subtile_fp4_basic category: quick function: matmul: - - { a_type: f4_r, b_type: f4_r, c_type: f32_r, d_type: f32_r, compute_type: c_f32_r, scaleA: 3, scaleB: 3, scale_type: f32_r} - M: [2048] - N: [2048] - K: [4096] + - { a_type: f4_r, b_type: f4_r, c_type: bf16_r, d_type: bf16_r, compute_type: c_f32_r, scaleA: 1001, scaleB: 1001, scale_type: f32_r} + - { a_type: f4_r, b_type: f4_r, c_type: f16_r, d_type: f16_r, compute_type: c_f32_r, scaleA: 1001, scaleB: 1001, scale_type: f32_r} + M: [128, 256, 4096] + N: [128, 256, 4096] + K: [256, 512] transA: T transB: N alpha: 1.0 beta: 0.0 - initialization: hpl + batch_count: [1, 2, 3] + initialization: trig_float + unit_check: 0 + norm_check: 1 + 
requested_solution_num: 1 + gpu_arch: '950' + +# Edge cases — k*64, non-multiple-of-32 (mult of 16) +- name: matmul_subtile_fp4_edges + category: pre_checkin + function: + matmul: + - { a_type: f4_r, b_type: f4_r, c_type: bf16_r, d_type: bf16_r, compute_type: c_f32_r, scaleA: 1001, scaleB: 1001, scale_type: f32_r} + - { a_type: f4_r, b_type: f4_r, c_type: f16_r, d_type: f16_r, compute_type: c_f32_r, scaleA: 1001, scaleB: 1001, scale_type: f32_r} + M: [48, 64, 112, 192, 320, 448] + N: [48, 64, 112, 192, 320, 448] + K: [256] + transA: T + transB: N + alpha: 1.0 + beta: 0.0 + batch_count: [1, 2] + initialization: trig_float + unit_check: 0 + norm_check: 1 + requested_solution_num: 1 + gpu_arch: '950' + +# Odd and non-multiple-of-16 sizes +- name: matmul_subtile_fp4_odd + category: pre_checkin + function: + matmul: + - { a_type: f4_r, b_type: f4_r, c_type: bf16_r, d_type: bf16_r, compute_type: c_f32_r, scaleA: 1001, scaleB: 1001, scale_type: f32_r} + - { a_type: f4_r, b_type: f4_r, c_type: f16_r, d_type: f16_r, compute_type: c_f32_r, scaleA: 1001, scaleB: 1001, scale_type: f32_r} + matrix_size: + - { M: 17, N: 33, K: 256 } + - { M: 63, N: 63, K: 256 } + - { M: 97, N: 129, K: 256 } + - { M: 33, N: 128, K: 256 } + - { M: 128, N: 33, K: 256 } + - { M: 50, N: 100, K: 256 } + - { M: 66, N: 128, K: 256 } + - { M: 128, N: 66, K: 256 } + - { M: 160, N: 160, K: 256 } + - { M: 288, N: 288, K: 256 } + - { M: 33, N: 65, K: 256 } + transA: T + transB: N + alpha: 1.0 + beta: 0.0 + batch_count: [1, 2] + initialization: trig_float + unit_check: 0 + norm_check: 1 + requested_solution_num: 1 + gpu_arch: '950' + +# Large DepthU (K multiple of 512) +- name: matmul_subtile_fp4_large_du + category: pre_checkin + function: + matmul: + - { a_type: f4_r, b_type: f4_r, c_type: bf16_r, d_type: bf16_r, compute_type: c_f32_r, scaleA: 1001, scaleB: 1001, scale_type: f32_r} + - { a_type: f4_r, b_type: f4_r, c_type: f16_r, d_type: f16_r, compute_type: c_f32_r, scaleA: 1001, scaleB: 1001, scale_type: 
f32_r} + M: [128, 256, 192, 320] + N: [128, 256, 192, 320] + K: [512, 1024] + transA: T + transB: N + alpha: 1.0 + beta: 0.0 + batch_count: [1, 2] + initialization: trig_float + unit_check: 0 + norm_check: 1 + requested_solution_num: 1 + gpu_arch: '950' + +# F32 output — full tiles, edges, batched +- name: matmul_subtile_fp4_f32_output + category: pre_checkin + function: + matmul: + - { a_type: f4_r, b_type: f4_r, c_type: f32_r, d_type: f32_r, compute_type: c_f32_r, scaleA: 1001, scaleB: 1001, scale_type: f32_r} + M: [48, 128, 192, 256, 320] + N: [48, 128, 192, 256, 320] + K: [256, 512] + transA: T + transB: N + alpha: 1.0 + beta: 0.0 + batch_count: [1, 2, 3] + initialization: trig_float + unit_check: 0 + norm_check: 1 + requested_solution_num: 1 + gpu_arch: '950' + +# F32 output — odd/non-aligned sizes +- name: matmul_subtile_fp4_f32_output_odd + category: pre_checkin + function: + matmul: + - { a_type: f4_r, b_type: f4_r, c_type: f32_r, d_type: f32_r, compute_type: c_f32_r, scaleA: 1001, scaleB: 1001, scale_type: f32_r} + matrix_size: + - { M: 17, N: 33, K: 256 } + - { M: 63, N: 63, K: 256 } + - { M: 50, N: 100, K: 256 } + - { M: 66, N: 66, K: 256 } + transA: T + transB: N + alpha: 1.0 + beta: 0.0 + batch_count: [1, 2] + initialization: trig_float + unit_check: 0 + norm_check: 1 + requested_solution_num: 1 + gpu_arch: '950' + +# Bias — BF16, F16, F32 output +# TODO: activation_type [relu, gelu] fails with batch_count > 1 for some sizes — kernel bug +- name: matmul_subtile_fp4_bias + category: nightly + function: + matmul: + - { a_type: f4_r, b_type: f4_r, c_type: bf16_r, d_type: bf16_r, compute_type: c_f32_r, scaleA: 1001, scaleB: 1001, scale_type: f32_r} + - { a_type: f4_r, b_type: f4_r, c_type: f16_r, d_type: f16_r, compute_type: c_f32_r, scaleA: 1001, scaleB: 1001, scale_type: f32_r} + - { a_type: f4_r, b_type: f4_r, c_type: f32_r, d_type: f32_r, compute_type: c_f32_r, scaleA: 1001, scaleB: 1001, scale_type: f32_r} + M: [128, 192] + N: [128, 192] + K: [256] + 
transA: T + transB: N + alpha: 1.0 + beta: [0.0, 2.0] + bias_vector: 1 + batch_count: [1, 2] + initialization: trig_float + unit_check: 0 + norm_check: 1 + requested_solution_num: 1 + gpu_arch: '950' + +# ScaleAlphaVec + bias + activations +- name: matmul_subtile_fp4_scaleAlphaVec + category: nightly + function: + matmul: + - { a_type: f4_r, b_type: f4_r, c_type: bf16_r, d_type: bf16_r, compute_type: c_f32_r, scaleA: 1001, scaleB: 1001, scale_type: f32_r} + - { a_type: f4_r, b_type: f4_r, c_type: f16_r, d_type: f16_r, compute_type: c_f32_r, scaleA: 1001, scaleB: 1001, scale_type: f32_r} + - { a_type: f4_r, b_type: f4_r, c_type: f32_r, d_type: f32_r, compute_type: c_f32_r, scaleA: 1001, scaleB: 1001, scale_type: f32_r} + M: [128, 192] + N: [128, 192] + K: [256] + transA: T + transB: N + alpha: 1.0 + beta: [0.0, 2.0] + bias_vector: 1 + scaleAlpha_vector: 1 + activation_type: [none, relu, gelu, sigmoid, swish] + initialization: trig_float + unit_check: 0 + norm_check: 1 + requested_solution_num: 1 + gpu_arch: '950' + +# TODO: clamp activation not applied by GPU for FP4 subtile — hipblaslt dispatch +# finds a solution but ignores HIPBLASLT_EPILOGUE_CLAMP_EXT. GPU output is unclamped +# while CPU reference correctly clamps. Not a test bug — library dispatch issue. 
+ +# Asymmetric M/N +- name: matmul_subtile_fp4_asymmetric + category: pre_checkin + function: + matmul: + - { a_type: f4_r, b_type: f4_r, c_type: bf16_r, d_type: bf16_r, compute_type: c_f32_r, scaleA: 1001, scaleB: 1001, scale_type: f32_r} + - { a_type: f4_r, b_type: f4_r, c_type: f16_r, d_type: f16_r, compute_type: c_f32_r, scaleA: 1001, scaleB: 1001, scale_type: f32_r} + matrix_size: + - { M: 192, N: 256, K: 256 } + - { M: 256, N: 192, K: 256 } + - { M: 48, N: 128, K: 256 } + - { M: 128, N: 48, K: 256 } + - { M: 64, N: 4096, K: 256 } + - { M: 4096, N: 64, K: 256 } + transA: T + transB: N + alpha: 1.0 + beta: 0.0 + initialization: trig_float unit_check: 0 norm_check: 1 requested_solution_num: 1 gpu_arch: '950' +# Large sizes + multiple solutions (nightly) +- name: matmul_subtile_fp4_large + category: nightly + function: + matmul: + - { a_type: f4_r, b_type: f4_r, c_type: bf16_r, d_type: bf16_r, compute_type: c_f32_r, scaleA: 1001, scaleB: 1001, scale_type: f32_r} + - { a_type: f4_r, b_type: f4_r, c_type: f16_r, d_type: f16_r, compute_type: c_f32_r, scaleA: 1001, scaleB: 1001, scale_type: f32_r} + - { a_type: f4_r, b_type: f4_r, c_type: f32_r, d_type: f32_r, compute_type: c_f32_r, scaleA: 1001, scaleB: 1001, scale_type: f32_r} + M: [3072, 4096] + N: [3072, 4096] + K: [16384] + transA: T + transB: N + alpha: 1.0 + beta: 0.0 + initialization: trig_float + unit_check: 0 + norm_check: 1 + requested_solution_num: 10 + gpu_arch: '950' + ... 
diff --git a/projects/hipblaslt/library/src/amd_detail/rocblaslt/src/Tensile/Logic/asm_full/gfx950/gfx950/Origami/Origami_nta4/gfx950_Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs.yaml b/projects/hipblaslt/library/src/amd_detail/rocblaslt/src/Tensile/Logic/asm_full/gfx950/gfx950/Origami/Origami_nta4/gfx950_Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs.yaml new file mode 100644 index 00000000000..d85da542b2e --- /dev/null +++ b/projects/hipblaslt/library/src/amd_detail/rocblaslt/src/Tensile/Logic/asm_full/gfx950/gfx950/Origami/Origami_nta4/gfx950_Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs.yaml @@ -0,0 +1,16950 @@ +- {MinimumRequiredVersion: 5.0.0} +- gfx950 +- gfx950 +- [Device 75a0] +- Activation: true + ActivationComputeDataType: 0 + ActivationNoGuard: false + ActivationType: hipblaslt_all + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + BetaOnlyUseBias: false + BiasDataTypeList: [0, 7] + BiasSrc: D + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 21 + DataTypeA: 21 + DataTypeAmaxD: 0 + DataTypeB: 21 + DataTypeE: 7 + DataTypeMXSA: 22 + DataTypeMXSB: 22 + DestDataType: 7 + F32XdlMathOp: 0 + Gradient: false + GroupedGemm: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index01MXSA: 0 + Index01MXSB: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexAssignmentsMXSA: [3, 0, 2] + IndexAssignmentsMXSB: [3, 1, 2] + IndexAssignmentsMetadata: [3, 0, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndexUnrollM: 0 + IndexUnrollMXSA: 0 + IndexUnrollMXSB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MXBlockA: 32 + MXBlockB: 32 + MacDataTypeA: 21 + MacDataTypeB: 21 + MirrorDimsA: [] + MirrorDimsB: [] + MirrorDimsMXSA: [] + MirrorDimsMXSB: [] + MirrorDimsMetadata: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + 
NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + OutputAmaxD: false + SetConstStrideA: [] + SetConstStrideB: [] + SetConstStrideBias: [] + SilentHighPrecisionAccumulate: false + Sparse: 0 + StochasticRounding: false + StridedBatched: true + SupportUserArgs: true + SwizzleTensorA: false + SwizzleTensorB: false + TLUA: false + TLUB: false + TLUMXSA: false + TLUMXSB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: 1 + TransposeB: 0 + UseBeta: true + UseBias: 1 + UseE: false + UseInitialStridesAB: false + UseInitialStridesCD: false + UseScaleAB: Scalar + UseScaleAlphaVec: 1 + UseScaleCD: false +- - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AdaptiveGemm: 0 + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1DivByMT1LowbitGT1: 0 + AssertFree1ElementMultiple: 1 + AssertKRingShiftTailWrapOnly: 0 + AssertSummationElementMultiple: 256 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BAddrInterleave: false + BaseName: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT64hIsnhFT2L5uw_T8ryWf3mri6PtKpwJ1E97MC3rNtxpw= + BufferLoad: true + BufferStore: 1 + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 256 + DirectToLds: 1 + DirectToLdsA: true + DirectToLdsB: true + DirectToLdsMXSA: true + DirectToLdsMXSB: true + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprMXSA: false + DirectToVgprMXSB: false + DirectToVgprSparseMetadata: false + DtlPlusLdsBuf: 0 + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: false + ExpertSchedulingMode: 0 + ExtraLatencyForLR: 0 + ExtraMiLatencyLeft: -1 + ForceDisableShadowInit: false + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + 
GlobalReadVectorWidthA: 32 + GlobalReadVectorWidthB: 32 + GlobalReadVectorWidthMXSA: 2 + GlobalReadVectorWidthMXSB: 2 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + KRingShift: false + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT64x64x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT2_2_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 256 + LSCB: 256 + LSCMXSA: 8 + LSCMXSB: 8 + LSPA: 32 + LSPB: 32 + LSPMXSA: 64 + LSPMXSB: 64 + LVCA: 8 + LVCB: 8 + LVCMXSA: 4 + LVCMXSB: 4 + LVPA: 1 + LVPB: 1 + LVPMXSA: 32 + LVPMXSB: 32 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsBlockSizePerPadMXSA: 0 + LdsBlockSizePerPadMXSB: 0 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 50176 + LdsInitCVgprs: false + LdsNumBytes: 50176 + LdsNumElementsAlignedA: 8192 + LdsNumElementsAlignedB: 8192 + LdsNumElementsAlignedMXSA: 512 + LdsNumElementsAlignedMXSB: 512 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 32768 + LdsOffsetB: 9216 + LdsOffsetB_Blk: 41984 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + 
LdsOffsetBiasNonGSU: 0 + LdsOffsetMXSA: 8192 + LdsOffsetMXSA_Blk: 40960 + LdsOffsetMXSB: 8704 + LdsOffsetMXSB_Blk: 41472 + LdsOffsetMetadata: 9216 + LdsOffsetMetadata_Blk: 41984 + LdsPadA: 0 + LdsPadB: 0 + LdsPadMXSA: 0 + LdsPadMXSB: 0 + LdsPadMetadata: 0 + LocalReadVectorWidth: -1 + LocalReadVectorWidthA: 32 + LocalReadVectorWidthB: 32 + LocalReadVectorWidthMXSA: 1 + LocalReadVectorWidthMXSB: 1 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LocalWriteUseSgprMXSA: true + LocalWriteUseSgprMXSB: true + LoopIters: 2 + LoopUnroll: 256 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMXSA: 1 + MIInputPerThreadMXSB: 1 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [2, 2] + MIWaveTileA: 2 + MIWaveTileB: 2 + MIWaveTileMXSA: 2 + MIWaveTileMXSB: 2 + MIWaveTileMetadata: 0 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileMXSA: 64 + MacroTileMXSB: 64 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + MinGRIncPerMfma: 3 + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonDTLTailLoopMXSA: true + NonDTLTailLoopMXSB: true + NonTemporal: -1 + NonTemporalA: 4 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 4 + NonTemporalE: 0 + NonTemporalMXSA: 0 + NonTemporalMXSB: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 16 + NumLdsBlk: 2 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsCoalescedMXSA: 1 
+ NumLoadsCoalescedMXSB: 1 + NumLoadsMXSA: 1 + NumLoadsMXSB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularMXSA: 1 + NumLoadsPerpendicularMXSB: 1 + NumThreads: 256 + NumTotalPackedLoadsA: -1 + NumTotalPackedLoadsB: -1 + NumTotalPackedLoadsMXSA: -1 + NumTotalPackedLoadsMXSB: -1 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGROverBarrier: 0 + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 0 + SolutionNameMin: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT64x64x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT2_2_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM16_WGMXCC2_WGMXCCGn1 + SourceSwap: false + SpaceFillingAlgo: [] + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 4 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SubGroupMXSA: 8 + SubGroupMXSB: 32 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: 0 + TailloopInNll: false + ThreadTile: [1, 1] + ThreadTile0: 8 + ThreadTile1: 2 + ThreadTileA: 8 + ThreadTileB: 2 + ThreadTileMXSA: 8 + ThreadTileMXSB: 2 + TransposeLDS: 1 + TransposeLDSMetadata: -1 + ULSGRODoubleG2L: 0 + 
UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMXSA: true + UnrollMajorLDSMXSB: true + Use64bShadowLimit: false + Use64bShadowLimitMX: false + UseCustomMainLoopSchedule: 0 + UseDirect32XEmulation: false + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: false + UseGeneralizedNLCOneA: false + UseGeneralizedNLCOneB: false + UseGeneralizedNLCOneMXSA: false + UseGeneralizedNLCOneMXSB: false + UseGeneralizedNLCOneMetadata: false + UseInstOffsetForGRO: 0 + UseMFMAF32XEmulation: false + UsePLRPack: 0 + UseSgprForGRO: -1 + UseSubtileImpl: true + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 1 + VectorWidthMXSA: 1 + VectorWidthMXSB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMXSA: 0 + WaveSeparateGlobalReadMXSB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 16 + WorkGroupMappingXCC: 2 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 256 + _DepthUA: 256 + _DepthUB: 256 + _DepthUMXSA: 8 + _DepthUMXSB: 8 + _DepthUMetadata: 256 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + enableLDSTrMXSA: false + enableLDSTrMXSB: false + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + tailLoopOptMXSA: false + tailLoopOptMXSB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AdaptiveGemm: 0 + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1DivByMT1LowbitGT1: 0 + AssertFree1ElementMultiple: 1 + AssertKRingShiftTailWrapOnly: 0 + AssertSummationElementMultiple: 256 + 
AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BAddrInterleave: false + BaseName: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT128AMjoHHGnVSJ1fjvuIJeKPVtejro7lpRL6TixtWnDac= + BufferLoad: true + BufferStore: 1 + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 256 + DirectToLds: 1 + DirectToLdsA: true + DirectToLdsB: true + DirectToLdsMXSA: true + DirectToLdsMXSB: true + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprMXSA: false + DirectToVgprMXSB: false + DirectToVgprSparseMetadata: false + DtlPlusLdsBuf: 0 + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: false + ExpertSchedulingMode: 0 + ExtraLatencyForLR: 0 + ExtraMiLatencyLeft: -1 + ForceDisableShadowInit: false + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 32 + GlobalReadVectorWidthB: 32 + GlobalReadVectorWidthMXSA: 4 + GlobalReadVectorWidthMXSB: 4 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + KRingShift: false + Kernel: true + KernelLanguage: Assembly + KernelNameMin: 
Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT128x128x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT4_4_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 256 + LSCB: 256 + LSCMXSA: 8 + LSCMXSB: 8 + LSPA: 32 + LSPB: 32 + LSPMXSA: 128 + LSPMXSB: 128 + LVCA: 8 + LVCB: 8 + LVCMXSA: 2 + LVCMXSB: 2 + LVPA: 1 + LVPB: 1 + LVPMXSA: 32 + LVPMXSB: 32 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsBlockSizePerPadMXSA: 0 + LdsBlockSizePerPadMXSB: 0 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 100352 + LdsInitCVgprs: false + LdsNumBytes: 100352 + LdsNumElementsAlignedA: 16384 + LdsNumElementsAlignedB: 16384 + LdsNumElementsAlignedMXSA: 1024 + LdsNumElementsAlignedMXSB: 1024 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 18432 + LdsOffsetB_Blk: 83968 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMXSA: 16384 + LdsOffsetMXSA_Blk: 81920 + LdsOffsetMXSB: 17408 + LdsOffsetMXSB_Blk: 82944 + LdsOffsetMetadata: 18432 + LdsOffsetMetadata_Blk: 83968 + LdsPadA: 0 + LdsPadB: 0 + LdsPadMXSA: 0 + LdsPadMXSB: 0 + LdsPadMetadata: 0 + LocalReadVectorWidth: -1 + LocalReadVectorWidthA: 32 + LocalReadVectorWidthB: 32 + LocalReadVectorWidthMXSA: 1 + LocalReadVectorWidthMXSB: 1 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LocalWriteUseSgprMXSA: true + LocalWriteUseSgprMXSB: true + LoopIters: 2 + LoopUnroll: 256 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + 
MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMXSA: 1 + MIInputPerThreadMXSB: 1 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MIWaveTileMXSA: 4 + MIWaveTileMXSB: 4 + MIWaveTileMetadata: 0 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileMXSA: 128 + MacroTileMXSB: 128 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + MinGRIncPerMfma: 3 + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonDTLTailLoopMXSA: true + NonDTLTailLoopMXSB: true + NonTemporal: -1 + NonTemporalA: 4 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 4 + NonTemporalE: 0 + NonTemporalMXSA: 0 + NonTemporalMXSB: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 64 + NumLdsBlk: 2 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsCoalescedMXSA: 1 + NumLoadsCoalescedMXSB: 1 + NumLoadsMXSA: 1 + NumLoadsMXSB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumLoadsPerpendicularMXSA: 1 + NumLoadsPerpendicularMXSB: 1 + NumThreads: 256 + NumTotalPackedLoadsA: -1 + NumTotalPackedLoadsB: -1 + NumTotalPackedLoadsMXSA: -1 + NumTotalPackedLoadsMXSB: -1 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGROverBarrier: 0 + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 1 + SolutionNameMin: 
Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT128x128x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT4_4_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM16_WGMXCC2_WGMXCCGn1 + SourceSwap: false + SpaceFillingAlgo: [] + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 4 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SubGroupMXSA: 8 + SubGroupMXSB: 32 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: 0 + TailloopInNll: false + ThreadTile: [1, 1] + ThreadTile0: 16 + ThreadTile1: 4 + ThreadTileA: 16 + ThreadTileB: 4 + ThreadTileMXSA: 16 + ThreadTileMXSB: 4 + TransposeLDS: 1 + TransposeLDSMetadata: -1 + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMXSA: true + UnrollMajorLDSMXSB: true + Use64bShadowLimit: false + Use64bShadowLimitMX: false + UseCustomMainLoopSchedule: 0 + UseDirect32XEmulation: false + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: false + UseGeneralizedNLCOneA: false + UseGeneralizedNLCOneB: false + UseGeneralizedNLCOneMXSA: false + UseGeneralizedNLCOneMXSB: false + UseGeneralizedNLCOneMetadata: false + UseInstOffsetForGRO: 0 + UseMFMAF32XEmulation: false + UsePLRPack: 0 + UseSgprForGRO: -1 + UseSubtileImpl: true + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 1 + 
VectorWidthMXSA: 1 + VectorWidthMXSB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMXSA: 0 + WaveSeparateGlobalReadMXSB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 16 + WorkGroupMappingXCC: 2 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 256 + _DepthUA: 256 + _DepthUB: 256 + _DepthUMXSA: 8 + _DepthUMXSB: 8 + _DepthUMetadata: 256 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + enableLDSTrMXSA: false + enableLDSTrMXSB: false + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + tailLoopOptMXSA: false + tailLoopOptMXSB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AdaptiveGemm: 0 + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1DivByMT1LowbitGT1: 0 + AssertFree1ElementMultiple: 1 + AssertKRingShiftTailWrapOnly: 0 + AssertSummationElementMultiple: 256 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BAddrInterleave: false + BaseName: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT19tVFE7Mochg2Q71HzQS4tIGctt0Td3p2SmmxHjjWSYTc= + BufferLoad: true + BufferStore: 1 + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 256 + DirectToLds: 1 + DirectToLdsA: true + DirectToLdsB: true + DirectToLdsMXSA: true + DirectToLdsMXSB: true + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprMXSA: false + DirectToVgprMXSB: false + DirectToVgprSparseMetadata: false + DtlPlusLdsBuf: 0 
+ EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: false + ExpertSchedulingMode: 0 + ExtraLatencyForLR: 0 + ExtraMiLatencyLeft: -1 + ForceDisableShadowInit: false + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 32 + GlobalReadVectorWidthB: 32 + GlobalReadVectorWidthMXSA: 6 + GlobalReadVectorWidthMXSB: 6 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + KRingShift: false + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT192x192x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT6_6_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 256 + LSCB: 256 + LSCMXSA: 8 + LSCMXSB: 8 + LSPA: 32 + LSPB: 32 + LSPMXSA: 192 + LSPMXSB: 192 + LVCA: 8 + LVCB: 8 + LVCMXSA: 2 + LVCMXSB: 2 + LVPA: 1 + LVPB: 1 + LVPMXSA: 32 + LVPMXSB: 32 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsBlockSizePerPadMXSA: 0 + LdsBlockSizePerPadMXSB: 0 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 117760 + LdsInitCVgprs: false + LdsNumBytes: 117760 + LdsNumElementsAlignedA: 24576 + 
LdsNumElementsAlignedB: 24576 + LdsNumElementsAlignedMXSA: 1536 + LdsNumElementsAlignedMXSB: 1536 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 27648 + LdsOffsetB_Blk: 93184 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMXSA: 24576 + LdsOffsetMXSA_Blk: 90112 + LdsOffsetMXSB: 26112 + LdsOffsetMXSB_Blk: 91648 + LdsOffsetMetadata: 27648 + LdsOffsetMetadata_Blk: 93184 + LdsPadA: 0 + LdsPadB: 0 + LdsPadMXSA: 0 + LdsPadMXSB: 0 + LdsPadMetadata: 0 + LocalReadVectorWidth: -1 + LocalReadVectorWidthA: 32 + LocalReadVectorWidthB: 32 + LocalReadVectorWidthMXSA: 1 + LocalReadVectorWidthMXSB: 1 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LocalWriteUseSgprMXSA: true + LocalWriteUseSgprMXSB: true + LoopIters: 2 + LoopUnroll: 256 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMXSA: 1 + MIInputPerThreadMXSB: 1 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [6, 6] + MIWaveTileA: 6 + MIWaveTileB: 6 + MIWaveTileMXSA: 6 + MIWaveTileMXSB: 6 + MIWaveTileMetadata: 0 + MacroTile0: 192 + MacroTile1: 192 + MacroTileA: 192 + MacroTileB: 192 + MacroTileMXSA: 192 + MacroTileMXSB: 192 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + MinGRIncPerMfma: 3 + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonDTLTailLoopMXSA: true + NonDTLTailLoopMXSB: true + NonTemporal: -1 + NonTemporalA: 4 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 4 + NonTemporalE: 0 + NonTemporalMXSA: 0 + 
NonTemporalMXSB: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 144 + NumGlobalWriteVectorsPerThread: 144 + NumLdsBlk: 2 + NumLoadsA: 6 + NumLoadsB: 6 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsCoalescedMXSA: 1 + NumLoadsCoalescedMXSB: 1 + NumLoadsMXSA: 1 + NumLoadsMXSB: 1 + NumLoadsPerpendicularA: 6 + NumLoadsPerpendicularB: 6 + NumLoadsPerpendicularMXSA: 1 + NumLoadsPerpendicularMXSB: 1 + NumThreads: 256 + NumTotalPackedLoadsA: -1 + NumTotalPackedLoadsB: -1 + NumTotalPackedLoadsMXSA: -1 + NumTotalPackedLoadsMXSB: -1 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGROverBarrier: 0 + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 2 + SolutionNameMin: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT192x192x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT6_6_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM16_WGMXCC2_WGMXCCGn1 + SourceSwap: false + SpaceFillingAlgo: [] + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 4 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SubGroupMXSA: 8 + SubGroupMXSB: 32 
+ SuppressNoLoadLoop: false + SwapGlobalReadOrder: 0 + TailloopInNll: false + ThreadTile: [1, 1] + ThreadTile0: 24 + ThreadTile1: 6 + ThreadTileA: 24 + ThreadTileB: 6 + ThreadTileMXSA: 24 + ThreadTileMXSB: 6 + TransposeLDS: 1 + TransposeLDSMetadata: -1 + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMXSA: true + UnrollMajorLDSMXSB: true + Use64bShadowLimit: false + Use64bShadowLimitMX: false + UseCustomMainLoopSchedule: 0 + UseDirect32XEmulation: false + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: false + UseGeneralizedNLCOneA: false + UseGeneralizedNLCOneB: false + UseGeneralizedNLCOneMXSA: false + UseGeneralizedNLCOneMXSB: false + UseGeneralizedNLCOneMetadata: false + UseInstOffsetForGRO: 0 + UseMFMAF32XEmulation: false + UsePLRPack: 0 + UseSgprForGRO: -1 + UseSubtileImpl: true + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 1 + VectorWidthMXSA: 1 + VectorWidthMXSB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMXSA: 0 + WaveSeparateGlobalReadMXSB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 16 + WorkGroupMappingXCC: 2 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 256 + _DepthUA: 256 + _DepthUB: 256 + _DepthUMXSA: 8 + _DepthUMXSB: 8 + _DepthUMetadata: 256 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + enableLDSTrMXSA: false + enableLDSTrMXSB: false + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + tailLoopOptMXSA: false + tailLoopOptMXSB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: 
false + ActivationFused: true + AdaptiveGemm: 0 + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1DivByMT1LowbitGT1: 0 + AssertFree1ElementMultiple: 1 + AssertKRingShiftTailWrapOnly: 0 + AssertSummationElementMultiple: 256 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BAddrInterleave: false + BaseName: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT25YgPR4XPaMfFjkFVdEJNxrCtJ2m9IQRE3ggDN5hyNKfg= + BufferLoad: true + BufferStore: 1 + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 256 + DirectToLds: 1 + DirectToLdsA: true + DirectToLdsB: true + DirectToLdsMXSA: true + DirectToLdsMXSB: true + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprMXSA: false + DirectToVgprMXSB: false + DirectToVgprSparseMetadata: false + DtlPlusLdsBuf: 0 + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: false + ExpertSchedulingMode: 0 + ExtraLatencyForLR: 0 + ExtraMiLatencyLeft: -1 + ForceDisableShadowInit: false + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 32 + GlobalReadVectorWidthB: 32 + GlobalReadVectorWidthMXSA: 4 + GlobalReadVectorWidthMXSB: 4 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + KRingShift: false + Kernel: true + KernelLanguage: Assembly + KernelNameMin: 
Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT256x256x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT8_8_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 256 + LSCB: 256 + LSCMXSA: 8 + LSCMXSB: 8 + LSPA: 32 + LSPB: 32 + LSPMXSA: 128 + LSPMXSB: 128 + LVCA: 8 + LVCB: 8 + LVCMXSA: 2 + LVCMXSB: 2 + LVPA: 1 + LVPB: 1 + LVPMXSA: 32 + LVPMXSB: 32 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsBlockSizePerPadMXSA: 0 + LdsBlockSizePerPadMXSB: 0 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 139264 + LdsInitCVgprs: false + LdsNumBytes: 139264 + LdsNumElementsAlignedA: 32768 + LdsNumElementsAlignedB: 32768 + LdsNumElementsAlignedMXSA: 2048 + LdsNumElementsAlignedMXSB: 2048 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 69632 + LdsOffsetB: 36864 + LdsOffsetB_Blk: 106496 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMXSA: 32768 + LdsOffsetMXSA_Blk: 102400 + LdsOffsetMXSB: 34816 + LdsOffsetMXSB_Blk: 104448 + LdsOffsetMetadata: 36864 + LdsOffsetMetadata_Blk: 106496 + LdsPadA: 0 + LdsPadB: 0 + LdsPadMXSA: 0 + LdsPadMXSB: 0 + LdsPadMetadata: 0 + LocalReadVectorWidth: -1 + LocalReadVectorWidthA: 32 + LocalReadVectorWidthB: 32 + LocalReadVectorWidthMXSA: 1 + LocalReadVectorWidthMXSB: 1 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LocalWriteUseSgprMXSA: true + LocalWriteUseSgprMXSB: true + LoopIters: 2 + LoopUnroll: 256 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + 
MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMXSA: 1 + MIInputPerThreadMXSB: 1 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [8, 8] + MIWaveTileA: 8 + MIWaveTileB: 8 + MIWaveTileMXSA: 8 + MIWaveTileMXSB: 8 + MIWaveTileMetadata: 0 + MacroTile0: 256 + MacroTile1: 256 + MacroTileA: 256 + MacroTileB: 256 + MacroTileMXSA: 256 + MacroTileMXSB: 256 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + MinGRIncPerMfma: 3 + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonDTLTailLoopMXSA: true + NonDTLTailLoopMXSB: true + NonTemporal: -1 + NonTemporalA: 4 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 4 + NonTemporalE: 0 + NonTemporalMXSA: 0 + NonTemporalMXSB: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 256 + NumGlobalWriteVectorsPerThread: 256 + NumLdsBlk: 2 + NumLoadsA: 8 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsCoalescedMXSA: 1 + NumLoadsCoalescedMXSB: 1 + NumLoadsMXSA: 2 + NumLoadsMXSB: 2 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 8 + NumLoadsPerpendicularMXSA: 2 + NumLoadsPerpendicularMXSB: 2 + NumThreads: 256 + NumTotalPackedLoadsA: -1 + NumTotalPackedLoadsB: -1 + NumTotalPackedLoadsMXSA: -1 + NumTotalPackedLoadsMXSB: -1 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGROverBarrier: 0 + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 3 + 
SolutionNameMin: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT256x256x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT8_8_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM16_WGMXCC2_WGMXCCGn1 + SourceSwap: false + SpaceFillingAlgo: [] + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: true + StoreSyncOpt: 0 + StoreVectorWidth: 4 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SubGroupMXSA: 8 + SubGroupMXSB: 32 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: 0 + TailloopInNll: false + ThreadTile: [1, 1] + ThreadTile0: 32 + ThreadTile1: 8 + ThreadTileA: 32 + ThreadTileB: 8 + ThreadTileMXSA: 32 + ThreadTileMXSB: 8 + TransposeLDS: 1 + TransposeLDSMetadata: -1 + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMXSA: true + UnrollMajorLDSMXSB: true + Use64bShadowLimit: false + Use64bShadowLimitMX: false + UseCustomMainLoopSchedule: 0 + UseDirect32XEmulation: false + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: false + UseGeneralizedNLCOneA: false + UseGeneralizedNLCOneB: false + UseGeneralizedNLCOneMXSA: false + UseGeneralizedNLCOneMXSB: false + UseGeneralizedNLCOneMetadata: false + UseInstOffsetForGRO: 0 + UseMFMAF32XEmulation: false + UsePLRPack: 0 + UseSgprForGRO: -1 + UseSubtileImpl: true + Valid: true + VectorStore: -1 + VectorWidthA: 1 + 
VectorWidthB: 1 + VectorWidthMXSA: 1 + VectorWidthMXSB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMXSA: 0 + WaveSeparateGlobalReadMXSB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 16 + WorkGroupMappingXCC: 2 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 256 + _DepthUA: 256 + _DepthUB: 256 + _DepthUMXSA: 8 + _DepthUMXSB: 8 + _DepthUMetadata: 256 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + enableLDSTrMXSA: false + enableLDSTrMXSB: false + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + tailLoopOptMXSA: false + tailLoopOptMXSB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AdaptiveGemm: 0 + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1DivByMT1LowbitGT1: 0 + AssertFree1ElementMultiple: 1 + AssertKRingShiftTailWrapOnly: 0 + AssertSummationElementMultiple: 256 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BAddrInterleave: false + BaseName: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT64yR3-UwlktRfU59ACgTfmpcg_Yk6f2Lb1knA4QU-BsnY= + BufferLoad: true + BufferStore: 1 + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 256 + DirectToLds: 1 + DirectToLdsA: true + DirectToLdsB: true + DirectToLdsMXSA: true + DirectToLdsMXSB: true + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprMXSA: false + DirectToVgprMXSB: false + DirectToVgprSparseMetadata: false + 
DtlPlusLdsBuf: 0 + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: false + ExpertSchedulingMode: 0 + ExtraLatencyForLR: 0 + ExtraMiLatencyLeft: -1 + ForceDisableShadowInit: false + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 32 + GlobalReadVectorWidthB: 32 + GlobalReadVectorWidthMXSA: 2 + GlobalReadVectorWidthMXSB: 4 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + KRingShift: false + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT64x128x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT2_4_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 256 + LSCB: 256 + LSCMXSA: 8 + LSCMXSB: 8 + LSPA: 32 + LSPB: 32 + LSPMXSA: 64 + LSPMXSB: 128 + LVCA: 8 + LVCB: 8 + LVCMXSA: 4 + LVCMXSB: 2 + LVPA: 1 + LVPB: 1 + LVPMXSA: 32 + LVPMXSB: 32 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsBlockSizePerPadMXSA: 0 + LdsBlockSizePerPadMXSB: 0 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 58880 + LdsInitCVgprs: false + LdsNumBytes: 58880 + 
LdsNumElementsAlignedA: 8192 + LdsNumElementsAlignedB: 16384 + LdsNumElementsAlignedMXSA: 512 + LdsNumElementsAlignedMXSB: 1024 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 32768 + LdsOffsetB: 9728 + LdsOffsetB_Blk: 42496 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMXSA: 8192 + LdsOffsetMXSA_Blk: 40960 + LdsOffsetMXSB: 8704 + LdsOffsetMXSB_Blk: 41472 + LdsOffsetMetadata: 9728 + LdsOffsetMetadata_Blk: 42496 + LdsPadA: 0 + LdsPadB: 0 + LdsPadMXSA: 0 + LdsPadMXSB: 0 + LdsPadMetadata: 0 + LocalReadVectorWidth: -1 + LocalReadVectorWidthA: 32 + LocalReadVectorWidthB: 32 + LocalReadVectorWidthMXSA: 1 + LocalReadVectorWidthMXSB: 1 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LocalWriteUseSgprMXSA: true + LocalWriteUseSgprMXSB: true + LoopIters: 2 + LoopUnroll: 256 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMXSA: 1 + MIInputPerThreadMXSB: 1 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [2, 4] + MIWaveTileA: 2 + MIWaveTileB: 4 + MIWaveTileMXSA: 2 + MIWaveTileMXSB: 4 + MIWaveTileMetadata: 0 + MacroTile0: 64 + MacroTile1: 128 + MacroTileA: 64 + MacroTileB: 128 + MacroTileMXSA: 64 + MacroTileMXSB: 128 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + MinGRIncPerMfma: 3 + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonDTLTailLoopMXSA: true + NonDTLTailLoopMXSB: true + NonTemporal: -1 + NonTemporalA: 4 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 4 + NonTemporalE: 0 + 
NonTemporalMXSA: 0 + NonTemporalMXSB: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 32 + NumLdsBlk: 2 + NumLoadsA: 2 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsCoalescedMXSA: 1 + NumLoadsCoalescedMXSB: 1 + NumLoadsMXSA: 1 + NumLoadsMXSB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 4 + NumLoadsPerpendicularMXSA: 1 + NumLoadsPerpendicularMXSB: 1 + NumThreads: 256 + NumTotalPackedLoadsA: -1 + NumTotalPackedLoadsB: -1 + NumTotalPackedLoadsMXSA: -1 + NumTotalPackedLoadsMXSB: -1 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGROverBarrier: 0 + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 4 + SolutionNameMin: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT64x128x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT2_4_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM16_WGMXCC2_WGMXCCGn1 + SourceSwap: false + SpaceFillingAlgo: [] + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 4 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SubGroupMXSA: 8 + 
SubGroupMXSB: 32 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: 0 + TailloopInNll: false + ThreadTile: [1, 1] + ThreadTile0: 8 + ThreadTile1: 4 + ThreadTileA: 8 + ThreadTileB: 4 + ThreadTileMXSA: 8 + ThreadTileMXSB: 4 + TransposeLDS: 1 + TransposeLDSMetadata: -1 + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMXSA: true + UnrollMajorLDSMXSB: true + Use64bShadowLimit: false + Use64bShadowLimitMX: false + UseCustomMainLoopSchedule: 0 + UseDirect32XEmulation: false + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: false + UseGeneralizedNLCOneA: false + UseGeneralizedNLCOneB: false + UseGeneralizedNLCOneMXSA: false + UseGeneralizedNLCOneMXSB: false + UseGeneralizedNLCOneMetadata: false + UseInstOffsetForGRO: 0 + UseMFMAF32XEmulation: false + UsePLRPack: 0 + UseSgprForGRO: -1 + UseSubtileImpl: true + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 1 + VectorWidthMXSA: 1 + VectorWidthMXSB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMXSA: 0 + WaveSeparateGlobalReadMXSB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 16 + WorkGroupMappingXCC: 2 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 256 + _DepthUA: 256 + _DepthUB: 256 + _DepthUMXSA: 8 + _DepthUMXSB: 8 + _DepthUMetadata: 256 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + enableLDSTrMXSA: false + enableLDSTrMXSB: false + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + tailLoopOptMXSA: false + tailLoopOptMXSB: false + - 1LDSBuffer: 0 + ActivationAlt: false + 
ActivationFuncCall: false + ActivationFused: true + AdaptiveGemm: 0 + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1DivByMT1LowbitGT1: 0 + AssertFree1ElementMultiple: 1 + AssertKRingShiftTailWrapOnly: 0 + AssertSummationElementMultiple: 256 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BAddrInterleave: false + BaseName: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT12ZclQEHdRRO1sBJL3TwidM33yIt-WYnDtDgY9P9JYRX4= + BufferLoad: true + BufferStore: 1 + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 256 + DirectToLds: 1 + DirectToLdsA: true + DirectToLdsB: true + DirectToLdsMXSA: true + DirectToLdsMXSB: true + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprMXSA: false + DirectToVgprMXSB: false + DirectToVgprSparseMetadata: false + DtlPlusLdsBuf: 0 + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: false + ExpertSchedulingMode: 0 + ExtraLatencyForLR: 0 + ExtraMiLatencyLeft: -1 + ForceDisableShadowInit: false + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 32 + GlobalReadVectorWidthB: 32 + GlobalReadVectorWidthMXSA: 4 + GlobalReadVectorWidthMXSB: 2 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + KRingShift: false + Kernel: true + KernelLanguage: Assembly + KernelNameMin: 
Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT128x64x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT4_2_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 256 + LSCB: 256 + LSCMXSA: 8 + LSCMXSB: 8 + LSPA: 32 + LSPB: 32 + LSPMXSA: 128 + LSPMXSB: 64 + LVCA: 8 + LVCB: 8 + LVCMXSA: 2 + LVCMXSB: 4 + LVPA: 1 + LVPB: 1 + LVPMXSA: 32 + LVPMXSB: 32 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsBlockSizePerPadMXSA: 0 + LdsBlockSizePerPadMXSB: 0 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 58880 + LdsInitCVgprs: false + LdsNumBytes: 58880 + LdsNumElementsAlignedA: 16384 + LdsNumElementsAlignedB: 8192 + LdsNumElementsAlignedMXSA: 1024 + LdsNumElementsAlignedMXSB: 512 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 32768 + LdsOffsetB: 17920 + LdsOffsetB_Blk: 50688 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMXSA: 16384 + LdsOffsetMXSA_Blk: 49152 + LdsOffsetMXSB: 17408 + LdsOffsetMXSB_Blk: 50176 + LdsOffsetMetadata: 17920 + LdsOffsetMetadata_Blk: 50688 + LdsPadA: 0 + LdsPadB: 0 + LdsPadMXSA: 0 + LdsPadMXSB: 0 + LdsPadMetadata: 0 + LocalReadVectorWidth: -1 + LocalReadVectorWidthA: 32 + LocalReadVectorWidthB: 32 + LocalReadVectorWidthMXSA: 1 + LocalReadVectorWidthMXSB: 1 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LocalWriteUseSgprMXSA: true + LocalWriteUseSgprMXSB: true + LoopIters: 2 + LoopUnroll: 256 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + 
MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMXSA: 1 + MIInputPerThreadMXSB: 1 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 2] + MIWaveTileA: 4 + MIWaveTileB: 2 + MIWaveTileMXSA: 4 + MIWaveTileMXSB: 2 + MIWaveTileMetadata: 0 + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileMXSA: 128 + MacroTileMXSB: 64 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + MinGRIncPerMfma: 3 + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonDTLTailLoopMXSA: true + NonDTLTailLoopMXSB: true + NonTemporal: -1 + NonTemporalA: 4 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 4 + NonTemporalE: 0 + NonTemporalMXSA: 0 + NonTemporalMXSB: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 32 + NumLdsBlk: 2 + NumLoadsA: 4 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsCoalescedMXSA: 1 + NumLoadsCoalescedMXSB: 1 + NumLoadsMXSA: 1 + NumLoadsMXSB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularMXSA: 1 + NumLoadsPerpendicularMXSB: 1 + NumThreads: 256 + NumTotalPackedLoadsA: -1 + NumTotalPackedLoadsB: -1 + NumTotalPackedLoadsMXSA: -1 + NumTotalPackedLoadsMXSB: -1 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGROverBarrier: 0 + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 5 + SolutionNameMin: 
Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT128x64x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT4_2_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM16_WGMXCC2_WGMXCCGn1 + SourceSwap: false + SpaceFillingAlgo: [] + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 4 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SubGroupMXSA: 8 + SubGroupMXSB: 32 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: 0 + TailloopInNll: false + ThreadTile: [1, 1] + ThreadTile0: 16 + ThreadTile1: 2 + ThreadTileA: 16 + ThreadTileB: 2 + ThreadTileMXSA: 16 + ThreadTileMXSB: 2 + TransposeLDS: 1 + TransposeLDSMetadata: -1 + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMXSA: true + UnrollMajorLDSMXSB: true + Use64bShadowLimit: false + Use64bShadowLimitMX: false + UseCustomMainLoopSchedule: 0 + UseDirect32XEmulation: false + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: false + UseGeneralizedNLCOneA: false + UseGeneralizedNLCOneB: false + UseGeneralizedNLCOneMXSA: false + UseGeneralizedNLCOneMXSB: false + UseGeneralizedNLCOneMetadata: false + UseInstOffsetForGRO: 0 + UseMFMAF32XEmulation: false + UsePLRPack: 0 + UseSgprForGRO: -1 + UseSubtileImpl: true + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 1 + 
VectorWidthMXSA: 1 + VectorWidthMXSB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMXSA: 0 + WaveSeparateGlobalReadMXSB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 16 + WorkGroupMappingXCC: 2 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 256 + _DepthUA: 256 + _DepthUB: 256 + _DepthUMXSA: 8 + _DepthUMXSB: 8 + _DepthUMetadata: 256 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + enableLDSTrMXSA: false + enableLDSTrMXSB: false + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + tailLoopOptMXSA: false + tailLoopOptMXSB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AdaptiveGemm: 0 + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1DivByMT1LowbitGT1: 0 + AssertFree1ElementMultiple: 1 + AssertKRingShiftTailWrapOnly: 0 + AssertSummationElementMultiple: 256 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BAddrInterleave: false + BaseName: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT64hJRTC6QzyytWkHaKB_tyuSMxvHfrHLHmDLGhNVVcILE= + BufferLoad: true + BufferStore: 1 + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 256 + DirectToLds: 1 + DirectToLdsA: true + DirectToLdsB: true + DirectToLdsMXSA: true + DirectToLdsMXSB: true + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprMXSA: false + DirectToVgprMXSB: false + DirectToVgprSparseMetadata: false + DtlPlusLdsBuf: 0 
+ EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: false + ExpertSchedulingMode: 0 + ExtraLatencyForLR: 0 + ExtraMiLatencyLeft: -1 + ForceDisableShadowInit: false + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 32 + GlobalReadVectorWidthB: 32 + GlobalReadVectorWidthMXSA: 2 + GlobalReadVectorWidthMXSB: 6 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + KRingShift: false + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT64x192x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT2_6_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 256 + LSCB: 256 + LSCMXSA: 8 + LSCMXSB: 8 + LSPA: 32 + LSPB: 32 + LSPMXSA: 64 + LSPMXSB: 192 + LVCA: 8 + LVCB: 8 + LVCMXSA: 4 + LVCMXSB: 2 + LVPA: 1 + LVPB: 1 + LVPMXSA: 32 + LVPMXSB: 32 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsBlockSizePerPadMXSA: 0 + LdsBlockSizePerPadMXSB: 0 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 100352 + LdsInitCVgprs: false + LdsNumBytes: 100352 + LdsNumElementsAlignedA: 8192 + 
LdsNumElementsAlignedB: 24576 + LdsNumElementsAlignedMXSA: 512 + LdsNumElementsAlignedMXSB: 1536 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 10240 + LdsOffsetB_Blk: 75776 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMXSA: 8192 + LdsOffsetMXSA_Blk: 73728 + LdsOffsetMXSB: 8704 + LdsOffsetMXSB_Blk: 74240 + LdsOffsetMetadata: 10240 + LdsOffsetMetadata_Blk: 75776 + LdsPadA: 0 + LdsPadB: 0 + LdsPadMXSA: 0 + LdsPadMXSB: 0 + LdsPadMetadata: 0 + LocalReadVectorWidth: -1 + LocalReadVectorWidthA: 32 + LocalReadVectorWidthB: 32 + LocalReadVectorWidthMXSA: 1 + LocalReadVectorWidthMXSB: 1 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LocalWriteUseSgprMXSA: true + LocalWriteUseSgprMXSB: true + LoopIters: 2 + LoopUnroll: 256 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMXSA: 1 + MIInputPerThreadMXSB: 1 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [2, 6] + MIWaveTileA: 2 + MIWaveTileB: 6 + MIWaveTileMXSA: 2 + MIWaveTileMXSB: 6 + MIWaveTileMetadata: 0 + MacroTile0: 64 + MacroTile1: 192 + MacroTileA: 64 + MacroTileB: 192 + MacroTileMXSA: 64 + MacroTileMXSB: 192 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + MinGRIncPerMfma: 3 + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonDTLTailLoopMXSA: true + NonDTLTailLoopMXSB: true + NonTemporal: -1 + NonTemporalA: 4 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 4 + NonTemporalE: 0 + NonTemporalMXSA: 0 + 
NonTemporalMXSB: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 48 + NumGlobalWriteVectorsPerThread: 48 + NumLdsBlk: 2 + NumLoadsA: 2 + NumLoadsB: 6 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsCoalescedMXSA: 1 + NumLoadsCoalescedMXSB: 1 + NumLoadsMXSA: 1 + NumLoadsMXSB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 6 + NumLoadsPerpendicularMXSA: 1 + NumLoadsPerpendicularMXSB: 1 + NumThreads: 256 + NumTotalPackedLoadsA: -1 + NumTotalPackedLoadsB: -1 + NumTotalPackedLoadsMXSA: -1 + NumTotalPackedLoadsMXSB: -1 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGROverBarrier: 0 + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 6 + SolutionNameMin: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT64x192x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT2_6_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM16_WGMXCC2_WGMXCCGn1 + SourceSwap: false + SpaceFillingAlgo: [] + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 4 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SubGroupMXSA: 8 + SubGroupMXSB: 32 + 
SuppressNoLoadLoop: false + SwapGlobalReadOrder: 0 + TailloopInNll: false + ThreadTile: [1, 1] + ThreadTile0: 8 + ThreadTile1: 6 + ThreadTileA: 8 + ThreadTileB: 6 + ThreadTileMXSA: 8 + ThreadTileMXSB: 6 + TransposeLDS: 1 + TransposeLDSMetadata: -1 + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMXSA: true + UnrollMajorLDSMXSB: true + Use64bShadowLimit: false + Use64bShadowLimitMX: false + UseCustomMainLoopSchedule: 0 + UseDirect32XEmulation: false + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: false + UseGeneralizedNLCOneA: false + UseGeneralizedNLCOneB: false + UseGeneralizedNLCOneMXSA: false + UseGeneralizedNLCOneMXSB: false + UseGeneralizedNLCOneMetadata: false + UseInstOffsetForGRO: 0 + UseMFMAF32XEmulation: false + UsePLRPack: 0 + UseSgprForGRO: -1 + UseSubtileImpl: true + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 1 + VectorWidthMXSA: 1 + VectorWidthMXSB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMXSA: 0 + WaveSeparateGlobalReadMXSB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 16 + WorkGroupMappingXCC: 2 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 256 + _DepthUA: 256 + _DepthUB: 256 + _DepthUMXSA: 8 + _DepthUMXSB: 8 + _DepthUMetadata: 256 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + enableLDSTrMXSA: false + enableLDSTrMXSB: false + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + tailLoopOptMXSA: false + tailLoopOptMXSB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: false + 
ActivationFused: true + AdaptiveGemm: 0 + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1DivByMT1LowbitGT1: 0 + AssertFree1ElementMultiple: 1 + AssertKRingShiftTailWrapOnly: 0 + AssertSummationElementMultiple: 256 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BAddrInterleave: false + BaseName: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT19UCUD2vak2YLHvH_Ztg5zXPDJUlX3NzBJWRuGhvKVzu8= + BufferLoad: true + BufferStore: 1 + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 256 + DirectToLds: 1 + DirectToLdsA: true + DirectToLdsB: true + DirectToLdsMXSA: true + DirectToLdsMXSB: true + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprMXSA: false + DirectToVgprMXSB: false + DirectToVgprSparseMetadata: false + DtlPlusLdsBuf: 0 + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: false + ExpertSchedulingMode: 0 + ExtraLatencyForLR: 0 + ExtraMiLatencyLeft: -1 + ForceDisableShadowInit: false + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 32 + GlobalReadVectorWidthB: 32 + GlobalReadVectorWidthMXSA: 6 + GlobalReadVectorWidthMXSB: 2 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + KRingShift: false + Kernel: true + KernelLanguage: Assembly + KernelNameMin: 
Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT192x64x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT6_2_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 256 + LSCB: 256 + LSCMXSA: 8 + LSCMXSB: 8 + LSPA: 32 + LSPB: 32 + LSPMXSA: 192 + LSPMXSB: 64 + LVCA: 8 + LVCB: 8 + LVCMXSA: 2 + LVCMXSB: 4 + LVPA: 1 + LVPB: 1 + LVPMXSA: 32 + LVPMXSB: 32 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsBlockSizePerPadMXSA: 0 + LdsBlockSizePerPadMXSB: 0 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 100352 + LdsInitCVgprs: false + LdsNumBytes: 100352 + LdsNumElementsAlignedA: 24576 + LdsNumElementsAlignedB: 8192 + LdsNumElementsAlignedMXSA: 1536 + LdsNumElementsAlignedMXSB: 512 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 26624 + LdsOffsetB_Blk: 92160 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMXSA: 24576 + LdsOffsetMXSA_Blk: 90112 + LdsOffsetMXSB: 26112 + LdsOffsetMXSB_Blk: 91648 + LdsOffsetMetadata: 26624 + LdsOffsetMetadata_Blk: 92160 + LdsPadA: 0 + LdsPadB: 0 + LdsPadMXSA: 0 + LdsPadMXSB: 0 + LdsPadMetadata: 0 + LocalReadVectorWidth: -1 + LocalReadVectorWidthA: 32 + LocalReadVectorWidthB: 32 + LocalReadVectorWidthMXSA: 1 + LocalReadVectorWidthMXSB: 1 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LocalWriteUseSgprMXSA: true + LocalWriteUseSgprMXSB: true + LoopIters: 2 + LoopUnroll: 256 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + 
MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMXSA: 1 + MIInputPerThreadMXSB: 1 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [6, 2] + MIWaveTileA: 6 + MIWaveTileB: 2 + MIWaveTileMXSA: 6 + MIWaveTileMXSB: 2 + MIWaveTileMetadata: 0 + MacroTile0: 192 + MacroTile1: 64 + MacroTileA: 192 + MacroTileB: 64 + MacroTileMXSA: 192 + MacroTileMXSB: 64 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + MinGRIncPerMfma: 3 + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonDTLTailLoopMXSA: true + NonDTLTailLoopMXSB: true + NonTemporal: -1 + NonTemporalA: 4 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 4 + NonTemporalE: 0 + NonTemporalMXSA: 0 + NonTemporalMXSB: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 48 + NumGlobalWriteVectorsPerThread: 48 + NumLdsBlk: 2 + NumLoadsA: 6 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsCoalescedMXSA: 1 + NumLoadsCoalescedMXSB: 1 + NumLoadsMXSA: 1 + NumLoadsMXSB: 1 + NumLoadsPerpendicularA: 6 + NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularMXSA: 1 + NumLoadsPerpendicularMXSB: 1 + NumThreads: 256 + NumTotalPackedLoadsA: -1 + NumTotalPackedLoadsB: -1 + NumTotalPackedLoadsMXSA: -1 + NumTotalPackedLoadsMXSB: -1 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGROverBarrier: 0 + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 7 + SolutionNameMin: 
Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT192x64x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT6_2_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM16_WGMXCC2_WGMXCCGn1 + SourceSwap: false + SpaceFillingAlgo: [] + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 4 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SubGroupMXSA: 8 + SubGroupMXSB: 32 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: 0 + TailloopInNll: false + ThreadTile: [1, 1] + ThreadTile0: 24 + ThreadTile1: 2 + ThreadTileA: 24 + ThreadTileB: 2 + ThreadTileMXSA: 24 + ThreadTileMXSB: 2 + TransposeLDS: 1 + TransposeLDSMetadata: -1 + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMXSA: true + UnrollMajorLDSMXSB: true + Use64bShadowLimit: false + Use64bShadowLimitMX: false + UseCustomMainLoopSchedule: 0 + UseDirect32XEmulation: false + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: false + UseGeneralizedNLCOneA: false + UseGeneralizedNLCOneB: false + UseGeneralizedNLCOneMXSA: false + UseGeneralizedNLCOneMXSB: false + UseGeneralizedNLCOneMetadata: false + UseInstOffsetForGRO: 0 + UseMFMAF32XEmulation: false + UsePLRPack: 0 + UseSgprForGRO: -1 + UseSubtileImpl: true + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 1 + 
VectorWidthMXSA: 1 + VectorWidthMXSB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMXSA: 0 + WaveSeparateGlobalReadMXSB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 16 + WorkGroupMappingXCC: 2 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 256 + _DepthUA: 256 + _DepthUB: 256 + _DepthUMXSA: 8 + _DepthUMXSB: 8 + _DepthUMetadata: 256 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + enableLDSTrMXSA: false + enableLDSTrMXSB: false + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + tailLoopOptMXSA: false + tailLoopOptMXSB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AdaptiveGemm: 0 + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1DivByMT1LowbitGT1: 0 + AssertFree1ElementMultiple: 1 + AssertKRingShiftTailWrapOnly: 0 + AssertSummationElementMultiple: 256 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BAddrInterleave: false + BaseName: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT64sCK37FdA-O3yvoTtVtg3QFjEpjbAv0RNmVuI_Ry968A= + BufferLoad: true + BufferStore: 1 + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 256 + DirectToLds: 1 + DirectToLdsA: true + DirectToLdsB: true + DirectToLdsMXSA: true + DirectToLdsMXSB: true + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprMXSA: false + DirectToVgprMXSB: false + DirectToVgprSparseMetadata: false + DtlPlusLdsBuf: 0 
+ EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: false + ExpertSchedulingMode: 0 + ExtraLatencyForLR: 0 + ExtraMiLatencyLeft: -1 + ForceDisableShadowInit: false + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 32 + GlobalReadVectorWidthB: 32 + GlobalReadVectorWidthMXSA: 2 + GlobalReadVectorWidthMXSB: 4 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + KRingShift: false + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT64x256x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT2_8_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 256 + LSCB: 256 + LSCMXSA: 8 + LSCMXSB: 8 + LSPA: 32 + LSPB: 32 + LSPMXSA: 64 + LSPMXSB: 128 + LVCA: 8 + LVCB: 8 + LVCMXSA: 4 + LVCMXSB: 2 + LVPA: 1 + LVPB: 1 + LVPMXSA: 32 + LVPMXSB: 32 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsBlockSizePerPadMXSA: 0 + LdsBlockSizePerPadMXSB: 0 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 109056 + LdsInitCVgprs: false + LdsNumBytes: 109056 + LdsNumElementsAlignedA: 8192 + 
LdsNumElementsAlignedB: 32768 + LdsNumElementsAlignedMXSA: 512 + LdsNumElementsAlignedMXSB: 2048 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 10752 + LdsOffsetB_Blk: 76288 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMXSA: 8192 + LdsOffsetMXSA_Blk: 73728 + LdsOffsetMXSB: 8704 + LdsOffsetMXSB_Blk: 74240 + LdsOffsetMetadata: 10752 + LdsOffsetMetadata_Blk: 76288 + LdsPadA: 0 + LdsPadB: 0 + LdsPadMXSA: 0 + LdsPadMXSB: 0 + LdsPadMetadata: 0 + LocalReadVectorWidth: -1 + LocalReadVectorWidthA: 32 + LocalReadVectorWidthB: 32 + LocalReadVectorWidthMXSA: 1 + LocalReadVectorWidthMXSB: 1 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LocalWriteUseSgprMXSA: true + LocalWriteUseSgprMXSB: true + LoopIters: 2 + LoopUnroll: 256 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMXSA: 1 + MIInputPerThreadMXSB: 1 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [2, 8] + MIWaveTileA: 2 + MIWaveTileB: 8 + MIWaveTileMXSA: 2 + MIWaveTileMXSB: 8 + MIWaveTileMetadata: 0 + MacroTile0: 64 + MacroTile1: 256 + MacroTileA: 64 + MacroTileB: 256 + MacroTileMXSA: 64 + MacroTileMXSB: 256 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + MinGRIncPerMfma: 3 + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonDTLTailLoopMXSA: true + NonDTLTailLoopMXSB: true + NonTemporal: -1 + NonTemporalA: 4 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 4 + NonTemporalE: 0 + NonTemporalMXSA: 0 + 
NonTemporalMXSB: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 64 + NumLdsBlk: 2 + NumLoadsA: 2 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsCoalescedMXSA: 1 + NumLoadsCoalescedMXSB: 1 + NumLoadsMXSA: 1 + NumLoadsMXSB: 2 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 8 + NumLoadsPerpendicularMXSA: 1 + NumLoadsPerpendicularMXSB: 2 + NumThreads: 256 + NumTotalPackedLoadsA: -1 + NumTotalPackedLoadsB: -1 + NumTotalPackedLoadsMXSA: -1 + NumTotalPackedLoadsMXSB: -1 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGROverBarrier: 0 + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 8 + SolutionNameMin: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT64x256x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT2_8_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM16_WGMXCC2_WGMXCCGn1 + SourceSwap: false + SpaceFillingAlgo: [] + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 4 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SubGroupMXSA: 8 + SubGroupMXSB: 32 + 
SuppressNoLoadLoop: false + SwapGlobalReadOrder: 0 + TailloopInNll: false + ThreadTile: [1, 1] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + ThreadTileMXSA: 8 + ThreadTileMXSB: 8 + TransposeLDS: 1 + TransposeLDSMetadata: -1 + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMXSA: true + UnrollMajorLDSMXSB: true + Use64bShadowLimit: false + Use64bShadowLimitMX: false + UseCustomMainLoopSchedule: 0 + UseDirect32XEmulation: false + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: false + UseGeneralizedNLCOneA: false + UseGeneralizedNLCOneB: false + UseGeneralizedNLCOneMXSA: false + UseGeneralizedNLCOneMXSB: false + UseGeneralizedNLCOneMetadata: false + UseInstOffsetForGRO: 0 + UseMFMAF32XEmulation: false + UsePLRPack: 0 + UseSgprForGRO: -1 + UseSubtileImpl: true + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 1 + VectorWidthMXSA: 1 + VectorWidthMXSB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMXSA: 0 + WaveSeparateGlobalReadMXSB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 16 + WorkGroupMappingXCC: 2 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 256 + _DepthUA: 256 + _DepthUB: 256 + _DepthUMXSA: 8 + _DepthUMXSB: 8 + _DepthUMetadata: 256 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + enableLDSTrMXSA: false + enableLDSTrMXSB: false + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + tailLoopOptMXSA: false + tailLoopOptMXSB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: false + 
ActivationFused: true + AdaptiveGemm: 0 + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1DivByMT1LowbitGT1: 0 + AssertFree1ElementMultiple: 1 + AssertKRingShiftTailWrapOnly: 0 + AssertSummationElementMultiple: 256 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BAddrInterleave: false + BaseName: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT25Tmym63Cp1OVwmuklAuBYLzHtSt0oUezVbK-tmV8hqYs= + BufferLoad: true + BufferStore: 1 + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 256 + DirectToLds: 1 + DirectToLdsA: true + DirectToLdsB: true + DirectToLdsMXSA: true + DirectToLdsMXSB: true + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprMXSA: false + DirectToVgprMXSB: false + DirectToVgprSparseMetadata: false + DtlPlusLdsBuf: 0 + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: false + ExpertSchedulingMode: 0 + ExtraLatencyForLR: 0 + ExtraMiLatencyLeft: -1 + ForceDisableShadowInit: false + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 32 + GlobalReadVectorWidthB: 32 + GlobalReadVectorWidthMXSA: 4 + GlobalReadVectorWidthMXSB: 2 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + KRingShift: false + Kernel: true + KernelLanguage: Assembly + KernelNameMin: 
Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT256x64x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT8_2_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 256 + LSCB: 256 + LSCMXSA: 8 + LSCMXSB: 8 + LSPA: 32 + LSPB: 32 + LSPMXSA: 128 + LSPMXSB: 64 + LVCA: 8 + LVCB: 8 + LVCMXSA: 2 + LVCMXSB: 4 + LVPA: 1 + LVPB: 1 + LVPMXSA: 32 + LVPMXSB: 32 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsBlockSizePerPadMXSA: 0 + LdsBlockSizePerPadMXSB: 0 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 109056 + LdsInitCVgprs: false + LdsNumBytes: 109056 + LdsNumElementsAlignedA: 32768 + LdsNumElementsAlignedB: 8192 + LdsNumElementsAlignedMXSA: 2048 + LdsNumElementsAlignedMXSB: 512 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 35328 + LdsOffsetB_Blk: 100864 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMXSA: 32768 + LdsOffsetMXSA_Blk: 98304 + LdsOffsetMXSB: 34816 + LdsOffsetMXSB_Blk: 100352 + LdsOffsetMetadata: 35328 + LdsOffsetMetadata_Blk: 100864 + LdsPadA: 0 + LdsPadB: 0 + LdsPadMXSA: 0 + LdsPadMXSB: 0 + LdsPadMetadata: 0 + LocalReadVectorWidth: -1 + LocalReadVectorWidthA: 32 + LocalReadVectorWidthB: 32 + LocalReadVectorWidthMXSA: 1 + LocalReadVectorWidthMXSB: 1 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LocalWriteUseSgprMXSA: true + LocalWriteUseSgprMXSB: true + LoopIters: 2 + LoopUnroll: 256 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + 
MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMXSA: 1 + MIInputPerThreadMXSB: 1 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [8, 2] + MIWaveTileA: 8 + MIWaveTileB: 2 + MIWaveTileMXSA: 8 + MIWaveTileMXSB: 2 + MIWaveTileMetadata: 0 + MacroTile0: 256 + MacroTile1: 64 + MacroTileA: 256 + MacroTileB: 64 + MacroTileMXSA: 256 + MacroTileMXSB: 64 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + MinGRIncPerMfma: 3 + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonDTLTailLoopMXSA: true + NonDTLTailLoopMXSB: true + NonTemporal: -1 + NonTemporalA: 4 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 4 + NonTemporalE: 0 + NonTemporalMXSA: 0 + NonTemporalMXSB: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 64 + NumLdsBlk: 2 + NumLoadsA: 8 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsCoalescedMXSA: 1 + NumLoadsCoalescedMXSB: 1 + NumLoadsMXSA: 2 + NumLoadsMXSB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularMXSA: 2 + NumLoadsPerpendicularMXSB: 1 + NumThreads: 256 + NumTotalPackedLoadsA: -1 + NumTotalPackedLoadsB: -1 + NumTotalPackedLoadsMXSA: -1 + NumTotalPackedLoadsMXSB: -1 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGROverBarrier: 0 + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 9 + SolutionNameMin: 
Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT256x64x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT8_2_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM16_WGMXCC2_WGMXCCGn1 + SourceSwap: false + SpaceFillingAlgo: [] + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 4 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SubGroupMXSA: 8 + SubGroupMXSB: 32 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: 0 + TailloopInNll: false + ThreadTile: [1, 1] + ThreadTile0: 32 + ThreadTile1: 2 + ThreadTileA: 32 + ThreadTileB: 2 + ThreadTileMXSA: 32 + ThreadTileMXSB: 2 + TransposeLDS: 1 + TransposeLDSMetadata: -1 + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMXSA: true + UnrollMajorLDSMXSB: true + Use64bShadowLimit: false + Use64bShadowLimitMX: false + UseCustomMainLoopSchedule: 0 + UseDirect32XEmulation: false + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: false + UseGeneralizedNLCOneA: false + UseGeneralizedNLCOneB: false + UseGeneralizedNLCOneMXSA: false + UseGeneralizedNLCOneMXSB: false + UseGeneralizedNLCOneMetadata: false + UseInstOffsetForGRO: 0 + UseMFMAF32XEmulation: false + UsePLRPack: 0 + UseSgprForGRO: -1 + UseSubtileImpl: true + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 1 + 
VectorWidthMXSA: 1 + VectorWidthMXSB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMXSA: 0 + WaveSeparateGlobalReadMXSB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 16 + WorkGroupMappingXCC: 2 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 256 + _DepthUA: 256 + _DepthUB: 256 + _DepthUMXSA: 8 + _DepthUMXSB: 8 + _DepthUMetadata: 256 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + enableLDSTrMXSA: false + enableLDSTrMXSB: false + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + tailLoopOptMXSA: false + tailLoopOptMXSB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AdaptiveGemm: 0 + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1DivByMT1LowbitGT1: 0 + AssertFree1ElementMultiple: 1 + AssertKRingShiftTailWrapOnly: 0 + AssertSummationElementMultiple: 256 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BAddrInterleave: false + BaseName: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT12Qs31lTSyFaUbDqXQHGpUViUgkCHLEWWfNifVlN3xPcA= + BufferLoad: true + BufferStore: 1 + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 256 + DirectToLds: 1 + DirectToLdsA: true + DirectToLdsB: true + DirectToLdsMXSA: true + DirectToLdsMXSB: true + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprMXSA: false + DirectToVgprMXSB: false + DirectToVgprSparseMetadata: false + DtlPlusLdsBuf: 0 
+ EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: false + ExpertSchedulingMode: 0 + ExtraLatencyForLR: 0 + ExtraMiLatencyLeft: -1 + ForceDisableShadowInit: false + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 32 + GlobalReadVectorWidthB: 32 + GlobalReadVectorWidthMXSA: 4 + GlobalReadVectorWidthMXSB: 6 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + KRingShift: false + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT128x192x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT4_6_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 256 + LSCB: 256 + LSCMXSA: 8 + LSCMXSB: 8 + LSPA: 32 + LSPB: 32 + LSPMXSA: 128 + LSPMXSB: 192 + LVCA: 8 + LVCB: 8 + LVCMXSA: 2 + LVCMXSB: 2 + LVPA: 1 + LVPB: 1 + LVPMXSA: 32 + LVPMXSB: 32 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsBlockSizePerPadMXSA: 0 + LdsBlockSizePerPadMXSB: 0 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 109056 + LdsInitCVgprs: false + LdsNumBytes: 109056 + LdsNumElementsAlignedA: 16384 + 
LdsNumElementsAlignedB: 24576 + LdsNumElementsAlignedMXSA: 1024 + LdsNumElementsAlignedMXSB: 1536 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 18944 + LdsOffsetB_Blk: 84480 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMXSA: 16384 + LdsOffsetMXSA_Blk: 81920 + LdsOffsetMXSB: 17408 + LdsOffsetMXSB_Blk: 82944 + LdsOffsetMetadata: 18944 + LdsOffsetMetadata_Blk: 84480 + LdsPadA: 0 + LdsPadB: 0 + LdsPadMXSA: 0 + LdsPadMXSB: 0 + LdsPadMetadata: 0 + LocalReadVectorWidth: -1 + LocalReadVectorWidthA: 32 + LocalReadVectorWidthB: 32 + LocalReadVectorWidthMXSA: 1 + LocalReadVectorWidthMXSB: 1 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LocalWriteUseSgprMXSA: true + LocalWriteUseSgprMXSB: true + LoopIters: 2 + LoopUnroll: 256 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMXSA: 1 + MIInputPerThreadMXSB: 1 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 6] + MIWaveTileA: 4 + MIWaveTileB: 6 + MIWaveTileMXSA: 4 + MIWaveTileMXSB: 6 + MIWaveTileMetadata: 0 + MacroTile0: 128 + MacroTile1: 192 + MacroTileA: 128 + MacroTileB: 192 + MacroTileMXSA: 128 + MacroTileMXSB: 192 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + MinGRIncPerMfma: 3 + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonDTLTailLoopMXSA: true + NonDTLTailLoopMXSB: true + NonTemporal: -1 + NonTemporalA: 4 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 4 + NonTemporalE: 0 + NonTemporalMXSA: 0 + 
NonTemporalMXSB: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 96 + NumGlobalWriteVectorsPerThread: 96 + NumLdsBlk: 2 + NumLoadsA: 4 + NumLoadsB: 6 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsCoalescedMXSA: 1 + NumLoadsCoalescedMXSB: 1 + NumLoadsMXSA: 1 + NumLoadsMXSB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 6 + NumLoadsPerpendicularMXSA: 1 + NumLoadsPerpendicularMXSB: 1 + NumThreads: 256 + NumTotalPackedLoadsA: -1 + NumTotalPackedLoadsB: -1 + NumTotalPackedLoadsMXSA: -1 + NumTotalPackedLoadsMXSB: -1 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGROverBarrier: 0 + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 10 + SolutionNameMin: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT128x192x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT4_6_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM16_WGMXCC2_WGMXCCGn1 + SourceSwap: false + SpaceFillingAlgo: [] + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 4 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SubGroupMXSA: 8 + SubGroupMXSB: 32 + 
SuppressNoLoadLoop: false + SwapGlobalReadOrder: 0 + TailloopInNll: false + ThreadTile: [1, 1] + ThreadTile0: 16 + ThreadTile1: 6 + ThreadTileA: 16 + ThreadTileB: 6 + ThreadTileMXSA: 16 + ThreadTileMXSB: 6 + TransposeLDS: 1 + TransposeLDSMetadata: -1 + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMXSA: true + UnrollMajorLDSMXSB: true + Use64bShadowLimit: false + Use64bShadowLimitMX: false + UseCustomMainLoopSchedule: 0 + UseDirect32XEmulation: false + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: false + UseGeneralizedNLCOneA: false + UseGeneralizedNLCOneB: false + UseGeneralizedNLCOneMXSA: false + UseGeneralizedNLCOneMXSB: false + UseGeneralizedNLCOneMetadata: false + UseInstOffsetForGRO: 0 + UseMFMAF32XEmulation: false + UsePLRPack: 0 + UseSgprForGRO: -1 + UseSubtileImpl: true + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 1 + VectorWidthMXSA: 1 + VectorWidthMXSB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMXSA: 0 + WaveSeparateGlobalReadMXSB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 16 + WorkGroupMappingXCC: 2 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 256 + _DepthUA: 256 + _DepthUB: 256 + _DepthUMXSA: 8 + _DepthUMXSB: 8 + _DepthUMetadata: 256 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + enableLDSTrMXSA: false + enableLDSTrMXSB: false + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + tailLoopOptMXSA: false + tailLoopOptMXSB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: false 
+ ActivationFused: true + AdaptiveGemm: 0 + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1DivByMT1LowbitGT1: 0 + AssertFree1ElementMultiple: 1 + AssertKRingShiftTailWrapOnly: 0 + AssertSummationElementMultiple: 256 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BAddrInterleave: false + BaseName: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT191Q3CWl85SU-4agRfSbxDDRYJaTIDmYF_adG8G4wdfNo= + BufferLoad: true + BufferStore: 1 + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 256 + DirectToLds: 1 + DirectToLdsA: true + DirectToLdsB: true + DirectToLdsMXSA: true + DirectToLdsMXSB: true + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprMXSA: false + DirectToVgprMXSB: false + DirectToVgprSparseMetadata: false + DtlPlusLdsBuf: 0 + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: false + ExpertSchedulingMode: 0 + ExtraLatencyForLR: 0 + ExtraMiLatencyLeft: -1 + ForceDisableShadowInit: false + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 32 + GlobalReadVectorWidthB: 32 + GlobalReadVectorWidthMXSA: 6 + GlobalReadVectorWidthMXSB: 4 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + KRingShift: false + Kernel: true + KernelLanguage: Assembly + KernelNameMin: 
Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT192x128x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT6_4_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 256 + LSCB: 256 + LSCMXSA: 8 + LSCMXSB: 8 + LSPA: 32 + LSPB: 32 + LSPMXSA: 192 + LSPMXSB: 128 + LVCA: 8 + LVCB: 8 + LVCMXSA: 2 + LVCMXSB: 2 + LVPA: 1 + LVPB: 1 + LVPMXSA: 32 + LVPMXSB: 32 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsBlockSizePerPadMXSA: 0 + LdsBlockSizePerPadMXSB: 0 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 109056 + LdsInitCVgprs: false + LdsNumBytes: 109056 + LdsNumElementsAlignedA: 24576 + LdsNumElementsAlignedB: 16384 + LdsNumElementsAlignedMXSA: 1536 + LdsNumElementsAlignedMXSB: 1024 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 27136 + LdsOffsetB_Blk: 92672 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMXSA: 24576 + LdsOffsetMXSA_Blk: 90112 + LdsOffsetMXSB: 26112 + LdsOffsetMXSB_Blk: 91648 + LdsOffsetMetadata: 27136 + LdsOffsetMetadata_Blk: 92672 + LdsPadA: 0 + LdsPadB: 0 + LdsPadMXSA: 0 + LdsPadMXSB: 0 + LdsPadMetadata: 0 + LocalReadVectorWidth: -1 + LocalReadVectorWidthA: 32 + LocalReadVectorWidthB: 32 + LocalReadVectorWidthMXSA: 1 + LocalReadVectorWidthMXSB: 1 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LocalWriteUseSgprMXSA: true + LocalWriteUseSgprMXSB: true + LoopIters: 2 + LoopUnroll: 256 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + 
MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMXSA: 1 + MIInputPerThreadMXSB: 1 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [6, 4] + MIWaveTileA: 6 + MIWaveTileB: 4 + MIWaveTileMXSA: 6 + MIWaveTileMXSB: 4 + MIWaveTileMetadata: 0 + MacroTile0: 192 + MacroTile1: 128 + MacroTileA: 192 + MacroTileB: 128 + MacroTileMXSA: 192 + MacroTileMXSB: 128 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + MinGRIncPerMfma: 3 + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonDTLTailLoopMXSA: true + NonDTLTailLoopMXSB: true + NonTemporal: -1 + NonTemporalA: 4 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 4 + NonTemporalE: 0 + NonTemporalMXSA: 0 + NonTemporalMXSB: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 96 + NumGlobalWriteVectorsPerThread: 96 + NumLdsBlk: 2 + NumLoadsA: 6 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsCoalescedMXSA: 1 + NumLoadsCoalescedMXSB: 1 + NumLoadsMXSA: 1 + NumLoadsMXSB: 1 + NumLoadsPerpendicularA: 6 + NumLoadsPerpendicularB: 4 + NumLoadsPerpendicularMXSA: 1 + NumLoadsPerpendicularMXSB: 1 + NumThreads: 256 + NumTotalPackedLoadsA: -1 + NumTotalPackedLoadsB: -1 + NumTotalPackedLoadsMXSA: -1 + NumTotalPackedLoadsMXSB: -1 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGROverBarrier: 0 + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 11 + 
SolutionNameMin: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT192x128x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT6_4_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM16_WGMXCC2_WGMXCCGn1 + SourceSwap: false + SpaceFillingAlgo: [] + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 4 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SubGroupMXSA: 8 + SubGroupMXSB: 32 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: 0 + TailloopInNll: false + ThreadTile: [1, 1] + ThreadTile0: 24 + ThreadTile1: 4 + ThreadTileA: 24 + ThreadTileB: 4 + ThreadTileMXSA: 24 + ThreadTileMXSB: 4 + TransposeLDS: 1 + TransposeLDSMetadata: -1 + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMXSA: true + UnrollMajorLDSMXSB: true + Use64bShadowLimit: false + Use64bShadowLimitMX: false + UseCustomMainLoopSchedule: 0 + UseDirect32XEmulation: false + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: false + UseGeneralizedNLCOneA: false + UseGeneralizedNLCOneB: false + UseGeneralizedNLCOneMXSA: false + UseGeneralizedNLCOneMXSB: false + UseGeneralizedNLCOneMetadata: false + UseInstOffsetForGRO: 0 + UseMFMAF32XEmulation: false + UsePLRPack: 0 + UseSgprForGRO: -1 + UseSubtileImpl: true + Valid: true + VectorStore: -1 + VectorWidthA: 1 + 
VectorWidthB: 1 + VectorWidthMXSA: 1 + VectorWidthMXSB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMXSA: 0 + WaveSeparateGlobalReadMXSB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 16 + WorkGroupMappingXCC: 2 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 256 + _DepthUA: 256 + _DepthUB: 256 + _DepthUMXSA: 8 + _DepthUMXSB: 8 + _DepthUMetadata: 256 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + enableLDSTrMXSA: false + enableLDSTrMXSB: false + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + tailLoopOptMXSA: false + tailLoopOptMXSB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AdaptiveGemm: 0 + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1DivByMT1LowbitGT1: 0 + AssertFree1ElementMultiple: 1 + AssertKRingShiftTailWrapOnly: 0 + AssertSummationElementMultiple: 256 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BAddrInterleave: false + BaseName: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT12kwdCHpnp9d41ufLMEwnWbNNXbmlKzUrCttuc0YSlgSY= + BufferLoad: true + BufferStore: 1 + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 256 + DirectToLds: 1 + DirectToLdsA: true + DirectToLdsB: true + DirectToLdsMXSA: true + DirectToLdsMXSB: true + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprMXSA: false + DirectToVgprMXSB: false + DirectToVgprSparseMetadata: false + 
DtlPlusLdsBuf: 0 + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: false + ExpertSchedulingMode: 0 + ExtraLatencyForLR: 0 + ExtraMiLatencyLeft: -1 + ForceDisableShadowInit: false + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 32 + GlobalReadVectorWidthB: 32 + GlobalReadVectorWidthMXSA: 4 + GlobalReadVectorWidthMXSB: 4 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + KRingShift: false + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT128x256x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT4_8_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 256 + LSCB: 256 + LSCMXSA: 8 + LSCMXSB: 8 + LSPA: 32 + LSPB: 32 + LSPMXSA: 128 + LSPMXSB: 128 + LVCA: 8 + LVCB: 8 + LVCMXSA: 2 + LVCMXSB: 2 + LVPA: 1 + LVPB: 1 + LVPMXSA: 32 + LVPMXSB: 32 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsBlockSizePerPadMXSA: 0 + LdsBlockSizePerPadMXSB: 0 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 117760 + LdsInitCVgprs: false + LdsNumBytes: 117760 + 
LdsNumElementsAlignedA: 16384 + LdsNumElementsAlignedB: 32768 + LdsNumElementsAlignedMXSA: 1024 + LdsNumElementsAlignedMXSB: 2048 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 19456 + LdsOffsetB_Blk: 84992 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMXSA: 16384 + LdsOffsetMXSA_Blk: 81920 + LdsOffsetMXSB: 17408 + LdsOffsetMXSB_Blk: 82944 + LdsOffsetMetadata: 19456 + LdsOffsetMetadata_Blk: 84992 + LdsPadA: 0 + LdsPadB: 0 + LdsPadMXSA: 0 + LdsPadMXSB: 0 + LdsPadMetadata: 0 + LocalReadVectorWidth: -1 + LocalReadVectorWidthA: 32 + LocalReadVectorWidthB: 32 + LocalReadVectorWidthMXSA: 1 + LocalReadVectorWidthMXSB: 1 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LocalWriteUseSgprMXSA: true + LocalWriteUseSgprMXSB: true + LoopIters: 2 + LoopUnroll: 256 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMXSA: 1 + MIInputPerThreadMXSB: 1 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 8] + MIWaveTileA: 4 + MIWaveTileB: 8 + MIWaveTileMXSA: 4 + MIWaveTileMXSB: 8 + MIWaveTileMetadata: 0 + MacroTile0: 128 + MacroTile1: 256 + MacroTileA: 128 + MacroTileB: 256 + MacroTileMXSA: 128 + MacroTileMXSB: 256 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + MinGRIncPerMfma: 3 + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonDTLTailLoopMXSA: true + NonDTLTailLoopMXSB: true + NonTemporal: -1 + NonTemporalA: 4 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 4 + NonTemporalE: 
0 + NonTemporalMXSA: 0 + NonTemporalMXSB: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 128 + NumLdsBlk: 2 + NumLoadsA: 4 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsCoalescedMXSA: 1 + NumLoadsCoalescedMXSB: 1 + NumLoadsMXSA: 1 + NumLoadsMXSB: 2 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 8 + NumLoadsPerpendicularMXSA: 1 + NumLoadsPerpendicularMXSB: 2 + NumThreads: 256 + NumTotalPackedLoadsA: -1 + NumTotalPackedLoadsB: -1 + NumTotalPackedLoadsMXSA: -1 + NumTotalPackedLoadsMXSB: -1 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGROverBarrier: 0 + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 12 + SolutionNameMin: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT128x256x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT4_8_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM16_WGMXCC2_WGMXCCGn1 + SourceSwap: false + SpaceFillingAlgo: [] + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 4 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + 
SubGroupMXSA: 8 + SubGroupMXSB: 32 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: 0 + TailloopInNll: false + ThreadTile: [1, 1] + ThreadTile0: 16 + ThreadTile1: 8 + ThreadTileA: 16 + ThreadTileB: 8 + ThreadTileMXSA: 16 + ThreadTileMXSB: 8 + TransposeLDS: 1 + TransposeLDSMetadata: -1 + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMXSA: true + UnrollMajorLDSMXSB: true + Use64bShadowLimit: false + Use64bShadowLimitMX: false + UseCustomMainLoopSchedule: 0 + UseDirect32XEmulation: false + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: false + UseGeneralizedNLCOneA: false + UseGeneralizedNLCOneB: false + UseGeneralizedNLCOneMXSA: false + UseGeneralizedNLCOneMXSB: false + UseGeneralizedNLCOneMetadata: false + UseInstOffsetForGRO: 0 + UseMFMAF32XEmulation: false + UsePLRPack: 0 + UseSgprForGRO: -1 + UseSubtileImpl: true + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 1 + VectorWidthMXSA: 1 + VectorWidthMXSB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMXSA: 0 + WaveSeparateGlobalReadMXSB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 16 + WorkGroupMappingXCC: 2 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 256 + _DepthUA: 256 + _DepthUB: 256 + _DepthUMXSA: 8 + _DepthUMXSB: 8 + _DepthUMetadata: 256 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + enableLDSTrMXSA: false + enableLDSTrMXSB: false + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + tailLoopOptMXSA: false + tailLoopOptMXSB: false + - 1LDSBuffer: 0 + 
ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AdaptiveGemm: 0 + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1DivByMT1LowbitGT1: 0 + AssertFree1ElementMultiple: 1 + AssertKRingShiftTailWrapOnly: 0 + AssertSummationElementMultiple: 256 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BAddrInterleave: false + BaseName: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT25s2SqPqjqQTrutnl4Fjsck3utFxYLnrt2GxRGoMabbq0= + BufferLoad: true + BufferStore: 1 + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 256 + DirectToLds: 1 + DirectToLdsA: true + DirectToLdsB: true + DirectToLdsMXSA: true + DirectToLdsMXSB: true + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprMXSA: false + DirectToVgprMXSB: false + DirectToVgprSparseMetadata: false + DtlPlusLdsBuf: 0 + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: false + ExpertSchedulingMode: 0 + ExtraLatencyForLR: 0 + ExtraMiLatencyLeft: -1 + ForceDisableShadowInit: false + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 32 + GlobalReadVectorWidthB: 32 + GlobalReadVectorWidthMXSA: 4 + GlobalReadVectorWidthMXSB: 4 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + KRingShift: false + Kernel: true + KernelLanguage: Assembly + KernelNameMin: 
Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT256x128x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT8_4_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 256 + LSCB: 256 + LSCMXSA: 8 + LSCMXSB: 8 + LSPA: 32 + LSPB: 32 + LSPMXSA: 128 + LSPMXSB: 128 + LVCA: 8 + LVCB: 8 + LVCMXSA: 2 + LVCMXSB: 2 + LVPA: 1 + LVPB: 1 + LVPMXSA: 32 + LVPMXSB: 32 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsBlockSizePerPadMXSA: 0 + LdsBlockSizePerPadMXSB: 0 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 117760 + LdsInitCVgprs: false + LdsNumBytes: 117760 + LdsNumElementsAlignedA: 32768 + LdsNumElementsAlignedB: 16384 + LdsNumElementsAlignedMXSA: 2048 + LdsNumElementsAlignedMXSB: 1024 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 35840 + LdsOffsetB_Blk: 101376 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMXSA: 32768 + LdsOffsetMXSA_Blk: 98304 + LdsOffsetMXSB: 34816 + LdsOffsetMXSB_Blk: 100352 + LdsOffsetMetadata: 35840 + LdsOffsetMetadata_Blk: 101376 + LdsPadA: 0 + LdsPadB: 0 + LdsPadMXSA: 0 + LdsPadMXSB: 0 + LdsPadMetadata: 0 + LocalReadVectorWidth: -1 + LocalReadVectorWidthA: 32 + LocalReadVectorWidthB: 32 + LocalReadVectorWidthMXSA: 1 + LocalReadVectorWidthMXSB: 1 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LocalWriteUseSgprMXSA: true + LocalWriteUseSgprMXSB: true + LoopIters: 2 + LoopUnroll: 256 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + 
MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMXSA: 1 + MIInputPerThreadMXSB: 1 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [8, 4] + MIWaveTileA: 8 + MIWaveTileB: 4 + MIWaveTileMXSA: 8 + MIWaveTileMXSB: 4 + MIWaveTileMetadata: 0 + MacroTile0: 256 + MacroTile1: 128 + MacroTileA: 256 + MacroTileB: 128 + MacroTileMXSA: 256 + MacroTileMXSB: 128 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + MinGRIncPerMfma: 3 + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonDTLTailLoopMXSA: true + NonDTLTailLoopMXSB: true + NonTemporal: -1 + NonTemporalA: 4 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 4 + NonTemporalE: 0 + NonTemporalMXSA: 0 + NonTemporalMXSB: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 128 + NumLdsBlk: 2 + NumLoadsA: 8 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsCoalescedMXSA: 1 + NumLoadsCoalescedMXSB: 1 + NumLoadsMXSA: 2 + NumLoadsMXSB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 4 + NumLoadsPerpendicularMXSA: 2 + NumLoadsPerpendicularMXSB: 1 + NumThreads: 256 + NumTotalPackedLoadsA: -1 + NumTotalPackedLoadsB: -1 + NumTotalPackedLoadsMXSA: -1 + NumTotalPackedLoadsMXSB: -1 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGROverBarrier: 0 + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 13 + 
SolutionNameMin: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT256x128x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT8_4_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM16_WGMXCC2_WGMXCCGn1 + SourceSwap: false + SpaceFillingAlgo: [] + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 4 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SubGroupMXSA: 8 + SubGroupMXSB: 32 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: 0 + TailloopInNll: false + ThreadTile: [1, 1] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + ThreadTileMXSA: 32 + ThreadTileMXSB: 4 + TransposeLDS: 1 + TransposeLDSMetadata: -1 + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMXSA: true + UnrollMajorLDSMXSB: true + Use64bShadowLimit: false + Use64bShadowLimitMX: false + UseCustomMainLoopSchedule: 0 + UseDirect32XEmulation: false + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: false + UseGeneralizedNLCOneA: false + UseGeneralizedNLCOneB: false + UseGeneralizedNLCOneMXSA: false + UseGeneralizedNLCOneMXSB: false + UseGeneralizedNLCOneMetadata: false + UseInstOffsetForGRO: 0 + UseMFMAF32XEmulation: false + UsePLRPack: 0 + UseSgprForGRO: -1 + UseSubtileImpl: true + Valid: true + VectorStore: -1 + VectorWidthA: 1 + 
VectorWidthB: 1 + VectorWidthMXSA: 1 + VectorWidthMXSB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMXSA: 0 + WaveSeparateGlobalReadMXSB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 16 + WorkGroupMappingXCC: 2 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 256 + _DepthUA: 256 + _DepthUB: 256 + _DepthUMXSA: 8 + _DepthUMXSB: 8 + _DepthUMetadata: 256 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + enableLDSTrMXSA: false + enableLDSTrMXSB: false + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + tailLoopOptMXSA: false + tailLoopOptMXSB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AdaptiveGemm: 0 + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1DivByMT1LowbitGT1: 0 + AssertFree1ElementMultiple: 1 + AssertKRingShiftTailWrapOnly: 0 + AssertSummationElementMultiple: 256 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BAddrInterleave: false + BaseName: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT19--ULatdGkaEyMxugPoO0m7KB92W8g0H0S_IE47lq-mE= + BufferLoad: true + BufferStore: 1 + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 256 + DirectToLds: 1 + DirectToLdsA: true + DirectToLdsB: true + DirectToLdsMXSA: true + DirectToLdsMXSB: true + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprMXSA: false + DirectToVgprMXSB: false + DirectToVgprSparseMetadata: false + 
DtlPlusLdsBuf: 0 + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: false + ExpertSchedulingMode: 0 + ExtraLatencyForLR: 0 + ExtraMiLatencyLeft: -1 + ForceDisableShadowInit: false + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 32 + GlobalReadVectorWidthB: 32 + GlobalReadVectorWidthMXSA: 6 + GlobalReadVectorWidthMXSB: 4 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + KRingShift: false + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT192x256x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT6_8_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 256 + LSCB: 256 + LSCMXSA: 8 + LSCMXSB: 8 + LSPA: 32 + LSPB: 32 + LSPMXSA: 192 + LSPMXSB: 128 + LVCA: 8 + LVCB: 8 + LVCMXSA: 2 + LVCMXSB: 2 + LVPA: 1 + LVPB: 1 + LVPMXSA: 32 + LVPMXSB: 32 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsBlockSizePerPadMXSA: 0 + LdsBlockSizePerPadMXSB: 0 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 126464 + LdsInitCVgprs: false + LdsNumBytes: 126464 + 
LdsNumElementsAlignedA: 24576 + LdsNumElementsAlignedB: 32768 + LdsNumElementsAlignedMXSA: 1536 + LdsNumElementsAlignedMXSB: 2048 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 28160 + LdsOffsetB_Blk: 93696 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMXSA: 24576 + LdsOffsetMXSA_Blk: 90112 + LdsOffsetMXSB: 26112 + LdsOffsetMXSB_Blk: 91648 + LdsOffsetMetadata: 28160 + LdsOffsetMetadata_Blk: 93696 + LdsPadA: 0 + LdsPadB: 0 + LdsPadMXSA: 0 + LdsPadMXSB: 0 + LdsPadMetadata: 0 + LocalReadVectorWidth: -1 + LocalReadVectorWidthA: 32 + LocalReadVectorWidthB: 32 + LocalReadVectorWidthMXSA: 1 + LocalReadVectorWidthMXSB: 1 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LocalWriteUseSgprMXSA: true + LocalWriteUseSgprMXSB: true + LoopIters: 2 + LoopUnroll: 256 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMXSA: 1 + MIInputPerThreadMXSB: 1 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [6, 8] + MIWaveTileA: 6 + MIWaveTileB: 8 + MIWaveTileMXSA: 6 + MIWaveTileMXSB: 8 + MIWaveTileMetadata: 0 + MacroTile0: 192 + MacroTile1: 256 + MacroTileA: 192 + MacroTileB: 256 + MacroTileMXSA: 192 + MacroTileMXSB: 256 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + MinGRIncPerMfma: 3 + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonDTLTailLoopMXSA: true + NonDTLTailLoopMXSB: true + NonTemporal: -1 + NonTemporalA: 4 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 4 + NonTemporalE: 
0 + NonTemporalMXSA: 0 + NonTemporalMXSB: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 192 + NumGlobalWriteVectorsPerThread: 192 + NumLdsBlk: 2 + NumLoadsA: 6 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsCoalescedMXSA: 1 + NumLoadsCoalescedMXSB: 1 + NumLoadsMXSA: 1 + NumLoadsMXSB: 2 + NumLoadsPerpendicularA: 6 + NumLoadsPerpendicularB: 8 + NumLoadsPerpendicularMXSA: 1 + NumLoadsPerpendicularMXSB: 2 + NumThreads: 256 + NumTotalPackedLoadsA: -1 + NumTotalPackedLoadsB: -1 + NumTotalPackedLoadsMXSA: -1 + NumTotalPackedLoadsMXSB: -1 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGROverBarrier: 0 + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 14 + SolutionNameMin: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT192x256x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT6_8_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM16_WGMXCC2_WGMXCCGn1 + SourceSwap: false + SpaceFillingAlgo: [] + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 4 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + 
SubGroupMXSA: 8 + SubGroupMXSB: 32 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: 0 + TailloopInNll: false + ThreadTile: [1, 1] + ThreadTile0: 24 + ThreadTile1: 8 + ThreadTileA: 24 + ThreadTileB: 8 + ThreadTileMXSA: 24 + ThreadTileMXSB: 8 + TransposeLDS: 1 + TransposeLDSMetadata: -1 + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMXSA: true + UnrollMajorLDSMXSB: true + Use64bShadowLimit: false + Use64bShadowLimitMX: false + UseCustomMainLoopSchedule: 0 + UseDirect32XEmulation: false + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: false + UseGeneralizedNLCOneA: false + UseGeneralizedNLCOneB: false + UseGeneralizedNLCOneMXSA: false + UseGeneralizedNLCOneMXSB: false + UseGeneralizedNLCOneMetadata: false + UseInstOffsetForGRO: 0 + UseMFMAF32XEmulation: false + UsePLRPack: 0 + UseSgprForGRO: -1 + UseSubtileImpl: true + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 1 + VectorWidthMXSA: 1 + VectorWidthMXSB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMXSA: 0 + WaveSeparateGlobalReadMXSB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 16 + WorkGroupMappingXCC: 2 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 256 + _DepthUA: 256 + _DepthUB: 256 + _DepthUMXSA: 8 + _DepthUMXSB: 8 + _DepthUMetadata: 256 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + enableLDSTrMXSA: false + enableLDSTrMXSB: false + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + tailLoopOptMXSA: false + tailLoopOptMXSB: false + - 1LDSBuffer: 0 + 
ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AdaptiveGemm: 0 + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1DivByMT1LowbitGT1: 0 + AssertFree1ElementMultiple: 1 + AssertKRingShiftTailWrapOnly: 0 + AssertSummationElementMultiple: 256 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BAddrInterleave: false + BaseName: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT25H-6yQcCGxSKwn5gmGJAHA93oK-ekIRJm5JqDOya5ASk= + BufferLoad: true + BufferStore: 1 + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 256 + DirectToLds: 1 + DirectToLdsA: true + DirectToLdsB: true + DirectToLdsMXSA: true + DirectToLdsMXSB: true + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprMXSA: false + DirectToVgprMXSB: false + DirectToVgprSparseMetadata: false + DtlPlusLdsBuf: 0 + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: false + ExpertSchedulingMode: 0 + ExtraLatencyForLR: 0 + ExtraMiLatencyLeft: -1 + ForceDisableShadowInit: false + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 32 + GlobalReadVectorWidthB: 32 + GlobalReadVectorWidthMXSA: 4 + GlobalReadVectorWidthMXSB: 6 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + KRingShift: false + Kernel: true + KernelLanguage: Assembly + KernelNameMin: 
Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT256x192x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT8_6_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 256 + LSCB: 256 + LSCMXSA: 8 + LSCMXSB: 8 + LSPA: 32 + LSPB: 32 + LSPMXSA: 128 + LSPMXSB: 192 + LVCA: 8 + LVCB: 8 + LVCMXSA: 2 + LVCMXSB: 2 + LVPA: 1 + LVPB: 1 + LVPMXSA: 32 + LVPMXSB: 32 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsBlockSizePerPadMXSA: 0 + LdsBlockSizePerPadMXSB: 0 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 126464 + LdsInitCVgprs: false + LdsNumBytes: 126464 + LdsNumElementsAlignedA: 32768 + LdsNumElementsAlignedB: 24576 + LdsNumElementsAlignedMXSA: 2048 + LdsNumElementsAlignedMXSB: 1536 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 36352 + LdsOffsetB_Blk: 101888 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMXSA: 32768 + LdsOffsetMXSA_Blk: 98304 + LdsOffsetMXSB: 34816 + LdsOffsetMXSB_Blk: 100352 + LdsOffsetMetadata: 36352 + LdsOffsetMetadata_Blk: 101888 + LdsPadA: 0 + LdsPadB: 0 + LdsPadMXSA: 0 + LdsPadMXSB: 0 + LdsPadMetadata: 0 + LocalReadVectorWidth: -1 + LocalReadVectorWidthA: 32 + LocalReadVectorWidthB: 32 + LocalReadVectorWidthMXSA: 1 + LocalReadVectorWidthMXSB: 1 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LocalWriteUseSgprMXSA: true + LocalWriteUseSgprMXSB: true + LoopIters: 2 + LoopUnroll: 256 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + 
MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMXSA: 1 + MIInputPerThreadMXSB: 1 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [8, 6] + MIWaveTileA: 8 + MIWaveTileB: 6 + MIWaveTileMXSA: 8 + MIWaveTileMXSB: 6 + MIWaveTileMetadata: 0 + MacroTile0: 256 + MacroTile1: 192 + MacroTileA: 256 + MacroTileB: 192 + MacroTileMXSA: 256 + MacroTileMXSB: 192 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + MinGRIncPerMfma: 3 + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonDTLTailLoopMXSA: true + NonDTLTailLoopMXSB: true + NonTemporal: -1 + NonTemporalA: 4 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 4 + NonTemporalE: 0 + NonTemporalMXSA: 0 + NonTemporalMXSB: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 192 + NumGlobalWriteVectorsPerThread: 192 + NumLdsBlk: 2 + NumLoadsA: 8 + NumLoadsB: 6 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsCoalescedMXSA: 1 + NumLoadsCoalescedMXSB: 1 + NumLoadsMXSA: 2 + NumLoadsMXSB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 6 + NumLoadsPerpendicularMXSA: 2 + NumLoadsPerpendicularMXSB: 1 + NumThreads: 256 + NumTotalPackedLoadsA: -1 + NumTotalPackedLoadsB: -1 + NumTotalPackedLoadsMXSA: -1 + NumTotalPackedLoadsMXSB: -1 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGROverBarrier: 0 + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 15 + 
SolutionNameMin: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT256x192x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT8_6_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM16_WGMXCC2_WGMXCCGn1 + SourceSwap: false + SpaceFillingAlgo: [] + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 4 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SubGroupMXSA: 8 + SubGroupMXSB: 32 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: 0 + TailloopInNll: false + ThreadTile: [1, 1] + ThreadTile0: 32 + ThreadTile1: 6 + ThreadTileA: 32 + ThreadTileB: 6 + ThreadTileMXSA: 32 + ThreadTileMXSB: 6 + TransposeLDS: 1 + TransposeLDSMetadata: -1 + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMXSA: true + UnrollMajorLDSMXSB: true + Use64bShadowLimit: false + Use64bShadowLimitMX: false + UseCustomMainLoopSchedule: 0 + UseDirect32XEmulation: false + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: false + UseGeneralizedNLCOneA: false + UseGeneralizedNLCOneB: false + UseGeneralizedNLCOneMXSA: false + UseGeneralizedNLCOneMXSB: false + UseGeneralizedNLCOneMetadata: false + UseInstOffsetForGRO: 0 + UseMFMAF32XEmulation: false + UsePLRPack: 0 + UseSgprForGRO: -1 + UseSubtileImpl: true + Valid: true + VectorStore: -1 + VectorWidthA: 1 + 
VectorWidthB: 1 + VectorWidthMXSA: 1 + VectorWidthMXSB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMXSA: 0 + WaveSeparateGlobalReadMXSB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 16 + WorkGroupMappingXCC: 2 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 256 + _DepthUA: 256 + _DepthUB: 256 + _DepthUMXSA: 8 + _DepthUMXSB: 8 + _DepthUMetadata: 256 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + enableLDSTrMXSA: false + enableLDSTrMXSB: false + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + tailLoopOptMXSA: false + tailLoopOptMXSB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AdaptiveGemm: 0 + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1DivByMT1LowbitGT1: 0 + AssertFree1ElementMultiple: 1 + AssertKRingShiftTailWrapOnly: 0 + AssertSummationElementMultiple: 256 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BAddrInterleave: false + BaseName: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT649Xx58paO0aNamHRyhpgRWxjDG0kBNJ5sAnNiMy2USyI= + BufferLoad: true + BufferStore: 1 + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 256 + DirectToLds: 1 + DirectToLdsA: true + DirectToLdsB: true + DirectToLdsMXSA: true + DirectToLdsMXSB: true + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprMXSA: false + DirectToVgprMXSB: false + DirectToVgprSparseMetadata: false + 
DtlPlusLdsBuf: 0 + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: false + ExpertSchedulingMode: 0 + ExtraLatencyForLR: 0 + ExtraMiLatencyLeft: -1 + ForceDisableShadowInit: false + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 32 + GlobalReadVectorWidthB: 32 + GlobalReadVectorWidthMXSA: 2 + GlobalReadVectorWidthMXSB: 10 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + KRingShift: false + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT64x320x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT2_10_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 256 + LSCB: 256 + LSCMXSA: 8 + LSCMXSB: 8 + LSPA: 32 + LSPB: 32 + LSPMXSA: 64 + LSPMXSB: 320 + LVCA: 8 + LVCB: 8 + LVCMXSA: 4 + LVCMXSB: 1 + LVPA: 1 + LVPB: 1 + LVPMXSA: 32 + LVPMXSB: 32 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsBlockSizePerPadMXSA: 0 + LdsBlockSizePerPadMXSB: 0 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 117760 + LdsInitCVgprs: false + LdsNumBytes: 117760 + 
LdsNumElementsAlignedA: 8192 + LdsNumElementsAlignedB: 40960 + LdsNumElementsAlignedMXSA: 512 + LdsNumElementsAlignedMXSB: 2560 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 11264 + LdsOffsetB_Blk: 76800 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMXSA: 8192 + LdsOffsetMXSA_Blk: 73728 + LdsOffsetMXSB: 8704 + LdsOffsetMXSB_Blk: 74240 + LdsOffsetMetadata: 11264 + LdsOffsetMetadata_Blk: 76800 + LdsPadA: 0 + LdsPadB: 0 + LdsPadMXSA: 0 + LdsPadMXSB: 0 + LdsPadMetadata: 0 + LocalReadVectorWidth: -1 + LocalReadVectorWidthA: 32 + LocalReadVectorWidthB: 32 + LocalReadVectorWidthMXSA: 1 + LocalReadVectorWidthMXSB: 1 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LocalWriteUseSgprMXSA: true + LocalWriteUseSgprMXSB: true + LoopIters: 2 + LoopUnroll: 256 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMXSA: 1 + MIInputPerThreadMXSB: 1 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [2, 10] + MIWaveTileA: 2 + MIWaveTileB: 10 + MIWaveTileMXSA: 2 + MIWaveTileMXSB: 10 + MIWaveTileMetadata: 0 + MacroTile0: 64 + MacroTile1: 320 + MacroTileA: 64 + MacroTileB: 320 + MacroTileMXSA: 64 + MacroTileMXSB: 320 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + MinGRIncPerMfma: 3 + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonDTLTailLoopMXSA: true + NonDTLTailLoopMXSB: true + NonTemporal: -1 + NonTemporalA: 4 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 4 + NonTemporalE: 0 + 
NonTemporalMXSA: 0 + NonTemporalMXSB: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 80 + NumGlobalWriteVectorsPerThread: 80 + NumLdsBlk: 2 + NumLoadsA: 2 + NumLoadsB: 10 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsCoalescedMXSA: 1 + NumLoadsCoalescedMXSB: 1 + NumLoadsMXSA: 1 + NumLoadsMXSB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 10 + NumLoadsPerpendicularMXSA: 1 + NumLoadsPerpendicularMXSB: 1 + NumThreads: 256 + NumTotalPackedLoadsA: -1 + NumTotalPackedLoadsB: -1 + NumTotalPackedLoadsMXSA: -1 + NumTotalPackedLoadsMXSB: -1 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGROverBarrier: 0 + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 16 + SolutionNameMin: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT64x320x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT2_10_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM16_WGMXCC2_WGMXCCGn1 + SourceSwap: false + SpaceFillingAlgo: [] + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 4 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SubGroupMXSA: 
8 + SubGroupMXSB: 32 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: 0 + TailloopInNll: false + ThreadTile: [1, 1] + ThreadTile0: 8 + ThreadTile1: 10 + ThreadTileA: 8 + ThreadTileB: 10 + ThreadTileMXSA: 8 + ThreadTileMXSB: 10 + TransposeLDS: 1 + TransposeLDSMetadata: -1 + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMXSA: true + UnrollMajorLDSMXSB: true + Use64bShadowLimit: false + Use64bShadowLimitMX: false + UseCustomMainLoopSchedule: 0 + UseDirect32XEmulation: false + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: false + UseGeneralizedNLCOneA: false + UseGeneralizedNLCOneB: false + UseGeneralizedNLCOneMXSA: false + UseGeneralizedNLCOneMXSB: false + UseGeneralizedNLCOneMetadata: false + UseInstOffsetForGRO: 0 + UseMFMAF32XEmulation: false + UsePLRPack: 0 + UseSgprForGRO: -1 + UseSubtileImpl: true + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 1 + VectorWidthMXSA: 1 + VectorWidthMXSB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMXSA: 0 + WaveSeparateGlobalReadMXSB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 16 + WorkGroupMappingXCC: 2 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 256 + _DepthUA: 256 + _DepthUB: 256 + _DepthUMXSA: 8 + _DepthUMXSB: 8 + _DepthUMetadata: 256 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + enableLDSTrMXSA: false + enableLDSTrMXSB: false + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + tailLoopOptMXSA: false + tailLoopOptMXSB: false + - 1LDSBuffer: 0 + ActivationAlt: false + 
ActivationFuncCall: false + ActivationFused: true + AdaptiveGemm: 0 + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1DivByMT1LowbitGT1: 0 + AssertFree1ElementMultiple: 1 + AssertKRingShiftTailWrapOnly: 0 + AssertSummationElementMultiple: 256 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BAddrInterleave: false + BaseName: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT32yn2lSZhgGG39-O0tN9XB-c6Nr5kcbdLKPNrKxqSEhVE= + BufferLoad: true + BufferStore: 1 + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 256 + DirectToLds: 1 + DirectToLdsA: true + DirectToLdsB: true + DirectToLdsMXSA: true + DirectToLdsMXSB: true + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprMXSA: false + DirectToVgprMXSB: false + DirectToVgprSparseMetadata: false + DtlPlusLdsBuf: 0 + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: false + ExpertSchedulingMode: 0 + ExtraLatencyForLR: 0 + ExtraMiLatencyLeft: -1 + ForceDisableShadowInit: false + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 32 + GlobalReadVectorWidthB: 32 + GlobalReadVectorWidthMXSA: 10 + GlobalReadVectorWidthMXSB: 2 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + KRingShift: false + Kernel: true + KernelLanguage: Assembly + KernelNameMin: 
Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT320x64x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT10_2_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 256 + LSCB: 256 + LSCMXSA: 8 + LSCMXSB: 8 + LSPA: 32 + LSPB: 32 + LSPMXSA: 320 + LSPMXSB: 64 + LVCA: 8 + LVCB: 8 + LVCMXSA: 1 + LVCMXSB: 4 + LVPA: 1 + LVPB: 1 + LVPMXSA: 32 + LVPMXSB: 32 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsBlockSizePerPadMXSA: 0 + LdsBlockSizePerPadMXSB: 0 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 117760 + LdsInitCVgprs: false + LdsNumBytes: 117760 + LdsNumElementsAlignedA: 40960 + LdsNumElementsAlignedB: 8192 + LdsNumElementsAlignedMXSA: 2560 + LdsNumElementsAlignedMXSB: 512 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 44032 + LdsOffsetB_Blk: 109568 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMXSA: 40960 + LdsOffsetMXSA_Blk: 106496 + LdsOffsetMXSB: 43520 + LdsOffsetMXSB_Blk: 109056 + LdsOffsetMetadata: 44032 + LdsOffsetMetadata_Blk: 109568 + LdsPadA: 0 + LdsPadB: 0 + LdsPadMXSA: 0 + LdsPadMXSB: 0 + LdsPadMetadata: 0 + LocalReadVectorWidth: -1 + LocalReadVectorWidthA: 32 + LocalReadVectorWidthB: 32 + LocalReadVectorWidthMXSA: 1 + LocalReadVectorWidthMXSB: 1 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LocalWriteUseSgprMXSA: true + LocalWriteUseSgprMXSB: true + LoopIters: 2 + LoopUnroll: 256 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + 
MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMXSA: 1 + MIInputPerThreadMXSB: 1 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [10, 2] + MIWaveTileA: 10 + MIWaveTileB: 2 + MIWaveTileMXSA: 10 + MIWaveTileMXSB: 2 + MIWaveTileMetadata: 0 + MacroTile0: 320 + MacroTile1: 64 + MacroTileA: 320 + MacroTileB: 64 + MacroTileMXSA: 320 + MacroTileMXSB: 64 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + MinGRIncPerMfma: 3 + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonDTLTailLoopMXSA: true + NonDTLTailLoopMXSB: true + NonTemporal: -1 + NonTemporalA: 4 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 4 + NonTemporalE: 0 + NonTemporalMXSA: 0 + NonTemporalMXSB: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 80 + NumGlobalWriteVectorsPerThread: 80 + NumLdsBlk: 2 + NumLoadsA: 10 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsCoalescedMXSA: 1 + NumLoadsCoalescedMXSB: 1 + NumLoadsMXSA: 1 + NumLoadsMXSB: 1 + NumLoadsPerpendicularA: 10 + NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularMXSA: 1 + NumLoadsPerpendicularMXSB: 1 + NumThreads: 256 + NumTotalPackedLoadsA: -1 + NumTotalPackedLoadsB: -1 + NumTotalPackedLoadsMXSA: -1 + NumTotalPackedLoadsMXSB: -1 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGROverBarrier: 0 + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 17 + 
SolutionNameMin: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT320x64x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT10_2_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM16_WGMXCC2_WGMXCCGn1 + SourceSwap: false + SpaceFillingAlgo: [] + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 4 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SubGroupMXSA: 8 + SubGroupMXSB: 32 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: 0 + TailloopInNll: false + ThreadTile: [1, 1] + ThreadTile0: 40 + ThreadTile1: 2 + ThreadTileA: 40 + ThreadTileB: 2 + ThreadTileMXSA: 40 + ThreadTileMXSB: 2 + TransposeLDS: 1 + TransposeLDSMetadata: -1 + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMXSA: true + UnrollMajorLDSMXSB: true + Use64bShadowLimit: false + Use64bShadowLimitMX: false + UseCustomMainLoopSchedule: 0 + UseDirect32XEmulation: false + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: false + UseGeneralizedNLCOneA: false + UseGeneralizedNLCOneB: false + UseGeneralizedNLCOneMXSA: false + UseGeneralizedNLCOneMXSB: false + UseGeneralizedNLCOneMetadata: false + UseInstOffsetForGRO: 0 + UseMFMAF32XEmulation: false + UsePLRPack: 0 + UseSgprForGRO: -1 + UseSubtileImpl: true + Valid: true + VectorStore: -1 + VectorWidthA: 1 + 
VectorWidthB: 1 + VectorWidthMXSA: 1 + VectorWidthMXSB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMXSA: 0 + WaveSeparateGlobalReadMXSB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 16 + WorkGroupMappingXCC: 2 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 256 + _DepthUA: 256 + _DepthUB: 256 + _DepthUMXSA: 8 + _DepthUMXSB: 8 + _DepthUMetadata: 256 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + enableLDSTrMXSA: false + enableLDSTrMXSB: false + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + tailLoopOptMXSA: false + tailLoopOptMXSB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AdaptiveGemm: 0 + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1DivByMT1LowbitGT1: 0 + AssertFree1ElementMultiple: 1 + AssertKRingShiftTailWrapOnly: 0 + AssertSummationElementMultiple: 256 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BAddrInterleave: false + BaseName: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT64cx8jpYzIAVvV24Gqg4hoX8QdixxbIDmUlvaleHLuajs= + BufferLoad: true + BufferStore: 1 + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 256 + DirectToLds: 1 + DirectToLdsA: true + DirectToLdsB: true + DirectToLdsMXSA: true + DirectToLdsMXSB: true + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprMXSA: false + DirectToVgprMXSB: false + DirectToVgprSparseMetadata: false + 
DtlPlusLdsBuf: 0 + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: false + ExpertSchedulingMode: 0 + ExtraLatencyForLR: 0 + ExtraMiLatencyLeft: -1 + ForceDisableShadowInit: false + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 32 + GlobalReadVectorWidthB: 32 + GlobalReadVectorWidthMXSA: 2 + GlobalReadVectorWidthMXSB: 12 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + KRingShift: false + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT64x384x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT2_12_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 256 + LSCB: 256 + LSCMXSA: 8 + LSCMXSB: 8 + LSPA: 32 + LSPB: 32 + LSPMXSA: 64 + LSPMXSB: 384 + LVCA: 8 + LVCB: 8 + LVCMXSA: 4 + LVCMXSB: 1 + LVPA: 1 + LVPB: 1 + LVPMXSA: 32 + LVPMXSB: 32 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsBlockSizePerPadMXSA: 0 + LdsBlockSizePerPadMXSB: 0 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 126464 + LdsInitCVgprs: false + LdsNumBytes: 126464 + 
LdsNumElementsAlignedA: 8192 + LdsNumElementsAlignedB: 49152 + LdsNumElementsAlignedMXSA: 512 + LdsNumElementsAlignedMXSB: 3072 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 11776 + LdsOffsetB_Blk: 77312 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMXSA: 8192 + LdsOffsetMXSA_Blk: 73728 + LdsOffsetMXSB: 8704 + LdsOffsetMXSB_Blk: 74240 + LdsOffsetMetadata: 11776 + LdsOffsetMetadata_Blk: 77312 + LdsPadA: 0 + LdsPadB: 0 + LdsPadMXSA: 0 + LdsPadMXSB: 0 + LdsPadMetadata: 0 + LocalReadVectorWidth: -1 + LocalReadVectorWidthA: 32 + LocalReadVectorWidthB: 32 + LocalReadVectorWidthMXSA: 1 + LocalReadVectorWidthMXSB: 1 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LocalWriteUseSgprMXSA: true + LocalWriteUseSgprMXSB: true + LoopIters: 2 + LoopUnroll: 256 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMXSA: 1 + MIInputPerThreadMXSB: 1 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [2, 12] + MIWaveTileA: 2 + MIWaveTileB: 12 + MIWaveTileMXSA: 2 + MIWaveTileMXSB: 12 + MIWaveTileMetadata: 0 + MacroTile0: 64 + MacroTile1: 384 + MacroTileA: 64 + MacroTileB: 384 + MacroTileMXSA: 64 + MacroTileMXSB: 384 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + MinGRIncPerMfma: 3 + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonDTLTailLoopMXSA: true + NonDTLTailLoopMXSB: true + NonTemporal: -1 + NonTemporalA: 4 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 4 + NonTemporalE: 0 + 
NonTemporalMXSA: 0 + NonTemporalMXSB: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 96 + NumGlobalWriteVectorsPerThread: 96 + NumLdsBlk: 2 + NumLoadsA: 2 + NumLoadsB: 12 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsCoalescedMXSA: 1 + NumLoadsCoalescedMXSB: 1 + NumLoadsMXSA: 1 + NumLoadsMXSB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 12 + NumLoadsPerpendicularMXSA: 1 + NumLoadsPerpendicularMXSB: 1 + NumThreads: 256 + NumTotalPackedLoadsA: -1 + NumTotalPackedLoadsB: -1 + NumTotalPackedLoadsMXSA: -1 + NumTotalPackedLoadsMXSB: -1 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGROverBarrier: 0 + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 18 + SolutionNameMin: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT64x384x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT2_12_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM16_WGMXCC2_WGMXCCGn1 + SourceSwap: false + SpaceFillingAlgo: [] + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 4 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SubGroupMXSA: 
8 + SubGroupMXSB: 32 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: 0 + TailloopInNll: false + ThreadTile: [1, 1] + ThreadTile0: 8 + ThreadTile1: 12 + ThreadTileA: 8 + ThreadTileB: 12 + ThreadTileMXSA: 8 + ThreadTileMXSB: 12 + TransposeLDS: 1 + TransposeLDSMetadata: -1 + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMXSA: true + UnrollMajorLDSMXSB: true + Use64bShadowLimit: false + Use64bShadowLimitMX: false + UseCustomMainLoopSchedule: 0 + UseDirect32XEmulation: false + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: false + UseGeneralizedNLCOneA: false + UseGeneralizedNLCOneB: false + UseGeneralizedNLCOneMXSA: false + UseGeneralizedNLCOneMXSB: false + UseGeneralizedNLCOneMetadata: false + UseInstOffsetForGRO: 0 + UseMFMAF32XEmulation: false + UsePLRPack: 0 + UseSgprForGRO: -1 + UseSubtileImpl: true + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 1 + VectorWidthMXSA: 1 + VectorWidthMXSB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMXSA: 0 + WaveSeparateGlobalReadMXSB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 16 + WorkGroupMappingXCC: 2 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 256 + _DepthUA: 256 + _DepthUB: 256 + _DepthUMXSA: 8 + _DepthUMXSB: 8 + _DepthUMetadata: 256 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + enableLDSTrMXSA: false + enableLDSTrMXSB: false + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + tailLoopOptMXSA: false + tailLoopOptMXSB: false + - 1LDSBuffer: 0 + ActivationAlt: false + 
ActivationFuncCall: false + ActivationFused: true + AdaptiveGemm: 0 + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1DivByMT1LowbitGT1: 0 + AssertFree1ElementMultiple: 1 + AssertKRingShiftTailWrapOnly: 0 + AssertSummationElementMultiple: 256 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BAddrInterleave: false + BaseName: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT38m0Bm4hH2y_fb6-eo4XZRKb5dkkBhjv6q0Rd-CAzTq-Q= + BufferLoad: true + BufferStore: 1 + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 256 + DirectToLds: 1 + DirectToLdsA: true + DirectToLdsB: true + DirectToLdsMXSA: true + DirectToLdsMXSB: true + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprMXSA: false + DirectToVgprMXSB: false + DirectToVgprSparseMetadata: false + DtlPlusLdsBuf: 0 + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: false + ExpertSchedulingMode: 0 + ExtraLatencyForLR: 0 + ExtraMiLatencyLeft: -1 + ForceDisableShadowInit: false + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 32 + GlobalReadVectorWidthB: 32 + GlobalReadVectorWidthMXSA: 12 + GlobalReadVectorWidthMXSB: 2 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + KRingShift: false + Kernel: true + KernelLanguage: Assembly + KernelNameMin: 
Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT384x64x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT12_2_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 256 + LSCB: 256 + LSCMXSA: 8 + LSCMXSB: 8 + LSPA: 32 + LSPB: 32 + LSPMXSA: 384 + LSPMXSB: 64 + LVCA: 8 + LVCB: 8 + LVCMXSA: 1 + LVCMXSB: 4 + LVPA: 1 + LVPB: 1 + LVPMXSA: 32 + LVPMXSB: 32 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsBlockSizePerPadMXSA: 0 + LdsBlockSizePerPadMXSB: 0 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 126464 + LdsInitCVgprs: false + LdsNumBytes: 126464 + LdsNumElementsAlignedA: 49152 + LdsNumElementsAlignedB: 8192 + LdsNumElementsAlignedMXSA: 3072 + LdsNumElementsAlignedMXSB: 512 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 52736 + LdsOffsetB_Blk: 118272 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMXSA: 49152 + LdsOffsetMXSA_Blk: 114688 + LdsOffsetMXSB: 52224 + LdsOffsetMXSB_Blk: 117760 + LdsOffsetMetadata: 52736 + LdsOffsetMetadata_Blk: 118272 + LdsPadA: 0 + LdsPadB: 0 + LdsPadMXSA: 0 + LdsPadMXSB: 0 + LdsPadMetadata: 0 + LocalReadVectorWidth: -1 + LocalReadVectorWidthA: 32 + LocalReadVectorWidthB: 32 + LocalReadVectorWidthMXSA: 1 + LocalReadVectorWidthMXSB: 1 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LocalWriteUseSgprMXSA: true + LocalWriteUseSgprMXSB: true + LoopIters: 2 + LoopUnroll: 256 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + 
MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMXSA: 1 + MIInputPerThreadMXSB: 1 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [12, 2] + MIWaveTileA: 12 + MIWaveTileB: 2 + MIWaveTileMXSA: 12 + MIWaveTileMXSB: 2 + MIWaveTileMetadata: 0 + MacroTile0: 384 + MacroTile1: 64 + MacroTileA: 384 + MacroTileB: 64 + MacroTileMXSA: 384 + MacroTileMXSB: 64 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + MinGRIncPerMfma: 3 + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonDTLTailLoopMXSA: true + NonDTLTailLoopMXSB: true + NonTemporal: -1 + NonTemporalA: 4 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 4 + NonTemporalE: 0 + NonTemporalMXSA: 0 + NonTemporalMXSB: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 96 + NumGlobalWriteVectorsPerThread: 96 + NumLdsBlk: 2 + NumLoadsA: 12 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsCoalescedMXSA: 1 + NumLoadsCoalescedMXSB: 1 + NumLoadsMXSA: 1 + NumLoadsMXSB: 1 + NumLoadsPerpendicularA: 12 + NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularMXSA: 1 + NumLoadsPerpendicularMXSB: 1 + NumThreads: 256 + NumTotalPackedLoadsA: -1 + NumTotalPackedLoadsB: -1 + NumTotalPackedLoadsMXSA: -1 + NumTotalPackedLoadsMXSB: -1 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGROverBarrier: 0 + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 19 + 
SolutionNameMin: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT384x64x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT12_2_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM16_WGMXCC2_WGMXCCGn1 + SourceSwap: false + SpaceFillingAlgo: [] + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 4 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SubGroupMXSA: 8 + SubGroupMXSB: 32 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: 0 + TailloopInNll: false + ThreadTile: [1, 1] + ThreadTile0: 48 + ThreadTile1: 2 + ThreadTileA: 48 + ThreadTileB: 2 + ThreadTileMXSA: 48 + ThreadTileMXSB: 2 + TransposeLDS: 1 + TransposeLDSMetadata: -1 + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMXSA: true + UnrollMajorLDSMXSB: true + Use64bShadowLimit: false + Use64bShadowLimitMX: false + UseCustomMainLoopSchedule: 0 + UseDirect32XEmulation: false + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: false + UseGeneralizedNLCOneA: false + UseGeneralizedNLCOneB: false + UseGeneralizedNLCOneMXSA: false + UseGeneralizedNLCOneMXSB: false + UseGeneralizedNLCOneMetadata: false + UseInstOffsetForGRO: 0 + UseMFMAF32XEmulation: false + UsePLRPack: 0 + UseSgprForGRO: -1 + UseSubtileImpl: true + Valid: true + VectorStore: -1 + VectorWidthA: 1 + 
VectorWidthB: 1 + VectorWidthMXSA: 1 + VectorWidthMXSB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMXSA: 0 + WaveSeparateGlobalReadMXSB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 16 + WorkGroupMappingXCC: 2 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 256 + _DepthUA: 256 + _DepthUB: 256 + _DepthUMXSA: 8 + _DepthUMXSB: 8 + _DepthUMetadata: 256 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + enableLDSTrMXSA: false + enableLDSTrMXSB: false + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + tailLoopOptMXSA: false + tailLoopOptMXSB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AdaptiveGemm: 0 + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1DivByMT1LowbitGT1: 0 + AssertFree1ElementMultiple: 1 + AssertKRingShiftTailWrapOnly: 0 + AssertSummationElementMultiple: 256 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BAddrInterleave: false + BaseName: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT64ghBzLJyXgx9oTwgzYUUty2EUDAM9Q1qjBKNRjSDH1Qg= + BufferLoad: true + BufferStore: 1 + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 256 + DirectToLds: 1 + DirectToLdsA: true + DirectToLdsB: true + DirectToLdsMXSA: true + DirectToLdsMXSB: true + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprMXSA: false + DirectToVgprMXSB: false + DirectToVgprSparseMetadata: false + 
DtlPlusLdsBuf: 0 + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: false + ExpertSchedulingMode: 0 + ExtraLatencyForLR: 0 + ExtraMiLatencyLeft: -1 + ForceDisableShadowInit: false + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 32 + GlobalReadVectorWidthB: 32 + GlobalReadVectorWidthMXSA: 2 + GlobalReadVectorWidthMXSB: 16 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + KRingShift: false + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT64x512x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT2_16_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 256 + LSCB: 256 + LSCMXSA: 8 + LSCMXSB: 8 + LSPA: 32 + LSPB: 32 + LSPMXSA: 64 + LSPMXSB: 512 + LVCA: 8 + LVCB: 8 + LVCMXSA: 4 + LVCMXSB: 1 + LVPA: 1 + LVPB: 1 + LVPMXSA: 32 + LVPMXSB: 32 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsBlockSizePerPadMXSA: 0 + LdsBlockSizePerPadMXSB: 0 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 156672 + LdsInitCVgprs: false + LdsNumBytes: 156672 + 
LdsNumElementsAlignedA: 8192 + LdsNumElementsAlignedB: 65536 + LdsNumElementsAlignedMXSA: 512 + LdsNumElementsAlignedMXSB: 4096 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 78336 + LdsOffsetB: 12800 + LdsOffsetB_Blk: 91136 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMXSA: 8192 + LdsOffsetMXSA_Blk: 86528 + LdsOffsetMXSB: 8704 + LdsOffsetMXSB_Blk: 87040 + LdsOffsetMetadata: 12800 + LdsOffsetMetadata_Blk: 91136 + LdsPadA: 0 + LdsPadB: 0 + LdsPadMXSA: 0 + LdsPadMXSB: 0 + LdsPadMetadata: 0 + LocalReadVectorWidth: -1 + LocalReadVectorWidthA: 32 + LocalReadVectorWidthB: 32 + LocalReadVectorWidthMXSA: 1 + LocalReadVectorWidthMXSB: 1 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LocalWriteUseSgprMXSA: true + LocalWriteUseSgprMXSB: true + LoopIters: 2 + LoopUnroll: 256 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMXSA: 1 + MIInputPerThreadMXSB: 1 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [2, 16] + MIWaveTileA: 2 + MIWaveTileB: 16 + MIWaveTileMXSA: 2 + MIWaveTileMXSB: 16 + MIWaveTileMetadata: 0 + MacroTile0: 64 + MacroTile1: 512 + MacroTileA: 64 + MacroTileB: 512 + MacroTileMXSA: 64 + MacroTileMXSB: 512 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + MinGRIncPerMfma: 3 + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonDTLTailLoopMXSA: true + NonDTLTailLoopMXSB: true + NonTemporal: -1 + NonTemporalA: 4 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 4 + NonTemporalE: 0 + 
NonTemporalMXSA: 0 + NonTemporalMXSB: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 128 + NumLdsBlk: 2 + NumLoadsA: 2 + NumLoadsB: 16 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsCoalescedMXSA: 1 + NumLoadsCoalescedMXSB: 1 + NumLoadsMXSA: 1 + NumLoadsMXSB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 16 + NumLoadsPerpendicularMXSA: 1 + NumLoadsPerpendicularMXSB: 1 + NumThreads: 256 + NumTotalPackedLoadsA: -1 + NumTotalPackedLoadsB: -1 + NumTotalPackedLoadsMXSA: -1 + NumTotalPackedLoadsMXSB: -1 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGROverBarrier: 0 + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 20 + SolutionNameMin: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT64x512x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT2_16_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM16_WGMXCC2_WGMXCCGn1 + SourceSwap: false + SpaceFillingAlgo: [] + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: true + StoreSyncOpt: 0 + StoreVectorWidth: 4 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + 
SubGroupMXSA: 8 + SubGroupMXSB: 32 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: 0 + TailloopInNll: false + ThreadTile: [1, 1] + ThreadTile0: 8 + ThreadTile1: 16 + ThreadTileA: 8 + ThreadTileB: 16 + ThreadTileMXSA: 8 + ThreadTileMXSB: 16 + TransposeLDS: 1 + TransposeLDSMetadata: -1 + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMXSA: true + UnrollMajorLDSMXSB: true + Use64bShadowLimit: false + Use64bShadowLimitMX: false + UseCustomMainLoopSchedule: 0 + UseDirect32XEmulation: false + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: false + UseGeneralizedNLCOneA: false + UseGeneralizedNLCOneB: false + UseGeneralizedNLCOneMXSA: false + UseGeneralizedNLCOneMXSB: false + UseGeneralizedNLCOneMetadata: false + UseInstOffsetForGRO: 0 + UseMFMAF32XEmulation: false + UsePLRPack: 0 + UseSgprForGRO: -1 + UseSubtileImpl: true + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 1 + VectorWidthMXSA: 1 + VectorWidthMXSB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMXSA: 0 + WaveSeparateGlobalReadMXSB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 16 + WorkGroupMappingXCC: 2 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 256 + _DepthUA: 256 + _DepthUB: 256 + _DepthUMXSA: 8 + _DepthUMXSB: 8 + _DepthUMetadata: 256 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + enableLDSTrMXSA: false + enableLDSTrMXSB: false + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + tailLoopOptMXSA: false + tailLoopOptMXSB: false + - 1LDSBuffer: 0 + 
ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AdaptiveGemm: 0 + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1DivByMT1LowbitGT1: 0 + AssertFree1ElementMultiple: 1 + AssertKRingShiftTailWrapOnly: 0 + AssertSummationElementMultiple: 256 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BAddrInterleave: false + BaseName: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT51N7xP8nenZXqX1PoWvPtuxa4rTpP69DwZBgRtHN8axrQ= + BufferLoad: true + BufferStore: 1 + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 256 + DirectToLds: 1 + DirectToLdsA: true + DirectToLdsB: true + DirectToLdsMXSA: true + DirectToLdsMXSB: true + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprMXSA: false + DirectToVgprMXSB: false + DirectToVgprSparseMetadata: false + DtlPlusLdsBuf: 0 + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: false + ExpertSchedulingMode: 0 + ExtraLatencyForLR: 0 + ExtraMiLatencyLeft: -1 + ForceDisableShadowInit: false + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 32 + GlobalReadVectorWidthB: 32 + GlobalReadVectorWidthMXSA: 16 + GlobalReadVectorWidthMXSB: 2 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + KRingShift: false + Kernel: true + KernelLanguage: Assembly + KernelNameMin: 
Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT512x64x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT16_2_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 256 + LSCB: 256 + LSCMXSA: 8 + LSCMXSB: 8 + LSPA: 32 + LSPB: 32 + LSPMXSA: 512 + LSPMXSB: 64 + LVCA: 8 + LVCB: 8 + LVCMXSA: 1 + LVCMXSB: 4 + LVPA: 1 + LVPB: 1 + LVPMXSA: 32 + LVPMXSB: 32 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsBlockSizePerPadMXSA: 0 + LdsBlockSizePerPadMXSB: 0 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 156672 + LdsInitCVgprs: false + LdsNumBytes: 156672 + LdsNumElementsAlignedA: 65536 + LdsNumElementsAlignedB: 8192 + LdsNumElementsAlignedMXSA: 4096 + LdsNumElementsAlignedMXSB: 512 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 78336 + LdsOffsetB: 70144 + LdsOffsetB_Blk: 148480 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMXSA: 65536 + LdsOffsetMXSA_Blk: 143872 + LdsOffsetMXSB: 69632 + LdsOffsetMXSB_Blk: 147968 + LdsOffsetMetadata: 70144 + LdsOffsetMetadata_Blk: 148480 + LdsPadA: 0 + LdsPadB: 0 + LdsPadMXSA: 0 + LdsPadMXSB: 0 + LdsPadMetadata: 0 + LocalReadVectorWidth: -1 + LocalReadVectorWidthA: 32 + LocalReadVectorWidthB: 32 + LocalReadVectorWidthMXSA: 1 + LocalReadVectorWidthMXSB: 1 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LocalWriteUseSgprMXSA: true + LocalWriteUseSgprMXSB: true + LoopIters: 2 + LoopUnroll: 256 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + 
MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMXSA: 1 + MIInputPerThreadMXSB: 1 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [16, 2] + MIWaveTileA: 16 + MIWaveTileB: 2 + MIWaveTileMXSA: 16 + MIWaveTileMXSB: 2 + MIWaveTileMetadata: 0 + MacroTile0: 512 + MacroTile1: 64 + MacroTileA: 512 + MacroTileB: 64 + MacroTileMXSA: 512 + MacroTileMXSB: 64 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + MinGRIncPerMfma: 3 + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonDTLTailLoopMXSA: true + NonDTLTailLoopMXSB: true + NonTemporal: -1 + NonTemporalA: 4 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 4 + NonTemporalE: 0 + NonTemporalMXSA: 0 + NonTemporalMXSB: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 128 + NumLdsBlk: 2 + NumLoadsA: 16 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsCoalescedMXSA: 1 + NumLoadsCoalescedMXSB: 1 + NumLoadsMXSA: 1 + NumLoadsMXSB: 1 + NumLoadsPerpendicularA: 16 + NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularMXSA: 1 + NumLoadsPerpendicularMXSB: 1 + NumThreads: 256 + NumTotalPackedLoadsA: -1 + NumTotalPackedLoadsB: -1 + NumTotalPackedLoadsMXSA: -1 + NumTotalPackedLoadsMXSB: -1 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGROverBarrier: 0 + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 21 + 
SolutionNameMin: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT512x64x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT16_2_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM16_WGMXCC2_WGMXCCGn1 + SourceSwap: false + SpaceFillingAlgo: [] + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: true + StoreSyncOpt: 0 + StoreVectorWidth: 4 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SubGroupMXSA: 8 + SubGroupMXSB: 32 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: 0 + TailloopInNll: false + ThreadTile: [1, 1] + ThreadTile0: 64 + ThreadTile1: 2 + ThreadTileA: 64 + ThreadTileB: 2 + ThreadTileMXSA: 64 + ThreadTileMXSB: 2 + TransposeLDS: 1 + TransposeLDSMetadata: -1 + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMXSA: true + UnrollMajorLDSMXSB: true + Use64bShadowLimit: false + Use64bShadowLimitMX: false + UseCustomMainLoopSchedule: 0 + UseDirect32XEmulation: false + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: false + UseGeneralizedNLCOneA: false + UseGeneralizedNLCOneB: false + UseGeneralizedNLCOneMXSA: false + UseGeneralizedNLCOneMXSB: false + UseGeneralizedNLCOneMetadata: false + UseInstOffsetForGRO: 0 + UseMFMAF32XEmulation: false + UsePLRPack: 0 + UseSgprForGRO: -1 + UseSubtileImpl: true + Valid: true + VectorStore: -1 + VectorWidthA: 1 + 
VectorWidthB: 1 + VectorWidthMXSA: 1 + VectorWidthMXSB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMXSA: 0 + WaveSeparateGlobalReadMXSB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 16 + WorkGroupMappingXCC: 2 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 256 + _DepthUA: 256 + _DepthUB: 256 + _DepthUMXSA: 8 + _DepthUMXSB: 8 + _DepthUMetadata: 256 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + enableLDSTrMXSA: false + enableLDSTrMXSB: false + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + tailLoopOptMXSA: false + tailLoopOptMXSB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AdaptiveGemm: 0 + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1DivByMT1LowbitGT1: 0 + AssertFree1ElementMultiple: 1 + AssertKRingShiftTailWrapOnly: 0 + AssertSummationElementMultiple: 256 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BAddrInterleave: false + BaseName: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT12Tzx41lFmgkHO5_GDlXlBjYPnuHfWnU6iLujoAD27WHo= + BufferLoad: true + BufferStore: 1 + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 256 + DirectToLds: 1 + DirectToLdsA: true + DirectToLdsB: true + DirectToLdsMXSA: true + DirectToLdsMXSB: true + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprMXSA: false + DirectToVgprMXSB: false + DirectToVgprSparseMetadata: false + 
DtlPlusLdsBuf: 0 + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: false + ExpertSchedulingMode: 0 + ExtraLatencyForLR: 0 + ExtraMiLatencyLeft: -1 + ForceDisableShadowInit: false + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 32 + GlobalReadVectorWidthB: 32 + GlobalReadVectorWidthMXSA: 4 + GlobalReadVectorWidthMXSB: 10 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + KRingShift: false + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT128x320x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT4_10_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 256 + LSCB: 256 + LSCMXSA: 8 + LSCMXSB: 8 + LSPA: 32 + LSPB: 32 + LSPMXSA: 128 + LSPMXSB: 320 + LVCA: 8 + LVCB: 8 + LVCMXSA: 2 + LVCMXSB: 1 + LVPA: 1 + LVPB: 1 + LVPMXSA: 32 + LVPMXSB: 32 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsBlockSizePerPadMXSA: 0 + LdsBlockSizePerPadMXSB: 0 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 126464 + LdsInitCVgprs: false + LdsNumBytes: 126464 + 
LdsNumElementsAlignedA: 16384 + LdsNumElementsAlignedB: 40960 + LdsNumElementsAlignedMXSA: 1024 + LdsNumElementsAlignedMXSB: 2560 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 19968 + LdsOffsetB_Blk: 85504 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMXSA: 16384 + LdsOffsetMXSA_Blk: 81920 + LdsOffsetMXSB: 17408 + LdsOffsetMXSB_Blk: 82944 + LdsOffsetMetadata: 19968 + LdsOffsetMetadata_Blk: 85504 + LdsPadA: 0 + LdsPadB: 0 + LdsPadMXSA: 0 + LdsPadMXSB: 0 + LdsPadMetadata: 0 + LocalReadVectorWidth: -1 + LocalReadVectorWidthA: 32 + LocalReadVectorWidthB: 32 + LocalReadVectorWidthMXSA: 1 + LocalReadVectorWidthMXSB: 1 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LocalWriteUseSgprMXSA: true + LocalWriteUseSgprMXSB: true + LoopIters: 2 + LoopUnroll: 256 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMXSA: 1 + MIInputPerThreadMXSB: 1 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 10] + MIWaveTileA: 4 + MIWaveTileB: 10 + MIWaveTileMXSA: 4 + MIWaveTileMXSB: 10 + MIWaveTileMetadata: 0 + MacroTile0: 128 + MacroTile1: 320 + MacroTileA: 128 + MacroTileB: 320 + MacroTileMXSA: 128 + MacroTileMXSB: 320 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + MinGRIncPerMfma: 3 + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonDTLTailLoopMXSA: true + NonDTLTailLoopMXSB: true + NonTemporal: -1 + NonTemporalA: 4 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 4 + 
NonTemporalE: 0 + NonTemporalMXSA: 0 + NonTemporalMXSB: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 160 + NumGlobalWriteVectorsPerThread: 160 + NumLdsBlk: 2 + NumLoadsA: 4 + NumLoadsB: 10 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsCoalescedMXSA: 1 + NumLoadsCoalescedMXSB: 1 + NumLoadsMXSA: 1 + NumLoadsMXSB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 10 + NumLoadsPerpendicularMXSA: 1 + NumLoadsPerpendicularMXSB: 1 + NumThreads: 256 + NumTotalPackedLoadsA: -1 + NumTotalPackedLoadsB: -1 + NumTotalPackedLoadsMXSA: -1 + NumTotalPackedLoadsMXSB: -1 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGROverBarrier: 0 + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 22 + SolutionNameMin: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT128x320x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT4_10_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM16_WGMXCC2_WGMXCCGn1 + SourceSwap: false + SpaceFillingAlgo: [] + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 4 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + 
SubGroupB: 32 + SubGroupMXSA: 8 + SubGroupMXSB: 32 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: 0 + TailloopInNll: false + ThreadTile: [1, 1] + ThreadTile0: 16 + ThreadTile1: 10 + ThreadTileA: 16 + ThreadTileB: 10 + ThreadTileMXSA: 16 + ThreadTileMXSB: 10 + TransposeLDS: 1 + TransposeLDSMetadata: -1 + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMXSA: true + UnrollMajorLDSMXSB: true + Use64bShadowLimit: false + Use64bShadowLimitMX: false + UseCustomMainLoopSchedule: 0 + UseDirect32XEmulation: false + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: false + UseGeneralizedNLCOneA: false + UseGeneralizedNLCOneB: false + UseGeneralizedNLCOneMXSA: false + UseGeneralizedNLCOneMXSB: false + UseGeneralizedNLCOneMetadata: false + UseInstOffsetForGRO: 0 + UseMFMAF32XEmulation: false + UsePLRPack: 0 + UseSgprForGRO: -1 + UseSubtileImpl: true + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 1 + VectorWidthMXSA: 1 + VectorWidthMXSB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMXSA: 0 + WaveSeparateGlobalReadMXSB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 16 + WorkGroupMappingXCC: 2 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 256 + _DepthUA: 256 + _DepthUB: 256 + _DepthUMXSA: 8 + _DepthUMXSB: 8 + _DepthUMetadata: 256 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + enableLDSTrMXSA: false + enableLDSTrMXSB: false + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + tailLoopOptMXSA: false + tailLoopOptMXSB: false + - 
1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AdaptiveGemm: 0 + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1DivByMT1LowbitGT1: 0 + AssertFree1ElementMultiple: 1 + AssertKRingShiftTailWrapOnly: 0 + AssertSummationElementMultiple: 256 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BAddrInterleave: false + BaseName: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT329TK1O9xqAG2cUdkJ_6Jkg-kbMKNtFWKGnhC0gVHbf3c= + BufferLoad: true + BufferStore: 1 + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 256 + DirectToLds: 1 + DirectToLdsA: true + DirectToLdsB: true + DirectToLdsMXSA: true + DirectToLdsMXSB: true + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprMXSA: false + DirectToVgprMXSB: false + DirectToVgprSparseMetadata: false + DtlPlusLdsBuf: 0 + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: false + ExpertSchedulingMode: 0 + ExtraLatencyForLR: 0 + ExtraMiLatencyLeft: -1 + ForceDisableShadowInit: false + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 32 + GlobalReadVectorWidthB: 32 + GlobalReadVectorWidthMXSA: 10 + GlobalReadVectorWidthMXSB: 4 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + KRingShift: false + Kernel: true + KernelLanguage: Assembly + KernelNameMin: 
Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT320x128x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT10_4_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 256 + LSCB: 256 + LSCMXSA: 8 + LSCMXSB: 8 + LSPA: 32 + LSPB: 32 + LSPMXSA: 320 + LSPMXSB: 128 + LVCA: 8 + LVCB: 8 + LVCMXSA: 1 + LVCMXSB: 2 + LVPA: 1 + LVPB: 1 + LVPMXSA: 32 + LVPMXSB: 32 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsBlockSizePerPadMXSA: 0 + LdsBlockSizePerPadMXSB: 0 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 126464 + LdsInitCVgprs: false + LdsNumBytes: 126464 + LdsNumElementsAlignedA: 40960 + LdsNumElementsAlignedB: 16384 + LdsNumElementsAlignedMXSA: 2560 + LdsNumElementsAlignedMXSB: 1024 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 44544 + LdsOffsetB_Blk: 110080 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMXSA: 40960 + LdsOffsetMXSA_Blk: 106496 + LdsOffsetMXSB: 43520 + LdsOffsetMXSB_Blk: 109056 + LdsOffsetMetadata: 44544 + LdsOffsetMetadata_Blk: 110080 + LdsPadA: 0 + LdsPadB: 0 + LdsPadMXSA: 0 + LdsPadMXSB: 0 + LdsPadMetadata: 0 + LocalReadVectorWidth: -1 + LocalReadVectorWidthA: 32 + LocalReadVectorWidthB: 32 + LocalReadVectorWidthMXSA: 1 + LocalReadVectorWidthMXSB: 1 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LocalWriteUseSgprMXSA: true + LocalWriteUseSgprMXSB: true + LoopIters: 2 + LoopUnroll: 256 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + 
MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMXSA: 1 + MIInputPerThreadMXSB: 1 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [10, 4] + MIWaveTileA: 10 + MIWaveTileB: 4 + MIWaveTileMXSA: 10 + MIWaveTileMXSB: 4 + MIWaveTileMetadata: 0 + MacroTile0: 320 + MacroTile1: 128 + MacroTileA: 320 + MacroTileB: 128 + MacroTileMXSA: 320 + MacroTileMXSB: 128 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + MinGRIncPerMfma: 3 + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonDTLTailLoopMXSA: true + NonDTLTailLoopMXSB: true + NonTemporal: -1 + NonTemporalA: 4 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 4 + NonTemporalE: 0 + NonTemporalMXSA: 0 + NonTemporalMXSB: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 160 + NumGlobalWriteVectorsPerThread: 160 + NumLdsBlk: 2 + NumLoadsA: 10 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsCoalescedMXSA: 1 + NumLoadsCoalescedMXSB: 1 + NumLoadsMXSA: 1 + NumLoadsMXSB: 1 + NumLoadsPerpendicularA: 10 + NumLoadsPerpendicularB: 4 + NumLoadsPerpendicularMXSA: 1 + NumLoadsPerpendicularMXSB: 1 + NumThreads: 256 + NumTotalPackedLoadsA: -1 + NumTotalPackedLoadsB: -1 + NumTotalPackedLoadsMXSA: -1 + NumTotalPackedLoadsMXSB: -1 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGROverBarrier: 0 + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 23 + 
SolutionNameMin: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT320x128x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT10_4_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM16_WGMXCC2_WGMXCCGn1 + SourceSwap: false + SpaceFillingAlgo: [] + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 4 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SubGroupMXSA: 8 + SubGroupMXSB: 32 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: 0 + TailloopInNll: false + ThreadTile: [1, 1] + ThreadTile0: 40 + ThreadTile1: 4 + ThreadTileA: 40 + ThreadTileB: 4 + ThreadTileMXSA: 40 + ThreadTileMXSB: 4 + TransposeLDS: 1 + TransposeLDSMetadata: -1 + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMXSA: true + UnrollMajorLDSMXSB: true + Use64bShadowLimit: false + Use64bShadowLimitMX: false + UseCustomMainLoopSchedule: 0 + UseDirect32XEmulation: false + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: false + UseGeneralizedNLCOneA: false + UseGeneralizedNLCOneB: false + UseGeneralizedNLCOneMXSA: false + UseGeneralizedNLCOneMXSB: false + UseGeneralizedNLCOneMetadata: false + UseInstOffsetForGRO: 0 + UseMFMAF32XEmulation: false + UsePLRPack: 0 + UseSgprForGRO: -1 + UseSubtileImpl: true + Valid: true + VectorStore: -1 + VectorWidthA: 1 + 
VectorWidthB: 1 + VectorWidthMXSA: 1 + VectorWidthMXSB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMXSA: 0 + WaveSeparateGlobalReadMXSB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 16 + WorkGroupMappingXCC: 2 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 256 + _DepthUA: 256 + _DepthUB: 256 + _DepthUMXSA: 8 + _DepthUMXSB: 8 + _DepthUMetadata: 256 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + enableLDSTrMXSA: false + enableLDSTrMXSB: false + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + tailLoopOptMXSA: false + tailLoopOptMXSB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AdaptiveGemm: 0 + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1DivByMT1LowbitGT1: 0 + AssertFree1ElementMultiple: 1 + AssertKRingShiftTailWrapOnly: 0 + AssertSummationElementMultiple: 256 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BAddrInterleave: false + BaseName: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT12Ko14x9Bm17skxIK2J-9eSMhxZf1WQTws5W6lZ4ij_RU= + BufferLoad: true + BufferStore: 1 + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 256 + DirectToLds: 1 + DirectToLdsA: true + DirectToLdsB: true + DirectToLdsMXSA: true + DirectToLdsMXSB: true + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprMXSA: false + DirectToVgprMXSB: false + DirectToVgprSparseMetadata: false + 
DtlPlusLdsBuf: 0 + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: false + ExpertSchedulingMode: 0 + ExtraLatencyForLR: 0 + ExtraMiLatencyLeft: -1 + ForceDisableShadowInit: false + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 32 + GlobalReadVectorWidthB: 32 + GlobalReadVectorWidthMXSA: 4 + GlobalReadVectorWidthMXSB: 12 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + KRingShift: false + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT128x384x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT4_12_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 256 + LSCB: 256 + LSCMXSA: 8 + LSCMXSB: 8 + LSPA: 32 + LSPB: 32 + LSPMXSA: 128 + LSPMXSB: 384 + LVCA: 8 + LVCB: 8 + LVCMXSA: 2 + LVCMXSB: 1 + LVPA: 1 + LVPB: 1 + LVPMXSA: 32 + LVPMXSB: 32 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsBlockSizePerPadMXSA: 0 + LdsBlockSizePerPadMXSB: 0 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 139264 + LdsInitCVgprs: false + LdsNumBytes: 139264 + 
LdsNumElementsAlignedA: 16384 + LdsNumElementsAlignedB: 49152 + LdsNumElementsAlignedMXSA: 1024 + LdsNumElementsAlignedMXSB: 3072 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 69632 + LdsOffsetB: 20480 + LdsOffsetB_Blk: 90112 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMXSA: 16384 + LdsOffsetMXSA_Blk: 86016 + LdsOffsetMXSB: 17408 + LdsOffsetMXSB_Blk: 87040 + LdsOffsetMetadata: 20480 + LdsOffsetMetadata_Blk: 90112 + LdsPadA: 0 + LdsPadB: 0 + LdsPadMXSA: 0 + LdsPadMXSB: 0 + LdsPadMetadata: 0 + LocalReadVectorWidth: -1 + LocalReadVectorWidthA: 32 + LocalReadVectorWidthB: 32 + LocalReadVectorWidthMXSA: 1 + LocalReadVectorWidthMXSB: 1 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LocalWriteUseSgprMXSA: true + LocalWriteUseSgprMXSB: true + LoopIters: 2 + LoopUnroll: 256 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMXSA: 1 + MIInputPerThreadMXSB: 1 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 12] + MIWaveTileA: 4 + MIWaveTileB: 12 + MIWaveTileMXSA: 4 + MIWaveTileMXSB: 12 + MIWaveTileMetadata: 0 + MacroTile0: 128 + MacroTile1: 384 + MacroTileA: 128 + MacroTileB: 384 + MacroTileMXSA: 128 + MacroTileMXSB: 384 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + MinGRIncPerMfma: 3 + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonDTLTailLoopMXSA: true + NonDTLTailLoopMXSB: true + NonTemporal: -1 + NonTemporalA: 4 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 4 + 
NonTemporalE: 0 + NonTemporalMXSA: 0 + NonTemporalMXSB: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 192 + NumGlobalWriteVectorsPerThread: 192 + NumLdsBlk: 2 + NumLoadsA: 4 + NumLoadsB: 12 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsCoalescedMXSA: 1 + NumLoadsCoalescedMXSB: 1 + NumLoadsMXSA: 1 + NumLoadsMXSB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 12 + NumLoadsPerpendicularMXSA: 1 + NumLoadsPerpendicularMXSB: 1 + NumThreads: 256 + NumTotalPackedLoadsA: -1 + NumTotalPackedLoadsB: -1 + NumTotalPackedLoadsMXSA: -1 + NumTotalPackedLoadsMXSB: -1 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGROverBarrier: 0 + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 24 + SolutionNameMin: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT128x384x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT4_12_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM16_WGMXCC2_WGMXCCGn1 + SourceSwap: false + SpaceFillingAlgo: [] + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: true + StoreSyncOpt: 0 + StoreVectorWidth: 4 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + 
SubGroupB: 32 + SubGroupMXSA: 8 + SubGroupMXSB: 32 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: 0 + TailloopInNll: false + ThreadTile: [1, 1] + ThreadTile0: 16 + ThreadTile1: 12 + ThreadTileA: 16 + ThreadTileB: 12 + ThreadTileMXSA: 16 + ThreadTileMXSB: 12 + TransposeLDS: 1 + TransposeLDSMetadata: -1 + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMXSA: true + UnrollMajorLDSMXSB: true + Use64bShadowLimit: false + Use64bShadowLimitMX: false + UseCustomMainLoopSchedule: 0 + UseDirect32XEmulation: false + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: false + UseGeneralizedNLCOneA: false + UseGeneralizedNLCOneB: false + UseGeneralizedNLCOneMXSA: false + UseGeneralizedNLCOneMXSB: false + UseGeneralizedNLCOneMetadata: false + UseInstOffsetForGRO: 0 + UseMFMAF32XEmulation: false + UsePLRPack: 0 + UseSgprForGRO: -1 + UseSubtileImpl: true + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 1 + VectorWidthMXSA: 1 + VectorWidthMXSB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMXSA: 0 + WaveSeparateGlobalReadMXSB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 16 + WorkGroupMappingXCC: 2 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 256 + _DepthUA: 256 + _DepthUB: 256 + _DepthUMXSA: 8 + _DepthUMXSB: 8 + _DepthUMetadata: 256 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + enableLDSTrMXSA: false + enableLDSTrMXSB: false + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + tailLoopOptMXSA: false + tailLoopOptMXSB: false + - 
1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AdaptiveGemm: 0 + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1DivByMT1LowbitGT1: 0 + AssertFree1ElementMultiple: 1 + AssertKRingShiftTailWrapOnly: 0 + AssertSummationElementMultiple: 256 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BAddrInterleave: false + BaseName: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT38PfCBP4cNoa67MDHKbRKQKP0baWWAor2QL1_1-ioSnxM= + BufferLoad: true + BufferStore: 1 + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 256 + DirectToLds: 1 + DirectToLdsA: true + DirectToLdsB: true + DirectToLdsMXSA: true + DirectToLdsMXSB: true + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprMXSA: false + DirectToVgprMXSB: false + DirectToVgprSparseMetadata: false + DtlPlusLdsBuf: 0 + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: false + ExpertSchedulingMode: 0 + ExtraLatencyForLR: 0 + ExtraMiLatencyLeft: -1 + ForceDisableShadowInit: false + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 32 + GlobalReadVectorWidthB: 32 + GlobalReadVectorWidthMXSA: 12 + GlobalReadVectorWidthMXSB: 4 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + KRingShift: false + Kernel: true + KernelLanguage: Assembly + KernelNameMin: 
Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT384x128x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT12_4_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 256 + LSCB: 256 + LSCMXSA: 8 + LSCMXSB: 8 + LSPA: 32 + LSPB: 32 + LSPMXSA: 384 + LSPMXSB: 128 + LVCA: 8 + LVCB: 8 + LVCMXSA: 1 + LVCMXSB: 2 + LVPA: 1 + LVPB: 1 + LVPMXSA: 32 + LVPMXSB: 32 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsBlockSizePerPadMXSA: 0 + LdsBlockSizePerPadMXSB: 0 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 139264 + LdsInitCVgprs: false + LdsNumBytes: 139264 + LdsNumElementsAlignedA: 49152 + LdsNumElementsAlignedB: 16384 + LdsNumElementsAlignedMXSA: 3072 + LdsNumElementsAlignedMXSB: 1024 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 69632 + LdsOffsetB: 53248 + LdsOffsetB_Blk: 122880 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMXSA: 49152 + LdsOffsetMXSA_Blk: 118784 + LdsOffsetMXSB: 52224 + LdsOffsetMXSB_Blk: 121856 + LdsOffsetMetadata: 53248 + LdsOffsetMetadata_Blk: 122880 + LdsPadA: 0 + LdsPadB: 0 + LdsPadMXSA: 0 + LdsPadMXSB: 0 + LdsPadMetadata: 0 + LocalReadVectorWidth: -1 + LocalReadVectorWidthA: 32 + LocalReadVectorWidthB: 32 + LocalReadVectorWidthMXSA: 1 + LocalReadVectorWidthMXSB: 1 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LocalWriteUseSgprMXSA: true + LocalWriteUseSgprMXSB: true + LoopIters: 2 + LoopUnroll: 256 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + 
MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMXSA: 1 + MIInputPerThreadMXSB: 1 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [12, 4] + MIWaveTileA: 12 + MIWaveTileB: 4 + MIWaveTileMXSA: 12 + MIWaveTileMXSB: 4 + MIWaveTileMetadata: 0 + MacroTile0: 384 + MacroTile1: 128 + MacroTileA: 384 + MacroTileB: 128 + MacroTileMXSA: 384 + MacroTileMXSB: 128 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + MinGRIncPerMfma: 3 + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonDTLTailLoopMXSA: true + NonDTLTailLoopMXSB: true + NonTemporal: -1 + NonTemporalA: 4 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 4 + NonTemporalE: 0 + NonTemporalMXSA: 0 + NonTemporalMXSB: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 192 + NumGlobalWriteVectorsPerThread: 192 + NumLdsBlk: 2 + NumLoadsA: 12 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsCoalescedMXSA: 1 + NumLoadsCoalescedMXSB: 1 + NumLoadsMXSA: 1 + NumLoadsMXSB: 1 + NumLoadsPerpendicularA: 12 + NumLoadsPerpendicularB: 4 + NumLoadsPerpendicularMXSA: 1 + NumLoadsPerpendicularMXSB: 1 + NumThreads: 256 + NumTotalPackedLoadsA: -1 + NumTotalPackedLoadsB: -1 + NumTotalPackedLoadsMXSA: -1 + NumTotalPackedLoadsMXSB: -1 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGROverBarrier: 0 + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 25 + 
SolutionNameMin: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT384x128x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT12_4_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM16_WGMXCC2_WGMXCCGn1 + SourceSwap: false + SpaceFillingAlgo: [] + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: true + StoreSyncOpt: 0 + StoreVectorWidth: 4 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SubGroupMXSA: 8 + SubGroupMXSB: 32 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: 0 + TailloopInNll: false + ThreadTile: [1, 1] + ThreadTile0: 48 + ThreadTile1: 4 + ThreadTileA: 48 + ThreadTileB: 4 + ThreadTileMXSA: 48 + ThreadTileMXSB: 4 + TransposeLDS: 1 + TransposeLDSMetadata: -1 + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMXSA: true + UnrollMajorLDSMXSB: true + Use64bShadowLimit: false + Use64bShadowLimitMX: false + UseCustomMainLoopSchedule: 0 + UseDirect32XEmulation: false + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: false + UseGeneralizedNLCOneA: false + UseGeneralizedNLCOneB: false + UseGeneralizedNLCOneMXSA: false + UseGeneralizedNLCOneMXSB: false + UseGeneralizedNLCOneMetadata: false + UseInstOffsetForGRO: 0 + UseMFMAF32XEmulation: false + UsePLRPack: 0 + UseSgprForGRO: -1 + UseSubtileImpl: true + Valid: true + VectorStore: -1 + VectorWidthA: 1 + 
VectorWidthB: 1 + VectorWidthMXSA: 1 + VectorWidthMXSB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMXSA: 0 + WaveSeparateGlobalReadMXSB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 16 + WorkGroupMappingXCC: 2 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 256 + _DepthUA: 256 + _DepthUB: 256 + _DepthUMXSA: 8 + _DepthUMXSB: 8 + _DepthUMetadata: 256 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + enableLDSTrMXSA: false + enableLDSTrMXSB: false + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + tailLoopOptMXSA: false + tailLoopOptMXSB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AdaptiveGemm: 0 + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1DivByMT1LowbitGT1: 0 + AssertFree1ElementMultiple: 1 + AssertKRingShiftTailWrapOnly: 0 + AssertSummationElementMultiple: 256 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BAddrInterleave: false + BaseName: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT32GHMtx7cclUEZq8HXp3EV83TsDigTCOy4HpQzcwTdNNQ= + BufferLoad: true + BufferStore: 1 + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 256 + DirectToLds: 1 + DirectToLdsA: true + DirectToLdsB: true + DirectToLdsMXSA: true + DirectToLdsMXSB: true + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprMXSA: false + DirectToVgprMXSB: false + DirectToVgprSparseMetadata: false + 
DtlPlusLdsBuf: 0 + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: false + ExpertSchedulingMode: 0 + ExtraLatencyForLR: 0 + ExtraMiLatencyLeft: -1 + ForceDisableShadowInit: false + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 32 + GlobalReadVectorWidthB: 32 + GlobalReadVectorWidthMXSA: 10 + GlobalReadVectorWidthMXSB: 4 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + KRingShift: false + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT320x256x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT10_8_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 256 + LSCB: 256 + LSCMXSA: 8 + LSCMXSB: 8 + LSPA: 32 + LSPB: 32 + LSPMXSA: 320 + LSPMXSB: 128 + LVCA: 8 + LVCB: 8 + LVCMXSA: 1 + LVCMXSB: 2 + LVPA: 1 + LVPB: 1 + LVPMXSA: 32 + LVPMXSB: 32 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsBlockSizePerPadMXSA: 0 + LdsBlockSizePerPadMXSB: 0 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 156672 + LdsInitCVgprs: false + LdsNumBytes: 156672 + 
LdsNumElementsAlignedA: 40960 + LdsNumElementsAlignedB: 32768 + LdsNumElementsAlignedMXSA: 2560 + LdsNumElementsAlignedMXSB: 2048 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 78336 + LdsOffsetB: 45568 + LdsOffsetB_Blk: 123904 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMXSA: 40960 + LdsOffsetMXSA_Blk: 119296 + LdsOffsetMXSB: 43520 + LdsOffsetMXSB_Blk: 121856 + LdsOffsetMetadata: 45568 + LdsOffsetMetadata_Blk: 123904 + LdsPadA: 0 + LdsPadB: 0 + LdsPadMXSA: 0 + LdsPadMXSB: 0 + LdsPadMetadata: 0 + LocalReadVectorWidth: -1 + LocalReadVectorWidthA: 32 + LocalReadVectorWidthB: 32 + LocalReadVectorWidthMXSA: 1 + LocalReadVectorWidthMXSB: 1 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LocalWriteUseSgprMXSA: true + LocalWriteUseSgprMXSB: true + LoopIters: 2 + LoopUnroll: 256 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMXSA: 1 + MIInputPerThreadMXSB: 1 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [10, 8] + MIWaveTileA: 10 + MIWaveTileB: 8 + MIWaveTileMXSA: 10 + MIWaveTileMXSB: 8 + MIWaveTileMetadata: 0 + MacroTile0: 320 + MacroTile1: 256 + MacroTileA: 320 + MacroTileB: 256 + MacroTileMXSA: 320 + MacroTileMXSB: 256 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + MinGRIncPerMfma: 3 + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonDTLTailLoopMXSA: true + NonDTLTailLoopMXSB: true + NonTemporal: -1 + NonTemporalA: 4 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 4 + 
NonTemporalE: 0 + NonTemporalMXSA: 0 + NonTemporalMXSB: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 320 + NumGlobalWriteVectorsPerThread: 320 + NumLdsBlk: 2 + NumLoadsA: 10 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsCoalescedMXSA: 1 + NumLoadsCoalescedMXSB: 1 + NumLoadsMXSA: 1 + NumLoadsMXSB: 2 + NumLoadsPerpendicularA: 10 + NumLoadsPerpendicularB: 8 + NumLoadsPerpendicularMXSA: 1 + NumLoadsPerpendicularMXSB: 2 + NumThreads: 256 + NumTotalPackedLoadsA: -1 + NumTotalPackedLoadsB: -1 + NumTotalPackedLoadsMXSA: -1 + NumTotalPackedLoadsMXSB: -1 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGROverBarrier: 0 + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 26 + SolutionNameMin: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT320x256x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT10_8_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM16_WGMXCC2_WGMXCCGn1 + SourceSwap: false + SpaceFillingAlgo: [] + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: true + StoreSyncOpt: 0 + StoreVectorWidth: 4 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + 
SubGroupB: 32 + SubGroupMXSA: 8 + SubGroupMXSB: 32 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: 0 + TailloopInNll: false + ThreadTile: [1, 1] + ThreadTile0: 40 + ThreadTile1: 8 + ThreadTileA: 40 + ThreadTileB: 8 + ThreadTileMXSA: 40 + ThreadTileMXSB: 8 + TransposeLDS: 1 + TransposeLDSMetadata: -1 + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMXSA: true + UnrollMajorLDSMXSB: true + Use64bShadowLimit: false + Use64bShadowLimitMX: false + UseCustomMainLoopSchedule: 0 + UseDirect32XEmulation: false + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: false + UseGeneralizedNLCOneA: false + UseGeneralizedNLCOneB: false + UseGeneralizedNLCOneMXSA: false + UseGeneralizedNLCOneMXSB: false + UseGeneralizedNLCOneMetadata: false + UseInstOffsetForGRO: 0 + UseMFMAF32XEmulation: false + UsePLRPack: 0 + UseSgprForGRO: -1 + UseSubtileImpl: true + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 1 + VectorWidthMXSA: 1 + VectorWidthMXSB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMXSA: 0 + WaveSeparateGlobalReadMXSB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 16 + WorkGroupMappingXCC: 2 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 256 + _DepthUA: 256 + _DepthUB: 256 + _DepthUMXSA: 8 + _DepthUMXSB: 8 + _DepthUMetadata: 256 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + enableLDSTrMXSA: false + enableLDSTrMXSB: false + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + tailLoopOptMXSA: false + tailLoopOptMXSB: false + - 1LDSBuffer: 
0 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AdaptiveGemm: 0 + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1DivByMT1LowbitGT1: 0 + AssertFree1ElementMultiple: 1 + AssertKRingShiftTailWrapOnly: 0 + AssertSummationElementMultiple: 256 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BAddrInterleave: false + BaseName: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT25yENsNJL4B9Vb5GlLD7ofBX9eYUJtt3JL_VzKdaPOs5g= + BufferLoad: true + BufferStore: 1 + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 256 + DirectToLds: 1 + DirectToLdsA: true + DirectToLdsB: true + DirectToLdsMXSA: true + DirectToLdsMXSB: true + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprMXSA: false + DirectToVgprMXSB: false + DirectToVgprSparseMetadata: false + DtlPlusLdsBuf: 0 + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: false + ExpertSchedulingMode: 0 + ExtraLatencyForLR: 0 + ExtraMiLatencyLeft: -1 + ForceDisableShadowInit: false + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 32 + GlobalReadVectorWidthB: 32 + GlobalReadVectorWidthMXSA: 4 + GlobalReadVectorWidthMXSB: 10 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + KRingShift: false + Kernel: true + KernelLanguage: Assembly + KernelNameMin: 
Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT256x320x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT8_10_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 256 + LSCB: 256 + LSCMXSA: 8 + LSCMXSB: 8 + LSPA: 32 + LSPB: 32 + LSPMXSA: 128 + LSPMXSB: 320 + LVCA: 8 + LVCB: 8 + LVCMXSA: 2 + LVCMXSB: 1 + LVPA: 1 + LVPB: 1 + LVPMXSA: 32 + LVPMXSB: 32 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsBlockSizePerPadMXSA: 0 + LdsBlockSizePerPadMXSB: 0 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 156672 + LdsInitCVgprs: false + LdsNumBytes: 156672 + LdsNumElementsAlignedA: 32768 + LdsNumElementsAlignedB: 40960 + LdsNumElementsAlignedMXSA: 2048 + LdsNumElementsAlignedMXSB: 2560 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 78336 + LdsOffsetB: 37376 + LdsOffsetB_Blk: 115712 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMXSA: 32768 + LdsOffsetMXSA_Blk: 111104 + LdsOffsetMXSB: 34816 + LdsOffsetMXSB_Blk: 113152 + LdsOffsetMetadata: 37376 + LdsOffsetMetadata_Blk: 115712 + LdsPadA: 0 + LdsPadB: 0 + LdsPadMXSA: 0 + LdsPadMXSB: 0 + LdsPadMetadata: 0 + LocalReadVectorWidth: -1 + LocalReadVectorWidthA: 32 + LocalReadVectorWidthB: 32 + LocalReadVectorWidthMXSA: 1 + LocalReadVectorWidthMXSB: 1 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LocalWriteUseSgprMXSA: true + LocalWriteUseSgprMXSB: true + LoopIters: 2 + LoopUnroll: 256 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + 
MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMXSA: 1 + MIInputPerThreadMXSB: 1 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [8, 10] + MIWaveTileA: 8 + MIWaveTileB: 10 + MIWaveTileMXSA: 8 + MIWaveTileMXSB: 10 + MIWaveTileMetadata: 0 + MacroTile0: 256 + MacroTile1: 320 + MacroTileA: 256 + MacroTileB: 320 + MacroTileMXSA: 256 + MacroTileMXSB: 320 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + MinGRIncPerMfma: 3 + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonDTLTailLoopMXSA: true + NonDTLTailLoopMXSB: true + NonTemporal: -1 + NonTemporalA: 4 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 4 + NonTemporalE: 0 + NonTemporalMXSA: 0 + NonTemporalMXSB: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 320 + NumGlobalWriteVectorsPerThread: 320 + NumLdsBlk: 2 + NumLoadsA: 8 + NumLoadsB: 10 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsCoalescedMXSA: 1 + NumLoadsCoalescedMXSB: 1 + NumLoadsMXSA: 2 + NumLoadsMXSB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 10 + NumLoadsPerpendicularMXSA: 2 + NumLoadsPerpendicularMXSB: 1 + NumThreads: 256 + NumTotalPackedLoadsA: -1 + NumTotalPackedLoadsB: -1 + NumTotalPackedLoadsMXSA: -1 + NumTotalPackedLoadsMXSB: -1 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGROverBarrier: 0 + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 27 + 
SolutionNameMin: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT256x320x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT8_10_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM16_WGMXCC2_WGMXCCGn1 + SourceSwap: false + SpaceFillingAlgo: [] + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: true + StoreSyncOpt: 0 + StoreVectorWidth: 4 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SubGroupMXSA: 8 + SubGroupMXSB: 32 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: 0 + TailloopInNll: false + ThreadTile: [1, 1] + ThreadTile0: 32 + ThreadTile1: 10 + ThreadTileA: 32 + ThreadTileB: 10 + ThreadTileMXSA: 32 + ThreadTileMXSB: 10 + TransposeLDS: 1 + TransposeLDSMetadata: -1 + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMXSA: true + UnrollMajorLDSMXSB: true + Use64bShadowLimit: false + Use64bShadowLimitMX: false + UseCustomMainLoopSchedule: 0 + UseDirect32XEmulation: false + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: false + UseGeneralizedNLCOneA: false + UseGeneralizedNLCOneB: false + UseGeneralizedNLCOneMXSA: false + UseGeneralizedNLCOneMXSB: false + UseGeneralizedNLCOneMetadata: false + UseInstOffsetForGRO: 0 + UseMFMAF32XEmulation: false + UsePLRPack: 0 + UseSgprForGRO: -1 + UseSubtileImpl: true + Valid: true + VectorStore: -1 + VectorWidthA: 1 
+ VectorWidthB: 1 + VectorWidthMXSA: 1 + VectorWidthMXSB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMXSA: 0 + WaveSeparateGlobalReadMXSB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 16 + WorkGroupMappingXCC: 2 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 256 + _DepthUA: 256 + _DepthUB: 256 + _DepthUMXSA: 8 + _DepthUMXSB: 8 + _DepthUMetadata: 256 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + enableLDSTrMXSA: false + enableLDSTrMXSB: false + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + tailLoopOptMXSA: false + tailLoopOptMXSB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AdaptiveGemm: 0 + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1DivByMT1LowbitGT1: 0 + AssertFree1ElementMultiple: 1 + AssertKRingShiftTailWrapOnly: 0 + AssertSummationElementMultiple: 256 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BAddrInterleave: false + BaseName: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT64IqbWxZkPcZWy3vA8g9HsDHfQ7n72vJptvmQ79eUQ7kk= + BufferLoad: true + BufferStore: 1 + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 256 + DirectToLds: 1 + DirectToLdsA: true + DirectToLdsB: true + DirectToLdsMXSA: true + DirectToLdsMXSB: true + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprMXSA: false + DirectToVgprMXSB: false + DirectToVgprSparseMetadata: false 
+ DtlPlusLdsBuf: 0 + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: false + ExpertSchedulingMode: 0 + ExtraLatencyForLR: 0 + ExtraMiLatencyLeft: -1 + ForceDisableShadowInit: false + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 32 + GlobalReadVectorWidthB: 32 + GlobalReadVectorWidthMXSA: 2 + GlobalReadVectorWidthMXSB: 14 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + KRingShift: false + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT64x448x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT2_14_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 256 + LSCB: 256 + LSCMXSA: 8 + LSCMXSB: 8 + LSPA: 32 + LSPB: 32 + LSPMXSA: 64 + LSPMXSB: 448 + LVCA: 8 + LVCB: 8 + LVCMXSA: 4 + LVCMXSB: 1 + LVPA: 1 + LVPB: 1 + LVPMXSA: 32 + LVPMXSB: 32 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsBlockSizePerPadMXSA: 0 + LdsBlockSizePerPadMXSB: 0 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 139264 + LdsInitCVgprs: false + LdsNumBytes: 139264 + 
LdsNumElementsAlignedA: 8192 + LdsNumElementsAlignedB: 57344 + LdsNumElementsAlignedMXSA: 512 + LdsNumElementsAlignedMXSB: 3584 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 69632 + LdsOffsetB: 12288 + LdsOffsetB_Blk: 81920 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMXSA: 8192 + LdsOffsetMXSA_Blk: 77824 + LdsOffsetMXSB: 8704 + LdsOffsetMXSB_Blk: 78336 + LdsOffsetMetadata: 12288 + LdsOffsetMetadata_Blk: 81920 + LdsPadA: 0 + LdsPadB: 0 + LdsPadMXSA: 0 + LdsPadMXSB: 0 + LdsPadMetadata: 0 + LocalReadVectorWidth: -1 + LocalReadVectorWidthA: 32 + LocalReadVectorWidthB: 32 + LocalReadVectorWidthMXSA: 1 + LocalReadVectorWidthMXSB: 1 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LocalWriteUseSgprMXSA: true + LocalWriteUseSgprMXSB: true + LoopIters: 2 + LoopUnroll: 256 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMXSA: 1 + MIInputPerThreadMXSB: 1 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [2, 14] + MIWaveTileA: 2 + MIWaveTileB: 14 + MIWaveTileMXSA: 2 + MIWaveTileMXSB: 14 + MIWaveTileMetadata: 0 + MacroTile0: 64 + MacroTile1: 448 + MacroTileA: 64 + MacroTileB: 448 + MacroTileMXSA: 64 + MacroTileMXSB: 448 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + MinGRIncPerMfma: 3 + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonDTLTailLoopMXSA: true + NonDTLTailLoopMXSB: true + NonTemporal: -1 + NonTemporalA: 4 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 4 + NonTemporalE: 0 + 
NonTemporalMXSA: 0 + NonTemporalMXSB: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 112 + NumGlobalWriteVectorsPerThread: 112 + NumLdsBlk: 2 + NumLoadsA: 2 + NumLoadsB: 14 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsCoalescedMXSA: 1 + NumLoadsCoalescedMXSB: 1 + NumLoadsMXSA: 1 + NumLoadsMXSB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 14 + NumLoadsPerpendicularMXSA: 1 + NumLoadsPerpendicularMXSB: 1 + NumThreads: 256 + NumTotalPackedLoadsA: -1 + NumTotalPackedLoadsB: -1 + NumTotalPackedLoadsMXSA: -1 + NumTotalPackedLoadsMXSB: -1 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGROverBarrier: 0 + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 28 + SolutionNameMin: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT64x448x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT2_14_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM16_WGMXCC2_WGMXCCGn1 + SourceSwap: false + SpaceFillingAlgo: [] + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: true + StoreSyncOpt: 0 + StoreVectorWidth: 4 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + 
SubGroupMXSA: 8 + SubGroupMXSB: 32 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: 0 + TailloopInNll: false + ThreadTile: [1, 1] + ThreadTile0: 8 + ThreadTile1: 14 + ThreadTileA: 8 + ThreadTileB: 14 + ThreadTileMXSA: 8 + ThreadTileMXSB: 14 + TransposeLDS: 1 + TransposeLDSMetadata: -1 + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMXSA: true + UnrollMajorLDSMXSB: true + Use64bShadowLimit: false + Use64bShadowLimitMX: false + UseCustomMainLoopSchedule: 0 + UseDirect32XEmulation: false + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: false + UseGeneralizedNLCOneA: false + UseGeneralizedNLCOneB: false + UseGeneralizedNLCOneMXSA: false + UseGeneralizedNLCOneMXSB: false + UseGeneralizedNLCOneMetadata: false + UseInstOffsetForGRO: 0 + UseMFMAF32XEmulation: false + UsePLRPack: 0 + UseSgprForGRO: -1 + UseSubtileImpl: true + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 1 + VectorWidthMXSA: 1 + VectorWidthMXSB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMXSA: 0 + WaveSeparateGlobalReadMXSB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 16 + WorkGroupMappingXCC: 2 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 256 + _DepthUA: 256 + _DepthUB: 256 + _DepthUMXSA: 8 + _DepthUMXSB: 8 + _DepthUMetadata: 256 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + enableLDSTrMXSA: false + enableLDSTrMXSB: false + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + tailLoopOptMXSA: false + tailLoopOptMXSB: false + - 1LDSBuffer: 0 + 
ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AdaptiveGemm: 0 + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1DivByMT1LowbitGT1: 0 + AssertFree1ElementMultiple: 1 + AssertKRingShiftTailWrapOnly: 0 + AssertSummationElementMultiple: 256 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BAddrInterleave: false + BaseName: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT44JIOkXLbpjhS22SFIEupmgY9bhtzcXu69kfhwBJBO5wo= + BufferLoad: true + BufferStore: 1 + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 256 + DirectToLds: 1 + DirectToLdsA: true + DirectToLdsB: true + DirectToLdsMXSA: true + DirectToLdsMXSB: true + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprMXSA: false + DirectToVgprMXSB: false + DirectToVgprSparseMetadata: false + DtlPlusLdsBuf: 0 + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: false + ExpertSchedulingMode: 0 + ExtraLatencyForLR: 0 + ExtraMiLatencyLeft: -1 + ForceDisableShadowInit: false + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 32 + GlobalReadVectorWidthB: 32 + GlobalReadVectorWidthMXSA: 14 + GlobalReadVectorWidthMXSB: 2 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + KRingShift: false + Kernel: true + KernelLanguage: Assembly + KernelNameMin: 
Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT448x64x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT14_2_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 256 + LSCB: 256 + LSCMXSA: 8 + LSCMXSB: 8 + LSPA: 32 + LSPB: 32 + LSPMXSA: 448 + LSPMXSB: 64 + LVCA: 8 + LVCB: 8 + LVCMXSA: 1 + LVCMXSB: 4 + LVPA: 1 + LVPB: 1 + LVPMXSA: 32 + LVPMXSB: 32 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsBlockSizePerPadMXSA: 0 + LdsBlockSizePerPadMXSB: 0 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 139264 + LdsInitCVgprs: false + LdsNumBytes: 139264 + LdsNumElementsAlignedA: 57344 + LdsNumElementsAlignedB: 8192 + LdsNumElementsAlignedMXSA: 3584 + LdsNumElementsAlignedMXSB: 512 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 69632 + LdsOffsetB: 61440 + LdsOffsetB_Blk: 131072 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMXSA: 57344 + LdsOffsetMXSA_Blk: 126976 + LdsOffsetMXSB: 60928 + LdsOffsetMXSB_Blk: 130560 + LdsOffsetMetadata: 61440 + LdsOffsetMetadata_Blk: 131072 + LdsPadA: 0 + LdsPadB: 0 + LdsPadMXSA: 0 + LdsPadMXSB: 0 + LdsPadMetadata: 0 + LocalReadVectorWidth: -1 + LocalReadVectorWidthA: 32 + LocalReadVectorWidthB: 32 + LocalReadVectorWidthMXSA: 1 + LocalReadVectorWidthMXSB: 1 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LocalWriteUseSgprMXSA: true + LocalWriteUseSgprMXSB: true + LoopIters: 2 + LoopUnroll: 256 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + 
MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMXSA: 1 + MIInputPerThreadMXSB: 1 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [14, 2] + MIWaveTileA: 14 + MIWaveTileB: 2 + MIWaveTileMXSA: 14 + MIWaveTileMXSB: 2 + MIWaveTileMetadata: 0 + MacroTile0: 448 + MacroTile1: 64 + MacroTileA: 448 + MacroTileB: 64 + MacroTileMXSA: 448 + MacroTileMXSB: 64 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + MinGRIncPerMfma: 3 + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonDTLTailLoopMXSA: true + NonDTLTailLoopMXSB: true + NonTemporal: -1 + NonTemporalA: 4 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 4 + NonTemporalE: 0 + NonTemporalMXSA: 0 + NonTemporalMXSB: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 112 + NumGlobalWriteVectorsPerThread: 112 + NumLdsBlk: 2 + NumLoadsA: 14 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsCoalescedMXSA: 1 + NumLoadsCoalescedMXSB: 1 + NumLoadsMXSA: 1 + NumLoadsMXSB: 1 + NumLoadsPerpendicularA: 14 + NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularMXSA: 1 + NumLoadsPerpendicularMXSB: 1 + NumThreads: 256 + NumTotalPackedLoadsA: -1 + NumTotalPackedLoadsB: -1 + NumTotalPackedLoadsMXSA: -1 + NumTotalPackedLoadsMXSB: -1 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGROverBarrier: 0 + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 29 + 
SolutionNameMin: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT448x64x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT14_2_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM16_WGMXCC2_WGMXCCGn1 + SourceSwap: false + SpaceFillingAlgo: [] + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: true + StoreSyncOpt: 0 + StoreVectorWidth: 4 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SubGroupMXSA: 8 + SubGroupMXSB: 32 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: 0 + TailloopInNll: false + ThreadTile: [1, 1] + ThreadTile0: 56 + ThreadTile1: 2 + ThreadTileA: 56 + ThreadTileB: 2 + ThreadTileMXSA: 56 + ThreadTileMXSB: 2 + TransposeLDS: 1 + TransposeLDSMetadata: -1 + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMXSA: true + UnrollMajorLDSMXSB: true + Use64bShadowLimit: false + Use64bShadowLimitMX: false + UseCustomMainLoopSchedule: 0 + UseDirect32XEmulation: false + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: false + UseGeneralizedNLCOneA: false + UseGeneralizedNLCOneB: false + UseGeneralizedNLCOneMXSA: false + UseGeneralizedNLCOneMXSB: false + UseGeneralizedNLCOneMetadata: false + UseInstOffsetForGRO: 0 + UseMFMAF32XEmulation: false + UsePLRPack: 0 + UseSgprForGRO: -1 + UseSubtileImpl: true + Valid: true + VectorStore: -1 + VectorWidthA: 1 + 
VectorWidthB: 1 + VectorWidthMXSA: 1 + VectorWidthMXSB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMXSA: 0 + WaveSeparateGlobalReadMXSB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 16 + WorkGroupMappingXCC: 2 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 256 + _DepthUA: 256 + _DepthUB: 256 + _DepthUMXSA: 8 + _DepthUMXSB: 8 + _DepthUMetadata: 256 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + enableLDSTrMXSA: false + enableLDSTrMXSB: false + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + tailLoopOptMXSA: false + tailLoopOptMXSB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AdaptiveGemm: 0 + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1DivByMT1LowbitGT1: 0 + AssertFree1ElementMultiple: 1 + AssertKRingShiftTailWrapOnly: 0 + AssertSummationElementMultiple: 256 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BAddrInterleave: false + BaseName: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT12NCUb78G7QzN6ng2b3-8rphTRtJXo20S1KvmwH6k1FLQ= + BufferLoad: true + BufferStore: 1 + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 256 + DirectToLds: 1 + DirectToLdsA: true + DirectToLdsB: true + DirectToLdsMXSA: true + DirectToLdsMXSB: true + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprMXSA: false + DirectToVgprMXSB: false + DirectToVgprSparseMetadata: false + 
DtlPlusLdsBuf: 0 + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: false + ExpertSchedulingMode: 0 + ExtraLatencyForLR: 0 + ExtraMiLatencyLeft: -1 + ForceDisableShadowInit: false + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 32 + GlobalReadVectorWidthB: 32 + GlobalReadVectorWidthMXSA: 4 + GlobalReadVectorWidthMXSB: 14 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + KRingShift: false + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT128x448x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT4_14_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 256 + LSCB: 256 + LSCMXSA: 8 + LSCMXSB: 8 + LSPA: 32 + LSPB: 32 + LSPMXSA: 128 + LSPMXSB: 448 + LVCA: 8 + LVCB: 8 + LVCMXSA: 2 + LVCMXSB: 1 + LVPA: 1 + LVPB: 1 + LVPMXSA: 32 + LVPMXSB: 32 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsBlockSizePerPadMXSA: 0 + LdsBlockSizePerPadMXSB: 0 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 156672 + LdsInitCVgprs: false + LdsNumBytes: 156672 + 
LdsNumElementsAlignedA: 16384 + LdsNumElementsAlignedB: 57344 + LdsNumElementsAlignedMXSA: 1024 + LdsNumElementsAlignedMXSB: 3584 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 78336 + LdsOffsetB: 20992 + LdsOffsetB_Blk: 99328 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMXSA: 16384 + LdsOffsetMXSA_Blk: 94720 + LdsOffsetMXSB: 17408 + LdsOffsetMXSB_Blk: 95744 + LdsOffsetMetadata: 20992 + LdsOffsetMetadata_Blk: 99328 + LdsPadA: 0 + LdsPadB: 0 + LdsPadMXSA: 0 + LdsPadMXSB: 0 + LdsPadMetadata: 0 + LocalReadVectorWidth: -1 + LocalReadVectorWidthA: 32 + LocalReadVectorWidthB: 32 + LocalReadVectorWidthMXSA: 1 + LocalReadVectorWidthMXSB: 1 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LocalWriteUseSgprMXSA: true + LocalWriteUseSgprMXSB: true + LoopIters: 2 + LoopUnroll: 256 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMXSA: 1 + MIInputPerThreadMXSB: 1 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 14] + MIWaveTileA: 4 + MIWaveTileB: 14 + MIWaveTileMXSA: 4 + MIWaveTileMXSB: 14 + MIWaveTileMetadata: 0 + MacroTile0: 128 + MacroTile1: 448 + MacroTileA: 128 + MacroTileB: 448 + MacroTileMXSA: 128 + MacroTileMXSB: 448 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + MinGRIncPerMfma: 3 + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonDTLTailLoopMXSA: true + NonDTLTailLoopMXSB: true + NonTemporal: -1 + NonTemporalA: 4 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 4 + 
NonTemporalE: 0 + NonTemporalMXSA: 0 + NonTemporalMXSB: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 224 + NumGlobalWriteVectorsPerThread: 224 + NumLdsBlk: 2 + NumLoadsA: 4 + NumLoadsB: 14 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsCoalescedMXSA: 1 + NumLoadsCoalescedMXSB: 1 + NumLoadsMXSA: 1 + NumLoadsMXSB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 14 + NumLoadsPerpendicularMXSA: 1 + NumLoadsPerpendicularMXSB: 1 + NumThreads: 256 + NumTotalPackedLoadsA: -1 + NumTotalPackedLoadsB: -1 + NumTotalPackedLoadsMXSA: -1 + NumTotalPackedLoadsMXSB: -1 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGROverBarrier: 0 + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 30 + SolutionNameMin: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT128x448x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT4_14_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM16_WGMXCC2_WGMXCCGn1 + SourceSwap: false + SpaceFillingAlgo: [] + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: true + StoreSyncOpt: 0 + StoreVectorWidth: 4 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + 
SubGroupB: 32 + SubGroupMXSA: 8 + SubGroupMXSB: 32 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: 0 + TailloopInNll: false + ThreadTile: [1, 1] + ThreadTile0: 16 + ThreadTile1: 14 + ThreadTileA: 16 + ThreadTileB: 14 + ThreadTileMXSA: 16 + ThreadTileMXSB: 14 + TransposeLDS: 1 + TransposeLDSMetadata: -1 + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMXSA: true + UnrollMajorLDSMXSB: true + Use64bShadowLimit: false + Use64bShadowLimitMX: false + UseCustomMainLoopSchedule: 0 + UseDirect32XEmulation: false + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: false + UseGeneralizedNLCOneA: false + UseGeneralizedNLCOneB: false + UseGeneralizedNLCOneMXSA: false + UseGeneralizedNLCOneMXSB: false + UseGeneralizedNLCOneMetadata: false + UseInstOffsetForGRO: 0 + UseMFMAF32XEmulation: false + UsePLRPack: 0 + UseSgprForGRO: -1 + UseSubtileImpl: true + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 1 + VectorWidthMXSA: 1 + VectorWidthMXSB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMXSA: 0 + WaveSeparateGlobalReadMXSB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 16 + WorkGroupMappingXCC: 2 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 256 + _DepthUA: 256 + _DepthUB: 256 + _DepthUMXSA: 8 + _DepthUMXSB: 8 + _DepthUMetadata: 256 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + enableLDSTrMXSA: false + enableLDSTrMXSB: false + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + tailLoopOptMXSA: false + tailLoopOptMXSB: false + - 
1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AdaptiveGemm: 0 + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1DivByMT1LowbitGT1: 0 + AssertFree1ElementMultiple: 1 + AssertKRingShiftTailWrapOnly: 0 + AssertSummationElementMultiple: 256 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BAddrInterleave: false + BaseName: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT44K2rcrzW2gkNghqcoHIxk3lU0xSXGNO6q0t-lzqdm54M= + BufferLoad: true + BufferStore: 1 + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 256 + DirectToLds: 1 + DirectToLdsA: true + DirectToLdsB: true + DirectToLdsMXSA: true + DirectToLdsMXSB: true + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprMXSA: false + DirectToVgprMXSB: false + DirectToVgprSparseMetadata: false + DtlPlusLdsBuf: 0 + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: false + ExpertSchedulingMode: 0 + ExtraLatencyForLR: 0 + ExtraMiLatencyLeft: -1 + ForceDisableShadowInit: false + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 32 + GlobalReadVectorWidthB: 32 + GlobalReadVectorWidthMXSA: 14 + GlobalReadVectorWidthMXSB: 4 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + KRingShift: false + Kernel: true + KernelLanguage: Assembly + KernelNameMin: 
Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT448x128x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT14_4_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 256 + LSCB: 256 + LSCMXSA: 8 + LSCMXSB: 8 + LSPA: 32 + LSPB: 32 + LSPMXSA: 448 + LSPMXSB: 128 + LVCA: 8 + LVCB: 8 + LVCMXSA: 1 + LVCMXSB: 2 + LVPA: 1 + LVPB: 1 + LVPMXSA: 32 + LVPMXSB: 32 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsBlockSizePerPadMXSA: 0 + LdsBlockSizePerPadMXSB: 0 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 156672 + LdsInitCVgprs: false + LdsNumBytes: 156672 + LdsNumElementsAlignedA: 57344 + LdsNumElementsAlignedB: 16384 + LdsNumElementsAlignedMXSA: 3584 + LdsNumElementsAlignedMXSB: 1024 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 78336 + LdsOffsetB: 61952 + LdsOffsetB_Blk: 140288 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMXSA: 57344 + LdsOffsetMXSA_Blk: 135680 + LdsOffsetMXSB: 60928 + LdsOffsetMXSB_Blk: 139264 + LdsOffsetMetadata: 61952 + LdsOffsetMetadata_Blk: 140288 + LdsPadA: 0 + LdsPadB: 0 + LdsPadMXSA: 0 + LdsPadMXSB: 0 + LdsPadMetadata: 0 + LocalReadVectorWidth: -1 + LocalReadVectorWidthA: 32 + LocalReadVectorWidthB: 32 + LocalReadVectorWidthMXSA: 1 + LocalReadVectorWidthMXSB: 1 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LocalWriteUseSgprMXSA: true + LocalWriteUseSgprMXSB: true + LoopIters: 2 + LoopUnroll: 256 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + 
MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMXSA: 1 + MIInputPerThreadMXSB: 1 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [14, 4] + MIWaveTileA: 14 + MIWaveTileB: 4 + MIWaveTileMXSA: 14 + MIWaveTileMXSB: 4 + MIWaveTileMetadata: 0 + MacroTile0: 448 + MacroTile1: 128 + MacroTileA: 448 + MacroTileB: 128 + MacroTileMXSA: 448 + MacroTileMXSB: 128 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + MinGRIncPerMfma: 3 + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonDTLTailLoopMXSA: true + NonDTLTailLoopMXSB: true + NonTemporal: -1 + NonTemporalA: 4 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 4 + NonTemporalE: 0 + NonTemporalMXSA: 0 + NonTemporalMXSB: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 224 + NumGlobalWriteVectorsPerThread: 224 + NumLdsBlk: 2 + NumLoadsA: 14 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsCoalescedMXSA: 1 + NumLoadsCoalescedMXSB: 1 + NumLoadsMXSA: 1 + NumLoadsMXSB: 1 + NumLoadsPerpendicularA: 14 + NumLoadsPerpendicularB: 4 + NumLoadsPerpendicularMXSA: 1 + NumLoadsPerpendicularMXSB: 1 + NumThreads: 256 + NumTotalPackedLoadsA: -1 + NumTotalPackedLoadsB: -1 + NumTotalPackedLoadsMXSA: -1 + NumTotalPackedLoadsMXSB: -1 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGROverBarrier: 0 + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 31 + 
SolutionNameMin: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT448x128x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT14_4_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM16_WGMXCC2_WGMXCCGn1 + SourceSwap: false + SpaceFillingAlgo: [] + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: true + StoreSyncOpt: 0 + StoreVectorWidth: 4 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SubGroupMXSA: 8 + SubGroupMXSB: 32 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: 0 + TailloopInNll: false + ThreadTile: [1, 1] + ThreadTile0: 56 + ThreadTile1: 4 + ThreadTileA: 56 + ThreadTileB: 4 + ThreadTileMXSA: 56 + ThreadTileMXSB: 4 + TransposeLDS: 1 + TransposeLDSMetadata: -1 + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMXSA: true + UnrollMajorLDSMXSB: true + Use64bShadowLimit: false + Use64bShadowLimitMX: false + UseCustomMainLoopSchedule: 0 + UseDirect32XEmulation: false + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: false + UseGeneralizedNLCOneA: false + UseGeneralizedNLCOneB: false + UseGeneralizedNLCOneMXSA: false + UseGeneralizedNLCOneMXSB: false + UseGeneralizedNLCOneMetadata: false + UseInstOffsetForGRO: 0 + UseMFMAF32XEmulation: false + UsePLRPack: 0 + UseSgprForGRO: -1 + UseSubtileImpl: true + Valid: true + VectorStore: -1 + VectorWidthA: 1 + 
VectorWidthB: 1 + VectorWidthMXSA: 1 + VectorWidthMXSB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMXSA: 0 + WaveSeparateGlobalReadMXSB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 16 + WorkGroupMappingXCC: 2 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 256 + _DepthUA: 256 + _DepthUB: 256 + _DepthUMXSA: 8 + _DepthUMXSB: 8 + _DepthUMetadata: 256 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + enableLDSTrMXSA: false + enableLDSTrMXSB: false + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + tailLoopOptMXSA: false + tailLoopOptMXSB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AdaptiveGemm: 0 + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1DivByMT1LowbitGT1: 0 + AssertFree1ElementMultiple: 1 + AssertKRingShiftTailWrapOnly: 0 + AssertSummationElementMultiple: 256 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BAddrInterleave: false + BaseName: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT197rqXtfRTrgckooWIHV9QRZkAUvBYDajyR7fyXPSaOfI= + BufferLoad: true + BufferStore: 1 + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 256 + DirectToLds: 1 + DirectToLdsA: true + DirectToLdsB: true + DirectToLdsMXSA: true + DirectToLdsMXSB: true + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprMXSA: false + DirectToVgprMXSB: false + DirectToVgprSparseMetadata: false + 
DtlPlusLdsBuf: 0 + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: false + ExpertSchedulingMode: 0 + ExtraLatencyForLR: 0 + ExtraMiLatencyLeft: -1 + ForceDisableShadowInit: false + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 32 + GlobalReadVectorWidthB: 32 + GlobalReadVectorWidthMXSA: 6 + GlobalReadVectorWidthMXSB: 12 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + KRingShift: false + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT192x384x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT6_12_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 256 + LSCB: 256 + LSCMXSA: 8 + LSCMXSB: 8 + LSPA: 32 + LSPB: 32 + LSPMXSA: 192 + LSPMXSB: 384 + LVCA: 8 + LVCB: 8 + LVCMXSA: 2 + LVCMXSB: 1 + LVPA: 1 + LVPB: 1 + LVPMXSA: 32 + LVPMXSB: 32 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsBlockSizePerPadMXSA: 0 + LdsBlockSizePerPadMXSB: 0 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 156672 + LdsInitCVgprs: false + LdsNumBytes: 156672 + 
LdsNumElementsAlignedA: 24576 + LdsNumElementsAlignedB: 49152 + LdsNumElementsAlignedMXSA: 1536 + LdsNumElementsAlignedMXSB: 3072 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 78336 + LdsOffsetB: 29184 + LdsOffsetB_Blk: 107520 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMXSA: 24576 + LdsOffsetMXSA_Blk: 102912 + LdsOffsetMXSB: 26112 + LdsOffsetMXSB_Blk: 104448 + LdsOffsetMetadata: 29184 + LdsOffsetMetadata_Blk: 107520 + LdsPadA: 0 + LdsPadB: 0 + LdsPadMXSA: 0 + LdsPadMXSB: 0 + LdsPadMetadata: 0 + LocalReadVectorWidth: -1 + LocalReadVectorWidthA: 32 + LocalReadVectorWidthB: 32 + LocalReadVectorWidthMXSA: 1 + LocalReadVectorWidthMXSB: 1 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LocalWriteUseSgprMXSA: true + LocalWriteUseSgprMXSB: true + LoopIters: 2 + LoopUnroll: 256 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMXSA: 1 + MIInputPerThreadMXSB: 1 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [6, 12] + MIWaveTileA: 6 + MIWaveTileB: 12 + MIWaveTileMXSA: 6 + MIWaveTileMXSB: 12 + MIWaveTileMetadata: 0 + MacroTile0: 192 + MacroTile1: 384 + MacroTileA: 192 + MacroTileB: 384 + MacroTileMXSA: 192 + MacroTileMXSB: 384 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + MinGRIncPerMfma: 3 + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonDTLTailLoopMXSA: true + NonDTLTailLoopMXSB: true + NonTemporal: -1 + NonTemporalA: 4 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 4 + 
NonTemporalE: 0 + NonTemporalMXSA: 0 + NonTemporalMXSB: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 288 + NumGlobalWriteVectorsPerThread: 288 + NumLdsBlk: 2 + NumLoadsA: 6 + NumLoadsB: 12 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsCoalescedMXSA: 1 + NumLoadsCoalescedMXSB: 1 + NumLoadsMXSA: 1 + NumLoadsMXSB: 1 + NumLoadsPerpendicularA: 6 + NumLoadsPerpendicularB: 12 + NumLoadsPerpendicularMXSA: 1 + NumLoadsPerpendicularMXSB: 1 + NumThreads: 256 + NumTotalPackedLoadsA: -1 + NumTotalPackedLoadsB: -1 + NumTotalPackedLoadsMXSA: -1 + NumTotalPackedLoadsMXSB: -1 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGROverBarrier: 0 + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 32 + SolutionNameMin: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT192x384x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT6_12_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM16_WGMXCC2_WGMXCCGn1 + SourceSwap: false + SpaceFillingAlgo: [] + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: true + StoreSyncOpt: 0 + StoreVectorWidth: 4 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + 
SubGroupB: 32 + SubGroupMXSA: 8 + SubGroupMXSB: 32 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: 0 + TailloopInNll: false + ThreadTile: [1, 1] + ThreadTile0: 24 + ThreadTile1: 12 + ThreadTileA: 24 + ThreadTileB: 12 + ThreadTileMXSA: 24 + ThreadTileMXSB: 12 + TransposeLDS: 1 + TransposeLDSMetadata: -1 + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMXSA: true + UnrollMajorLDSMXSB: true + Use64bShadowLimit: false + Use64bShadowLimitMX: false + UseCustomMainLoopSchedule: 0 + UseDirect32XEmulation: false + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: false + UseGeneralizedNLCOneA: false + UseGeneralizedNLCOneB: false + UseGeneralizedNLCOneMXSA: false + UseGeneralizedNLCOneMXSB: false + UseGeneralizedNLCOneMetadata: false + UseInstOffsetForGRO: 0 + UseMFMAF32XEmulation: false + UsePLRPack: 0 + UseSgprForGRO: -1 + UseSubtileImpl: true + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 1 + VectorWidthMXSA: 1 + VectorWidthMXSB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMXSA: 0 + WaveSeparateGlobalReadMXSB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 16 + WorkGroupMappingXCC: 2 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 256 + _DepthUA: 256 + _DepthUB: 256 + _DepthUMXSA: 8 + _DepthUMXSB: 8 + _DepthUMetadata: 256 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + enableLDSTrMXSA: false + enableLDSTrMXSB: false + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + tailLoopOptMXSA: false + tailLoopOptMXSB: false + - 
1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AdaptiveGemm: 0 + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1DivByMT1LowbitGT1: 0 + AssertFree1ElementMultiple: 1 + AssertKRingShiftTailWrapOnly: 0 + AssertSummationElementMultiple: 256 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BAddrInterleave: false + BaseName: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT38M4Hg58ewW76BBGjsIKCCfd9HK3CXsd22WqnddQrMdIo= + BufferLoad: true + BufferStore: 1 + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 256 + DirectToLds: 1 + DirectToLdsA: true + DirectToLdsB: true + DirectToLdsMXSA: true + DirectToLdsMXSB: true + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprMXSA: false + DirectToVgprMXSB: false + DirectToVgprSparseMetadata: false + DtlPlusLdsBuf: 0 + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: false + ExpertSchedulingMode: 0 + ExtraLatencyForLR: 0 + ExtraMiLatencyLeft: -1 + ForceDisableShadowInit: false + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 32 + GlobalReadVectorWidthB: 32 + GlobalReadVectorWidthMXSA: 12 + GlobalReadVectorWidthMXSB: 6 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + KRingShift: false + Kernel: true + KernelLanguage: Assembly + KernelNameMin: 
Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT384x192x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT12_6_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 256 + LSCB: 256 + LSCMXSA: 8 + LSCMXSB: 8 + LSPA: 32 + LSPB: 32 + LSPMXSA: 384 + LSPMXSB: 192 + LVCA: 8 + LVCB: 8 + LVCMXSA: 1 + LVCMXSB: 2 + LVPA: 1 + LVPB: 1 + LVPMXSA: 32 + LVPMXSB: 32 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsBlockSizePerPadMXSA: 0 + LdsBlockSizePerPadMXSB: 0 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 156672 + LdsInitCVgprs: false + LdsNumBytes: 156672 + LdsNumElementsAlignedA: 49152 + LdsNumElementsAlignedB: 24576 + LdsNumElementsAlignedMXSA: 3072 + LdsNumElementsAlignedMXSB: 1536 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 78336 + LdsOffsetB: 53760 + LdsOffsetB_Blk: 132096 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMXSA: 49152 + LdsOffsetMXSA_Blk: 127488 + LdsOffsetMXSB: 52224 + LdsOffsetMXSB_Blk: 130560 + LdsOffsetMetadata: 53760 + LdsOffsetMetadata_Blk: 132096 + LdsPadA: 0 + LdsPadB: 0 + LdsPadMXSA: 0 + LdsPadMXSB: 0 + LdsPadMetadata: 0 + LocalReadVectorWidth: -1 + LocalReadVectorWidthA: 32 + LocalReadVectorWidthB: 32 + LocalReadVectorWidthMXSA: 1 + LocalReadVectorWidthMXSB: 1 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LocalWriteUseSgprMXSA: true + LocalWriteUseSgprMXSB: true + LoopIters: 2 + LoopUnroll: 256 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + 
MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMXSA: 1 + MIInputPerThreadMXSB: 1 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [12, 6] + MIWaveTileA: 12 + MIWaveTileB: 6 + MIWaveTileMXSA: 12 + MIWaveTileMXSB: 6 + MIWaveTileMetadata: 0 + MacroTile0: 384 + MacroTile1: 192 + MacroTileA: 384 + MacroTileB: 192 + MacroTileMXSA: 384 + MacroTileMXSB: 192 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + MinGRIncPerMfma: 3 + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonDTLTailLoopMXSA: true + NonDTLTailLoopMXSB: true + NonTemporal: -1 + NonTemporalA: 4 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 4 + NonTemporalE: 0 + NonTemporalMXSA: 0 + NonTemporalMXSB: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 288 + NumGlobalWriteVectorsPerThread: 288 + NumLdsBlk: 2 + NumLoadsA: 12 + NumLoadsB: 6 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsCoalescedMXSA: 1 + NumLoadsCoalescedMXSB: 1 + NumLoadsMXSA: 1 + NumLoadsMXSB: 1 + NumLoadsPerpendicularA: 12 + NumLoadsPerpendicularB: 6 + NumLoadsPerpendicularMXSA: 1 + NumLoadsPerpendicularMXSB: 1 + NumThreads: 256 + NumTotalPackedLoadsA: -1 + NumTotalPackedLoadsB: -1 + NumTotalPackedLoadsMXSA: -1 + NumTotalPackedLoadsMXSB: -1 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGROverBarrier: 0 + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 33 + 
SolutionNameMin: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT384x192x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT12_6_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM16_WGMXCC2_WGMXCCGn1 + SourceSwap: false + SpaceFillingAlgo: [] + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: true + StoreSyncOpt: 0 + StoreVectorWidth: 4 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SubGroupMXSA: 8 + SubGroupMXSB: 32 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: 0 + TailloopInNll: false + ThreadTile: [1, 1] + ThreadTile0: 48 + ThreadTile1: 6 + ThreadTileA: 48 + ThreadTileB: 6 + ThreadTileMXSA: 48 + ThreadTileMXSB: 6 + TransposeLDS: 1 + TransposeLDSMetadata: -1 + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMXSA: true + UnrollMajorLDSMXSB: true + Use64bShadowLimit: false + Use64bShadowLimitMX: false + UseCustomMainLoopSchedule: 0 + UseDirect32XEmulation: false + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: false + UseGeneralizedNLCOneA: false + UseGeneralizedNLCOneB: false + UseGeneralizedNLCOneMXSA: false + UseGeneralizedNLCOneMXSB: false + UseGeneralizedNLCOneMetadata: false + UseInstOffsetForGRO: 0 + UseMFMAF32XEmulation: false + UsePLRPack: 0 + UseSgprForGRO: -1 + UseSubtileImpl: true + Valid: true + VectorStore: -1 + VectorWidthA: 1 + 
VectorWidthB: 1 + VectorWidthMXSA: 1 + VectorWidthMXSB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMXSA: 0 + WaveSeparateGlobalReadMXSB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 16 + WorkGroupMappingXCC: 2 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 256 + _DepthUA: 256 + _DepthUB: 256 + _DepthUMXSA: 8 + _DepthUMXSB: 8 + _DepthUMetadata: 256 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + enableLDSTrMXSA: false + enableLDSTrMXSB: false + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + tailLoopOptMXSA: false + tailLoopOptMXSB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AdaptiveGemm: 0 + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1DivByMT1LowbitGT1: 0 + AssertFree1ElementMultiple: 1 + AssertKRingShiftTailWrapOnly: 0 + AssertSummationElementMultiple: 256 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BAddrInterleave: false + BaseName: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT12pSBqfd5dzj78eX1t5EQJzYZ5aHFVtQ9uJHS361sw3qY= + BufferLoad: true + BufferStore: 1 + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 256 + DirectToLds: 1 + DirectToLdsA: true + DirectToLdsB: true + DirectToLdsMXSA: true + DirectToLdsMXSB: true + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprMXSA: false + DirectToVgprMXSB: false + DirectToVgprSparseMetadata: false + 
DtlPlusLdsBuf: 0 + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: false + ExpertSchedulingMode: 0 + ExtraLatencyForLR: 0 + ExtraMiLatencyLeft: -1 + ForceDisableShadowInit: false + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 32 + GlobalReadVectorWidthB: 32 + GlobalReadVectorWidthMXSA: 4 + GlobalReadVectorWidthMXSB: 1 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + KRingShift: false + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT128x32x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT2_2_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1 + LDSTrInst: false + LSCA: 256 + LSCB: 256 + LSCMXSA: 8 + LSCMXSB: 8 + LSPA: 32 + LSPB: 32 + LSPMXSA: 128 + LSPMXSB: 32 + LVCA: 8 + LVCB: 8 + LVCMXSA: 2 + LVCMXSB: 8 + LVPA: 1 + LVPB: 1 + LVPMXSA: 32 + LVPMXSB: 32 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsBlockSizePerPadMXSA: 0 + LdsBlockSizePerPadMXSB: 0 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 54528 + LdsInitCVgprs: false + LdsNumBytes: 54528 + 
LdsNumElementsAlignedA: 16384 + LdsNumElementsAlignedB: 4096 + LdsNumElementsAlignedMXSA: 1024 + LdsNumElementsAlignedMXSB: 256 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 32768 + LdsOffsetB: 17664 + LdsOffsetB_Blk: 50432 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMXSA: 16384 + LdsOffsetMXSA_Blk: 49152 + LdsOffsetMXSB: 17408 + LdsOffsetMXSB_Blk: 50176 + LdsOffsetMetadata: 17664 + LdsOffsetMetadata_Blk: 50432 + LdsPadA: 0 + LdsPadB: 0 + LdsPadMXSA: 0 + LdsPadMXSB: 0 + LdsPadMetadata: 0 + LocalReadVectorWidth: -1 + LocalReadVectorWidthA: 32 + LocalReadVectorWidthB: 32 + LocalReadVectorWidthMXSA: 1 + LocalReadVectorWidthMXSB: 1 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LocalWriteUseSgprMXSA: true + LocalWriteUseSgprMXSB: true + LoopIters: 2 + LoopUnroll: 256 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMXSA: 1 + MIInputPerThreadMXSB: 1 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [4, 1] + MIWaveTile: [2, 2] + MIWaveTileA: 2 + MIWaveTileB: 2 + MIWaveTileMXSA: 2 + MIWaveTileMXSB: 2 + MIWaveTileMetadata: 0 + MacroTile0: 128 + MacroTile1: 32 + MacroTileA: 128 + MacroTileB: 32 + MacroTileMXSA: 128 + MacroTileMXSB: 32 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + MinGRIncPerMfma: 3 + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonDTLTailLoopMXSA: true + NonDTLTailLoopMXSB: true + NonTemporal: -1 + NonTemporalA: 4 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 4 + NonTemporalE: 0 + 
NonTemporalMXSA: 0 + NonTemporalMXSB: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 16 + NumLdsBlk: 2 + NumLoadsA: 4 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsCoalescedMXSA: 1 + NumLoadsCoalescedMXSB: 1 + NumLoadsMXSA: 1 + NumLoadsMXSB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularMXSA: 1 + NumLoadsPerpendicularMXSB: 1 + NumThreads: 256 + NumTotalPackedLoadsA: -1 + NumTotalPackedLoadsB: -1 + NumTotalPackedLoadsMXSA: -1 + NumTotalPackedLoadsMXSB: -1 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGROverBarrier: 0 + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 34 + SolutionNameMin: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT128x32x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT2_2_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM16_WGMXCC2_WGMXCCGn1 + SourceSwap: false + SpaceFillingAlgo: [] + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 4 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SubGroupMXSA: 
16 + SubGroupMXSB: 16 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: 0 + TailloopInNll: false + ThreadTile: [1, 1] + ThreadTile0: 8 + ThreadTile1: 2 + ThreadTileA: 8 + ThreadTileB: 2 + ThreadTileMXSA: 8 + ThreadTileMXSB: 2 + TransposeLDS: 1 + TransposeLDSMetadata: -1 + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMXSA: true + UnrollMajorLDSMXSB: true + Use64bShadowLimit: false + Use64bShadowLimitMX: false + UseCustomMainLoopSchedule: 0 + UseDirect32XEmulation: false + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: false + UseGeneralizedNLCOneA: false + UseGeneralizedNLCOneB: false + UseGeneralizedNLCOneMXSA: false + UseGeneralizedNLCOneMXSB: false + UseGeneralizedNLCOneMetadata: false + UseInstOffsetForGRO: 0 + UseMFMAF32XEmulation: false + UsePLRPack: 0 + UseSgprForGRO: -1 + UseSubtileImpl: true + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 1 + VectorWidthMXSA: 1 + VectorWidthMXSB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMXSA: 0 + WaveSeparateGlobalReadMXSB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [64, 4, 1] + WorkGroupMapping: 16 + WorkGroupMappingXCC: 2 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 256 + _DepthUA: 256 + _DepthUB: 256 + _DepthUMXSA: 8 + _DepthUMXSB: 8 + _DepthUMetadata: 256 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + enableLDSTrMXSA: false + enableLDSTrMXSB: false + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + tailLoopOptMXSA: false + tailLoopOptMXSB: false + - 1LDSBuffer: 0 + ActivationAlt: false + 
ActivationFuncCall: false + ActivationFused: true + AdaptiveGemm: 0 + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1DivByMT1LowbitGT1: 0 + AssertFree1ElementMultiple: 1 + AssertKRingShiftTailWrapOnly: 0 + AssertSummationElementMultiple: 256 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BAddrInterleave: false + BaseName: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT12lYJO4P9-MC57Obi2kvcZAOwe_1ysdtWgX7-VIBNgQpw= + BufferLoad: true + BufferStore: 1 + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 256 + DirectToLds: 1 + DirectToLdsA: true + DirectToLdsB: true + DirectToLdsMXSA: true + DirectToLdsMXSB: true + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprMXSA: false + DirectToVgprMXSB: false + DirectToVgprSparseMetadata: false + DtlPlusLdsBuf: 0 + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: false + ExpertSchedulingMode: 0 + ExtraLatencyForLR: 0 + ExtraMiLatencyLeft: -1 + ForceDisableShadowInit: false + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 32 + GlobalReadVectorWidthB: 32 + GlobalReadVectorWidthMXSA: 4 + GlobalReadVectorWidthMXSB: 3 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + KRingShift: false + Kernel: true + KernelLanguage: Assembly + KernelNameMin: 
Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT128x96x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT2_6_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1 + LDSTrInst: false + LSCA: 256 + LSCB: 256 + LSCMXSA: 8 + LSCMXSB: 8 + LSPA: 32 + LSPB: 32 + LSPMXSA: 128 + LSPMXSB: 96 + LVCA: 8 + LVCB: 8 + LVCMXSA: 2 + LVCMXSB: 3 + LVPA: 1 + LVPB: 1 + LVPMXSA: 32 + LVPMXSB: 32 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsBlockSizePerPadMXSA: 0 + LdsBlockSizePerPadMXSB: 0 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 63232 + LdsInitCVgprs: false + LdsNumBytes: 63232 + LdsNumElementsAlignedA: 16384 + LdsNumElementsAlignedB: 12288 + LdsNumElementsAlignedMXSA: 1024 + LdsNumElementsAlignedMXSB: 768 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 32768 + LdsOffsetB: 18176 + LdsOffsetB_Blk: 50944 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMXSA: 16384 + LdsOffsetMXSA_Blk: 49152 + LdsOffsetMXSB: 17408 + LdsOffsetMXSB_Blk: 50176 + LdsOffsetMetadata: 18176 + LdsOffsetMetadata_Blk: 50944 + LdsPadA: 0 + LdsPadB: 0 + LdsPadMXSA: 0 + LdsPadMXSB: 0 + LdsPadMetadata: 0 + LocalReadVectorWidth: -1 + LocalReadVectorWidthA: 32 + LocalReadVectorWidthB: 32 + LocalReadVectorWidthMXSA: 1 + LocalReadVectorWidthMXSB: 1 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LocalWriteUseSgprMXSA: true + LocalWriteUseSgprMXSB: true + LoopIters: 2 + LoopUnroll: 256 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + 
MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMXSA: 1 + MIInputPerThreadMXSB: 1 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [4, 1] + MIWaveTile: [2, 6] + MIWaveTileA: 2 + MIWaveTileB: 6 + MIWaveTileMXSA: 2 + MIWaveTileMXSB: 6 + MIWaveTileMetadata: 0 + MacroTile0: 128 + MacroTile1: 96 + MacroTileA: 128 + MacroTileB: 96 + MacroTileMXSA: 128 + MacroTileMXSB: 96 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + MinGRIncPerMfma: 3 + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonDTLTailLoopMXSA: true + NonDTLTailLoopMXSB: true + NonTemporal: -1 + NonTemporalA: 4 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 4 + NonTemporalE: 0 + NonTemporalMXSA: 0 + NonTemporalMXSB: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 48 + NumGlobalWriteVectorsPerThread: 48 + NumLdsBlk: 2 + NumLoadsA: 4 + NumLoadsB: 3 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsCoalescedMXSA: 1 + NumLoadsCoalescedMXSB: 1 + NumLoadsMXSA: 1 + NumLoadsMXSB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 3 + NumLoadsPerpendicularMXSA: 1 + NumLoadsPerpendicularMXSB: 1 + NumThreads: 256 + NumTotalPackedLoadsA: -1 + NumTotalPackedLoadsB: -1 + NumTotalPackedLoadsMXSA: -1 + NumTotalPackedLoadsMXSB: -1 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGROverBarrier: 0 + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 35 + SolutionNameMin: 
Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT128x96x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT2_6_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM16_WGMXCC2_WGMXCCGn1 + SourceSwap: false + SpaceFillingAlgo: [] + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 4 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SubGroupMXSA: 16 + SubGroupMXSB: 16 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: 0 + TailloopInNll: false + ThreadTile: [1, 1] + ThreadTile0: 8 + ThreadTile1: 6 + ThreadTileA: 8 + ThreadTileB: 6 + ThreadTileMXSA: 8 + ThreadTileMXSB: 6 + TransposeLDS: 1 + TransposeLDSMetadata: -1 + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMXSA: true + UnrollMajorLDSMXSB: true + Use64bShadowLimit: false + Use64bShadowLimitMX: false + UseCustomMainLoopSchedule: 0 + UseDirect32XEmulation: false + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: false + UseGeneralizedNLCOneA: false + UseGeneralizedNLCOneB: false + UseGeneralizedNLCOneMXSA: false + UseGeneralizedNLCOneMXSB: false + UseGeneralizedNLCOneMetadata: false + UseInstOffsetForGRO: 0 + UseMFMAF32XEmulation: false + UsePLRPack: 0 + UseSgprForGRO: -1 + UseSubtileImpl: true + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 1 + 
VectorWidthMXSA: 1 + VectorWidthMXSB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMXSA: 0 + WaveSeparateGlobalReadMXSB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [64, 4, 1] + WorkGroupMapping: 16 + WorkGroupMappingXCC: 2 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 256 + _DepthUA: 256 + _DepthUB: 256 + _DepthUMXSA: 8 + _DepthUMXSB: 8 + _DepthUMetadata: 256 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + enableLDSTrMXSA: false + enableLDSTrMXSB: false + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + tailLoopOptMXSA: false + tailLoopOptMXSB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AdaptiveGemm: 0 + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1DivByMT1LowbitGT1: 0 + AssertFree1ElementMultiple: 1 + AssertKRingShiftTailWrapOnly: 0 + AssertSummationElementMultiple: 256 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BAddrInterleave: false + BaseName: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT25ccxCqQ2dvGclrJYn0QqExpGJSq56GiHUJPL0jzB0qVw= + BufferLoad: true + BufferStore: 1 + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 256 + DirectToLds: 1 + DirectToLdsA: true + DirectToLdsB: true + DirectToLdsMXSA: true + DirectToLdsMXSB: true + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprMXSA: false + DirectToVgprMXSB: false + DirectToVgprSparseMetadata: false + DtlPlusLdsBuf: 0 
+ EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: false + ExpertSchedulingMode: 0 + ExtraLatencyForLR: 0 + ExtraMiLatencyLeft: -1 + ForceDisableShadowInit: false + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 32 + GlobalReadVectorWidthB: 32 + GlobalReadVectorWidthMXSA: 4 + GlobalReadVectorWidthMXSB: 1 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + KRingShift: false + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT256x32x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT4_2_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1 + LDSTrInst: false + LSCA: 256 + LSCB: 256 + LSCMXSA: 8 + LSCMXSB: 8 + LSPA: 32 + LSPB: 32 + LSPMXSA: 128 + LSPMXSB: 32 + LVCA: 8 + LVCB: 8 + LVCMXSA: 2 + LVCMXSB: 8 + LVPA: 1 + LVPB: 1 + LVPMXSA: 32 + LVPMXSB: 32 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsBlockSizePerPadMXSA: 0 + LdsBlockSizePerPadMXSB: 0 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 104704 + LdsInitCVgprs: false + LdsNumBytes: 104704 + LdsNumElementsAlignedA: 32768 + 
LdsNumElementsAlignedB: 4096 + LdsNumElementsAlignedMXSA: 2048 + LdsNumElementsAlignedMXSB: 256 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 35072 + LdsOffsetB_Blk: 100608 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMXSA: 32768 + LdsOffsetMXSA_Blk: 98304 + LdsOffsetMXSB: 34816 + LdsOffsetMXSB_Blk: 100352 + LdsOffsetMetadata: 35072 + LdsOffsetMetadata_Blk: 100608 + LdsPadA: 0 + LdsPadB: 0 + LdsPadMXSA: 0 + LdsPadMXSB: 0 + LdsPadMetadata: 0 + LocalReadVectorWidth: -1 + LocalReadVectorWidthA: 32 + LocalReadVectorWidthB: 32 + LocalReadVectorWidthMXSA: 1 + LocalReadVectorWidthMXSB: 1 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LocalWriteUseSgprMXSA: true + LocalWriteUseSgprMXSB: true + LoopIters: 2 + LoopUnroll: 256 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMXSA: 1 + MIInputPerThreadMXSB: 1 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [4, 1] + MIWaveTile: [4, 2] + MIWaveTileA: 4 + MIWaveTileB: 2 + MIWaveTileMXSA: 4 + MIWaveTileMXSB: 2 + MIWaveTileMetadata: 0 + MacroTile0: 256 + MacroTile1: 32 + MacroTileA: 256 + MacroTileB: 32 + MacroTileMXSA: 256 + MacroTileMXSB: 32 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + MinGRIncPerMfma: 3 + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonDTLTailLoopMXSA: true + NonDTLTailLoopMXSB: true + NonTemporal: -1 + NonTemporalA: 4 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 4 + NonTemporalE: 0 + NonTemporalMXSA: 0 + 
NonTemporalMXSB: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 32 + NumLdsBlk: 2 + NumLoadsA: 8 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsCoalescedMXSA: 1 + NumLoadsCoalescedMXSB: 1 + NumLoadsMXSA: 2 + NumLoadsMXSB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularMXSA: 2 + NumLoadsPerpendicularMXSB: 1 + NumThreads: 256 + NumTotalPackedLoadsA: -1 + NumTotalPackedLoadsB: -1 + NumTotalPackedLoadsMXSA: -1 + NumTotalPackedLoadsMXSB: -1 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGROverBarrier: 0 + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 36 + SolutionNameMin: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT256x32x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT4_2_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM16_WGMXCC2_WGMXCCGn1 + SourceSwap: false + SpaceFillingAlgo: [] + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 4 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SubGroupMXSA: 16 + SubGroupMXSB: 16 
+ SuppressNoLoadLoop: false + SwapGlobalReadOrder: 0 + TailloopInNll: false + ThreadTile: [1, 1] + ThreadTile0: 16 + ThreadTile1: 2 + ThreadTileA: 16 + ThreadTileB: 2 + ThreadTileMXSA: 16 + ThreadTileMXSB: 2 + TransposeLDS: 1 + TransposeLDSMetadata: -1 + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMXSA: true + UnrollMajorLDSMXSB: true + Use64bShadowLimit: false + Use64bShadowLimitMX: false + UseCustomMainLoopSchedule: 0 + UseDirect32XEmulation: false + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: false + UseGeneralizedNLCOneA: false + UseGeneralizedNLCOneB: false + UseGeneralizedNLCOneMXSA: false + UseGeneralizedNLCOneMXSB: false + UseGeneralizedNLCOneMetadata: false + UseInstOffsetForGRO: 0 + UseMFMAF32XEmulation: false + UsePLRPack: 0 + UseSgprForGRO: -1 + UseSubtileImpl: true + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 1 + VectorWidthMXSA: 1 + VectorWidthMXSB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMXSA: 0 + WaveSeparateGlobalReadMXSB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [64, 4, 1] + WorkGroupMapping: 16 + WorkGroupMappingXCC: 2 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 256 + _DepthUA: 256 + _DepthUB: 256 + _DepthUMXSA: 8 + _DepthUMXSB: 8 + _DepthUMetadata: 256 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + enableLDSTrMXSA: false + enableLDSTrMXSB: false + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + tailLoopOptMXSA: false + tailLoopOptMXSB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: 
false + ActivationFused: true + AdaptiveGemm: 0 + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1DivByMT1LowbitGT1: 0 + AssertFree1ElementMultiple: 1 + AssertKRingShiftTailWrapOnly: 0 + AssertSummationElementMultiple: 256 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BAddrInterleave: false + BaseName: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT25LD47Rd1FCRkyTzzAeZkhMOPqncN77I3VtqjftUOo3W0= + BufferLoad: true + BufferStore: 1 + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 256 + DirectToLds: 1 + DirectToLdsA: true + DirectToLdsB: true + DirectToLdsMXSA: true + DirectToLdsMXSB: true + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprMXSA: false + DirectToVgprMXSB: false + DirectToVgprSparseMetadata: false + DtlPlusLdsBuf: 0 + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: false + ExpertSchedulingMode: 0 + ExtraLatencyForLR: 0 + ExtraMiLatencyLeft: -1 + ForceDisableShadowInit: false + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 32 + GlobalReadVectorWidthB: 32 + GlobalReadVectorWidthMXSA: 4 + GlobalReadVectorWidthMXSB: 3 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + KRingShift: false + Kernel: true + KernelLanguage: Assembly + KernelNameMin: 
Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT256x96x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT4_6_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1 + LDSTrInst: false + LSCA: 256 + LSCB: 256 + LSCMXSA: 8 + LSCMXSB: 8 + LSPA: 32 + LSPB: 32 + LSPMXSA: 128 + LSPMXSB: 96 + LVCA: 8 + LVCB: 8 + LVCMXSA: 2 + LVCMXSB: 3 + LVPA: 1 + LVPB: 1 + LVPMXSA: 32 + LVPMXSB: 32 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsBlockSizePerPadMXSA: 0 + LdsBlockSizePerPadMXSB: 0 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 113408 + LdsInitCVgprs: false + LdsNumBytes: 113408 + LdsNumElementsAlignedA: 32768 + LdsNumElementsAlignedB: 12288 + LdsNumElementsAlignedMXSA: 2048 + LdsNumElementsAlignedMXSB: 768 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 35584 + LdsOffsetB_Blk: 101120 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMXSA: 32768 + LdsOffsetMXSA_Blk: 98304 + LdsOffsetMXSB: 34816 + LdsOffsetMXSB_Blk: 100352 + LdsOffsetMetadata: 35584 + LdsOffsetMetadata_Blk: 101120 + LdsPadA: 0 + LdsPadB: 0 + LdsPadMXSA: 0 + LdsPadMXSB: 0 + LdsPadMetadata: 0 + LocalReadVectorWidth: -1 + LocalReadVectorWidthA: 32 + LocalReadVectorWidthB: 32 + LocalReadVectorWidthMXSA: 1 + LocalReadVectorWidthMXSB: 1 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LocalWriteUseSgprMXSA: true + LocalWriteUseSgprMXSB: true + LoopIters: 2 + LoopUnroll: 256 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + 
MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMXSA: 1 + MIInputPerThreadMXSB: 1 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [4, 1] + MIWaveTile: [4, 6] + MIWaveTileA: 4 + MIWaveTileB: 6 + MIWaveTileMXSA: 4 + MIWaveTileMXSB: 6 + MIWaveTileMetadata: 0 + MacroTile0: 256 + MacroTile1: 96 + MacroTileA: 256 + MacroTileB: 96 + MacroTileMXSA: 256 + MacroTileMXSB: 96 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + MinGRIncPerMfma: 3 + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonDTLTailLoopMXSA: true + NonDTLTailLoopMXSB: true + NonTemporal: -1 + NonTemporalA: 4 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 4 + NonTemporalE: 0 + NonTemporalMXSA: 0 + NonTemporalMXSB: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 96 + NumGlobalWriteVectorsPerThread: 96 + NumLdsBlk: 2 + NumLoadsA: 8 + NumLoadsB: 3 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsCoalescedMXSA: 1 + NumLoadsCoalescedMXSB: 1 + NumLoadsMXSA: 2 + NumLoadsMXSB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 3 + NumLoadsPerpendicularMXSA: 2 + NumLoadsPerpendicularMXSB: 1 + NumThreads: 256 + NumTotalPackedLoadsA: -1 + NumTotalPackedLoadsB: -1 + NumTotalPackedLoadsMXSA: -1 + NumTotalPackedLoadsMXSB: -1 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGROverBarrier: 0 + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 37 + SolutionNameMin: 
Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT256x96x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT4_6_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM16_WGMXCC2_WGMXCCGn1 + SourceSwap: false + SpaceFillingAlgo: [] + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 4 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SubGroupMXSA: 16 + SubGroupMXSB: 16 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: 0 + TailloopInNll: false + ThreadTile: [1, 1] + ThreadTile0: 16 + ThreadTile1: 6 + ThreadTileA: 16 + ThreadTileB: 6 + ThreadTileMXSA: 16 + ThreadTileMXSB: 6 + TransposeLDS: 1 + TransposeLDSMetadata: -1 + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMXSA: true + UnrollMajorLDSMXSB: true + Use64bShadowLimit: false + Use64bShadowLimitMX: false + UseCustomMainLoopSchedule: 0 + UseDirect32XEmulation: false + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: false + UseGeneralizedNLCOneA: false + UseGeneralizedNLCOneB: false + UseGeneralizedNLCOneMXSA: false + UseGeneralizedNLCOneMXSB: false + UseGeneralizedNLCOneMetadata: false + UseInstOffsetForGRO: 0 + UseMFMAF32XEmulation: false + UsePLRPack: 0 + UseSgprForGRO: -1 + UseSubtileImpl: true + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 1 
+ VectorWidthMXSA: 1 + VectorWidthMXSB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMXSA: 0 + WaveSeparateGlobalReadMXSB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [64, 4, 1] + WorkGroupMapping: 16 + WorkGroupMappingXCC: 2 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 256 + _DepthUA: 256 + _DepthUB: 256 + _DepthUMXSA: 8 + _DepthUMXSB: 8 + _DepthUMetadata: 256 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + enableLDSTrMXSA: false + enableLDSTrMXSB: false + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + tailLoopOptMXSA: false + tailLoopOptMXSB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AdaptiveGemm: 0 + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1DivByMT1LowbitGT1: 0 + AssertFree1ElementMultiple: 1 + AssertKRingShiftTailWrapOnly: 0 + AssertSummationElementMultiple: 256 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BAddrInterleave: false + BaseName: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT252kruXsNK48kysY3byfZ8nkfbVQgYyySEfltsnGOOr7I= + BufferLoad: true + BufferStore: 1 + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 256 + DirectToLds: 1 + DirectToLdsA: true + DirectToLdsB: true + DirectToLdsMXSA: true + DirectToLdsMXSB: true + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprMXSA: false + DirectToVgprMXSB: false + DirectToVgprSparseMetadata: false + DtlPlusLdsBuf: 
0 + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: false + ExpertSchedulingMode: 0 + ExtraLatencyForLR: 0 + ExtraMiLatencyLeft: -1 + ForceDisableShadowInit: false + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 32 + GlobalReadVectorWidthB: 32 + GlobalReadVectorWidthMXSA: 4 + GlobalReadVectorWidthMXSB: 5 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + KRingShift: false + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT256x160x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT4_10_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1 + LDSTrInst: false + LSCA: 256 + LSCB: 256 + LSCMXSA: 8 + LSCMXSB: 8 + LSPA: 32 + LSPB: 32 + LSPMXSA: 128 + LSPMXSB: 160 + LVCA: 8 + LVCB: 8 + LVCMXSA: 2 + LVCMXSB: 2 + LVPA: 1 + LVPB: 1 + LVPMXSA: 32 + LVPMXSB: 32 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsBlockSizePerPadMXSA: 0 + LdsBlockSizePerPadMXSB: 0 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 122112 + LdsInitCVgprs: false + LdsNumBytes: 122112 + LdsNumElementsAlignedA: 
32768 + LdsNumElementsAlignedB: 20480 + LdsNumElementsAlignedMXSA: 2048 + LdsNumElementsAlignedMXSB: 1280 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 36096 + LdsOffsetB_Blk: 101632 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMXSA: 32768 + LdsOffsetMXSA_Blk: 98304 + LdsOffsetMXSB: 34816 + LdsOffsetMXSB_Blk: 100352 + LdsOffsetMetadata: 36096 + LdsOffsetMetadata_Blk: 101632 + LdsPadA: 0 + LdsPadB: 0 + LdsPadMXSA: 0 + LdsPadMXSB: 0 + LdsPadMetadata: 0 + LocalReadVectorWidth: -1 + LocalReadVectorWidthA: 32 + LocalReadVectorWidthB: 32 + LocalReadVectorWidthMXSA: 1 + LocalReadVectorWidthMXSB: 1 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LocalWriteUseSgprMXSA: true + LocalWriteUseSgprMXSB: true + LoopIters: 2 + LoopUnroll: 256 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMXSA: 1 + MIInputPerThreadMXSB: 1 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [4, 1] + MIWaveTile: [4, 10] + MIWaveTileA: 4 + MIWaveTileB: 10 + MIWaveTileMXSA: 4 + MIWaveTileMXSB: 10 + MIWaveTileMetadata: 0 + MacroTile0: 256 + MacroTile1: 160 + MacroTileA: 256 + MacroTileB: 160 + MacroTileMXSA: 256 + MacroTileMXSB: 160 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + MinGRIncPerMfma: 3 + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonDTLTailLoopMXSA: true + NonDTLTailLoopMXSB: true + NonTemporal: -1 + NonTemporalA: 4 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 4 + NonTemporalE: 0 + 
NonTemporalMXSA: 0 + NonTemporalMXSB: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 160 + NumGlobalWriteVectorsPerThread: 160 + NumLdsBlk: 2 + NumLoadsA: 8 + NumLoadsB: 5 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsCoalescedMXSA: 1 + NumLoadsCoalescedMXSB: 1 + NumLoadsMXSA: 2 + NumLoadsMXSB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 5 + NumLoadsPerpendicularMXSA: 2 + NumLoadsPerpendicularMXSB: 1 + NumThreads: 256 + NumTotalPackedLoadsA: -1 + NumTotalPackedLoadsB: -1 + NumTotalPackedLoadsMXSA: -1 + NumTotalPackedLoadsMXSB: -1 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGROverBarrier: 0 + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 38 + SolutionNameMin: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT256x160x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT4_10_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM16_WGMXCC2_WGMXCCGn1 + SourceSwap: false + SpaceFillingAlgo: [] + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 4 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + 
SubGroupMXSA: 16 + SubGroupMXSB: 16 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: 0 + TailloopInNll: false + ThreadTile: [1, 1] + ThreadTile0: 16 + ThreadTile1: 10 + ThreadTileA: 16 + ThreadTileB: 10 + ThreadTileMXSA: 16 + ThreadTileMXSB: 10 + TransposeLDS: 1 + TransposeLDSMetadata: -1 + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMXSA: true + UnrollMajorLDSMXSB: true + Use64bShadowLimit: false + Use64bShadowLimitMX: false + UseCustomMainLoopSchedule: 0 + UseDirect32XEmulation: false + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: false + UseGeneralizedNLCOneA: false + UseGeneralizedNLCOneB: false + UseGeneralizedNLCOneMXSA: false + UseGeneralizedNLCOneMXSB: false + UseGeneralizedNLCOneMetadata: false + UseInstOffsetForGRO: 0 + UseMFMAF32XEmulation: false + UsePLRPack: 0 + UseSgprForGRO: -1 + UseSubtileImpl: true + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 1 + VectorWidthMXSA: 1 + VectorWidthMXSB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMXSA: 0 + WaveSeparateGlobalReadMXSB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [64, 4, 1] + WorkGroupMapping: 16 + WorkGroupMappingXCC: 2 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 256 + _DepthUA: 256 + _DepthUB: 256 + _DepthUMXSA: 8 + _DepthUMXSB: 8 + _DepthUMetadata: 256 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + enableLDSTrMXSA: false + enableLDSTrMXSB: false + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + tailLoopOptMXSA: false + tailLoopOptMXSB: false + - 1LDSBuffer: 0 + 
ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AdaptiveGemm: 0 + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1DivByMT1LowbitGT1: 0 + AssertFree1ElementMultiple: 1 + AssertKRingShiftTailWrapOnly: 0 + AssertSummationElementMultiple: 256 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BAddrInterleave: false + BaseName: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT25K1mXD-Z64cKGl1n64cEhRIvEdarIIYY6qzM6iXFNutU= + BufferLoad: true + BufferStore: 1 + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 256 + DirectToLds: 1 + DirectToLdsA: true + DirectToLdsB: true + DirectToLdsMXSA: true + DirectToLdsMXSB: true + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprMXSA: false + DirectToVgprMXSB: false + DirectToVgprSparseMetadata: false + DtlPlusLdsBuf: 0 + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: false + ExpertSchedulingMode: 0 + ExtraLatencyForLR: 0 + ExtraMiLatencyLeft: -1 + ForceDisableShadowInit: false + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 32 + GlobalReadVectorWidthB: 32 + GlobalReadVectorWidthMXSA: 4 + GlobalReadVectorWidthMXSB: 9 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + KRingShift: false + Kernel: true + KernelLanguage: Assembly + KernelNameMin: 
Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT256x288x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT4_18_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1 + LDSTrInst: false + LSCA: 256 + LSCB: 256 + LSCMXSA: 8 + LSCMXSB: 8 + LSPA: 32 + LSPB: 32 + LSPMXSA: 128 + LSPMXSB: 288 + LVCA: 8 + LVCB: 8 + LVCMXSA: 2 + LVCMXSB: 1 + LVPA: 1 + LVPB: 1 + LVPMXSA: 32 + LVPMXSB: 32 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsBlockSizePerPadMXSA: 0 + LdsBlockSizePerPadMXSB: 0 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 147968 + LdsInitCVgprs: false + LdsNumBytes: 147968 + LdsNumElementsAlignedA: 32768 + LdsNumElementsAlignedB: 36864 + LdsNumElementsAlignedMXSA: 2048 + LdsNumElementsAlignedMXSB: 2304 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 73984 + LdsOffsetB: 37120 + LdsOffsetB_Blk: 111104 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMXSA: 32768 + LdsOffsetMXSA_Blk: 106752 + LdsOffsetMXSB: 34816 + LdsOffsetMXSB_Blk: 108800 + LdsOffsetMetadata: 37120 + LdsOffsetMetadata_Blk: 111104 + LdsPadA: 0 + LdsPadB: 0 + LdsPadMXSA: 0 + LdsPadMXSB: 0 + LdsPadMetadata: 0 + LocalReadVectorWidth: -1 + LocalReadVectorWidthA: 32 + LocalReadVectorWidthB: 32 + LocalReadVectorWidthMXSA: 1 + LocalReadVectorWidthMXSB: 1 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LocalWriteUseSgprMXSA: true + LocalWriteUseSgprMXSB: true + LoopIters: 2 + LoopUnroll: 256 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + 
MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMXSA: 1 + MIInputPerThreadMXSB: 1 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [4, 1] + MIWaveTile: [4, 18] + MIWaveTileA: 4 + MIWaveTileB: 18 + MIWaveTileMXSA: 4 + MIWaveTileMXSB: 18 + MIWaveTileMetadata: 0 + MacroTile0: 256 + MacroTile1: 288 + MacroTileA: 256 + MacroTileB: 288 + MacroTileMXSA: 256 + MacroTileMXSB: 288 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + MinGRIncPerMfma: 3 + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonDTLTailLoopMXSA: true + NonDTLTailLoopMXSB: true + NonTemporal: -1 + NonTemporalA: 4 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 4 + NonTemporalE: 0 + NonTemporalMXSA: 0 + NonTemporalMXSB: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 288 + NumGlobalWriteVectorsPerThread: 288 + NumLdsBlk: 2 + NumLoadsA: 8 + NumLoadsB: 9 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsCoalescedMXSA: 1 + NumLoadsCoalescedMXSB: 1 + NumLoadsMXSA: 2 + NumLoadsMXSB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 9 + NumLoadsPerpendicularMXSA: 2 + NumLoadsPerpendicularMXSB: 1 + NumThreads: 256 + NumTotalPackedLoadsA: -1 + NumTotalPackedLoadsB: -1 + NumTotalPackedLoadsMXSA: -1 + NumTotalPackedLoadsMXSB: -1 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGROverBarrier: 0 + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 39 + 
SolutionNameMin: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT256x288x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT4_18_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM16_WGMXCC2_WGMXCCGn1 + SourceSwap: false + SpaceFillingAlgo: [] + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: true + StoreSyncOpt: 0 + StoreVectorWidth: 4 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SubGroupMXSA: 16 + SubGroupMXSB: 16 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: 0 + TailloopInNll: false + ThreadTile: [1, 1] + ThreadTile0: 16 + ThreadTile1: 18 + ThreadTileA: 16 + ThreadTileB: 18 + ThreadTileMXSA: 16 + ThreadTileMXSB: 18 + TransposeLDS: 1 + TransposeLDSMetadata: -1 + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMXSA: true + UnrollMajorLDSMXSB: true + Use64bShadowLimit: false + Use64bShadowLimitMX: false + UseCustomMainLoopSchedule: 0 + UseDirect32XEmulation: false + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: false + UseGeneralizedNLCOneA: false + UseGeneralizedNLCOneB: false + UseGeneralizedNLCOneMXSA: false + UseGeneralizedNLCOneMXSB: false + UseGeneralizedNLCOneMetadata: false + UseInstOffsetForGRO: 0 + UseMFMAF32XEmulation: false + UsePLRPack: 0 + UseSgprForGRO: -1 + UseSubtileImpl: true + Valid: true + VectorStore: -1 + 
VectorWidthA: 1 + VectorWidthB: 1 + VectorWidthMXSA: 1 + VectorWidthMXSB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMXSA: 0 + WaveSeparateGlobalReadMXSB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [64, 4, 1] + WorkGroupMapping: 16 + WorkGroupMappingXCC: 2 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 256 + _DepthUA: 256 + _DepthUB: 256 + _DepthUMXSA: 8 + _DepthUMXSB: 8 + _DepthUMetadata: 256 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + enableLDSTrMXSA: false + enableLDSTrMXSB: false + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + tailLoopOptMXSA: false + tailLoopOptMXSB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AdaptiveGemm: 0 + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1DivByMT1LowbitGT1: 0 + AssertFree1ElementMultiple: 1 + AssertKRingShiftTailWrapOnly: 0 + AssertSummationElementMultiple: 256 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BAddrInterleave: false + BaseName: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT382wXw8HnHr0540PBKyCMkiG1GnHC8Qq6QTA2UY90u0Us= + BufferLoad: true + BufferStore: 1 + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 256 + DirectToLds: 1 + DirectToLdsA: true + DirectToLdsB: true + DirectToLdsMXSA: true + DirectToLdsMXSB: true + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprMXSA: false + DirectToVgprMXSB: false + 
DirectToVgprSparseMetadata: false + DtlPlusLdsBuf: 0 + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: false + ExpertSchedulingMode: 0 + ExtraLatencyForLR: 0 + ExtraMiLatencyLeft: -1 + ForceDisableShadowInit: false + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 32 + GlobalReadVectorWidthB: 32 + GlobalReadVectorWidthMXSA: 12 + GlobalReadVectorWidthMXSB: 1 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + KRingShift: false + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT384x32x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT6_2_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1 + LDSTrInst: false + LSCA: 256 + LSCB: 256 + LSCMXSA: 8 + LSCMXSB: 8 + LSPA: 32 + LSPB: 32 + LSPMXSA: 384 + LSPMXSB: 32 + LVCA: 8 + LVCB: 8 + LVCMXSA: 1 + LVCMXSB: 8 + LVPA: 1 + LVPB: 1 + LVPMXSA: 32 + LVPMXSB: 32 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsBlockSizePerPadMXSA: 0 + LdsBlockSizePerPadMXSB: 0 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 122112 + LdsInitCVgprs: false + 
LdsNumBytes: 122112 + LdsNumElementsAlignedA: 49152 + LdsNumElementsAlignedB: 4096 + LdsNumElementsAlignedMXSA: 3072 + LdsNumElementsAlignedMXSB: 256 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 52480 + LdsOffsetB_Blk: 118016 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMXSA: 49152 + LdsOffsetMXSA_Blk: 114688 + LdsOffsetMXSB: 52224 + LdsOffsetMXSB_Blk: 117760 + LdsOffsetMetadata: 52480 + LdsOffsetMetadata_Blk: 118016 + LdsPadA: 0 + LdsPadB: 0 + LdsPadMXSA: 0 + LdsPadMXSB: 0 + LdsPadMetadata: 0 + LocalReadVectorWidth: -1 + LocalReadVectorWidthA: 32 + LocalReadVectorWidthB: 32 + LocalReadVectorWidthMXSA: 1 + LocalReadVectorWidthMXSB: 1 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LocalWriteUseSgprMXSA: true + LocalWriteUseSgprMXSB: true + LoopIters: 2 + LoopUnroll: 256 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMXSA: 1 + MIInputPerThreadMXSB: 1 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [4, 1] + MIWaveTile: [6, 2] + MIWaveTileA: 6 + MIWaveTileB: 2 + MIWaveTileMXSA: 6 + MIWaveTileMXSB: 2 + MIWaveTileMetadata: 0 + MacroTile0: 384 + MacroTile1: 32 + MacroTileA: 384 + MacroTileB: 32 + MacroTileMXSA: 384 + MacroTileMXSB: 32 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + MinGRIncPerMfma: 3 + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonDTLTailLoopMXSA: true + NonDTLTailLoopMXSB: true + NonTemporal: -1 + NonTemporalA: 4 + NonTemporalB: 0 + NonTemporalC: 0 + 
NonTemporalD: 4 + NonTemporalE: 0 + NonTemporalMXSA: 0 + NonTemporalMXSB: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 48 + NumGlobalWriteVectorsPerThread: 48 + NumLdsBlk: 2 + NumLoadsA: 12 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsCoalescedMXSA: 1 + NumLoadsCoalescedMXSB: 1 + NumLoadsMXSA: 1 + NumLoadsMXSB: 1 + NumLoadsPerpendicularA: 12 + NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularMXSA: 1 + NumLoadsPerpendicularMXSB: 1 + NumThreads: 256 + NumTotalPackedLoadsA: -1 + NumTotalPackedLoadsB: -1 + NumTotalPackedLoadsMXSA: -1 + NumTotalPackedLoadsMXSB: -1 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGROverBarrier: 0 + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 40 + SolutionNameMin: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT384x32x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT6_2_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM16_WGMXCC2_WGMXCCGn1 + SourceSwap: false + SpaceFillingAlgo: [] + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 4 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 16 + SubGroup1: 16 + 
SubGroupA: 16 + SubGroupB: 16 + SubGroupMXSA: 16 + SubGroupMXSB: 16 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: 0 + TailloopInNll: false + ThreadTile: [1, 1] + ThreadTile0: 24 + ThreadTile1: 2 + ThreadTileA: 24 + ThreadTileB: 2 + ThreadTileMXSA: 24 + ThreadTileMXSB: 2 + TransposeLDS: 1 + TransposeLDSMetadata: -1 + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMXSA: true + UnrollMajorLDSMXSB: true + Use64bShadowLimit: false + Use64bShadowLimitMX: false + UseCustomMainLoopSchedule: 0 + UseDirect32XEmulation: false + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: false + UseGeneralizedNLCOneA: false + UseGeneralizedNLCOneB: false + UseGeneralizedNLCOneMXSA: false + UseGeneralizedNLCOneMXSB: false + UseGeneralizedNLCOneMetadata: false + UseInstOffsetForGRO: 0 + UseMFMAF32XEmulation: false + UsePLRPack: 0 + UseSgprForGRO: -1 + UseSubtileImpl: true + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 1 + VectorWidthMXSA: 1 + VectorWidthMXSB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMXSA: 0 + WaveSeparateGlobalReadMXSB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [64, 4, 1] + WorkGroupMapping: 16 + WorkGroupMappingXCC: 2 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 256 + _DepthUA: 256 + _DepthUB: 256 + _DepthUMXSA: 8 + _DepthUMXSB: 8 + _DepthUMetadata: 256 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + enableLDSTrMXSA: false + enableLDSTrMXSB: false + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + tailLoopOptMXSA: false + tailLoopOptMXSB: false 
+ - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AdaptiveGemm: 0 + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1DivByMT1LowbitGT1: 0 + AssertFree1ElementMultiple: 1 + AssertKRingShiftTailWrapOnly: 0 + AssertSummationElementMultiple: 256 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BAddrInterleave: false + BaseName: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT38iWtaR8v4gfASTnshP-7Zt37fJ-3bIKngSBlzEMd0qgE= + BufferLoad: true + BufferStore: 1 + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 256 + DirectToLds: 1 + DirectToLdsA: true + DirectToLdsB: true + DirectToLdsMXSA: true + DirectToLdsMXSB: true + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprMXSA: false + DirectToVgprMXSB: false + DirectToVgprSparseMetadata: false + DtlPlusLdsBuf: 0 + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: false + ExpertSchedulingMode: 0 + ExtraLatencyForLR: 0 + ExtraMiLatencyLeft: -1 + ForceDisableShadowInit: false + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 32 + GlobalReadVectorWidthB: 32 + GlobalReadVectorWidthMXSA: 12 + GlobalReadVectorWidthMXSB: 3 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + KRingShift: false + Kernel: true + KernelLanguage: Assembly + KernelNameMin: 
Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT384x96x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT6_6_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1 + LDSTrInst: false + LSCA: 256 + LSCB: 256 + LSCMXSA: 8 + LSCMXSB: 8 + LSPA: 32 + LSPB: 32 + LSPMXSA: 384 + LSPMXSB: 96 + LVCA: 8 + LVCB: 8 + LVCMXSA: 1 + LVCMXSB: 3 + LVPA: 1 + LVPB: 1 + LVPMXSA: 32 + LVPMXSB: 32 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsBlockSizePerPadMXSA: 0 + LdsBlockSizePerPadMXSB: 0 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 130816 + LdsInitCVgprs: false + LdsNumBytes: 130816 + LdsNumElementsAlignedA: 49152 + LdsNumElementsAlignedB: 12288 + LdsNumElementsAlignedMXSA: 3072 + LdsNumElementsAlignedMXSB: 768 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 52992 + LdsOffsetB_Blk: 118528 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMXSA: 49152 + LdsOffsetMXSA_Blk: 114688 + LdsOffsetMXSB: 52224 + LdsOffsetMXSB_Blk: 117760 + LdsOffsetMetadata: 52992 + LdsOffsetMetadata_Blk: 118528 + LdsPadA: 0 + LdsPadB: 0 + LdsPadMXSA: 0 + LdsPadMXSB: 0 + LdsPadMetadata: 0 + LocalReadVectorWidth: -1 + LocalReadVectorWidthA: 32 + LocalReadVectorWidthB: 32 + LocalReadVectorWidthMXSA: 1 + LocalReadVectorWidthMXSB: 1 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LocalWriteUseSgprMXSA: true + LocalWriteUseSgprMXSB: true + LoopIters: 2 + LoopUnroll: 256 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + 
MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMXSA: 1 + MIInputPerThreadMXSB: 1 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [4, 1] + MIWaveTile: [6, 6] + MIWaveTileA: 6 + MIWaveTileB: 6 + MIWaveTileMXSA: 6 + MIWaveTileMXSB: 6 + MIWaveTileMetadata: 0 + MacroTile0: 384 + MacroTile1: 96 + MacroTileA: 384 + MacroTileB: 96 + MacroTileMXSA: 384 + MacroTileMXSB: 96 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + MinGRIncPerMfma: 3 + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonDTLTailLoopMXSA: true + NonDTLTailLoopMXSB: true + NonTemporal: -1 + NonTemporalA: 4 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 4 + NonTemporalE: 0 + NonTemporalMXSA: 0 + NonTemporalMXSB: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 144 + NumGlobalWriteVectorsPerThread: 144 + NumLdsBlk: 2 + NumLoadsA: 12 + NumLoadsB: 3 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsCoalescedMXSA: 1 + NumLoadsCoalescedMXSB: 1 + NumLoadsMXSA: 1 + NumLoadsMXSB: 1 + NumLoadsPerpendicularA: 12 + NumLoadsPerpendicularB: 3 + NumLoadsPerpendicularMXSA: 1 + NumLoadsPerpendicularMXSB: 1 + NumThreads: 256 + NumTotalPackedLoadsA: -1 + NumTotalPackedLoadsB: -1 + NumTotalPackedLoadsMXSA: -1 + NumTotalPackedLoadsMXSB: -1 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGROverBarrier: 0 + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 41 + 
SolutionNameMin: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT384x96x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT6_6_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM16_WGMXCC2_WGMXCCGn1 + SourceSwap: false + SpaceFillingAlgo: [] + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 4 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SubGroupMXSA: 16 + SubGroupMXSB: 16 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: 0 + TailloopInNll: false + ThreadTile: [1, 1] + ThreadTile0: 24 + ThreadTile1: 6 + ThreadTileA: 24 + ThreadTileB: 6 + ThreadTileMXSA: 24 + ThreadTileMXSB: 6 + TransposeLDS: 1 + TransposeLDSMetadata: -1 + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMXSA: true + UnrollMajorLDSMXSB: true + Use64bShadowLimit: false + Use64bShadowLimitMX: false + UseCustomMainLoopSchedule: 0 + UseDirect32XEmulation: false + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: false + UseGeneralizedNLCOneA: false + UseGeneralizedNLCOneB: false + UseGeneralizedNLCOneMXSA: false + UseGeneralizedNLCOneMXSB: false + UseGeneralizedNLCOneMetadata: false + UseInstOffsetForGRO: 0 + UseMFMAF32XEmulation: false + UsePLRPack: 0 + UseSgprForGRO: -1 + UseSubtileImpl: true + Valid: true + VectorStore: -1 + VectorWidthA: 1 
+ VectorWidthB: 1 + VectorWidthMXSA: 1 + VectorWidthMXSB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMXSA: 0 + WaveSeparateGlobalReadMXSB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [64, 4, 1] + WorkGroupMapping: 16 + WorkGroupMappingXCC: 2 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 256 + _DepthUA: 256 + _DepthUB: 256 + _DepthUMXSA: 8 + _DepthUMXSB: 8 + _DepthUMetadata: 256 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + enableLDSTrMXSA: false + enableLDSTrMXSB: false + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + tailLoopOptMXSA: false + tailLoopOptMXSB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AdaptiveGemm: 0 + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1DivByMT1LowbitGT1: 0 + AssertFree1ElementMultiple: 1 + AssertKRingShiftTailWrapOnly: 0 + AssertSummationElementMultiple: 256 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BAddrInterleave: false + BaseName: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT51My4B9dUWEKuVinUMAqyTn-9nVKbOgw4LZNuajpvbgno= + BufferLoad: true + BufferStore: 1 + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 256 + DirectToLds: 1 + DirectToLdsA: true + DirectToLdsB: true + DirectToLdsMXSA: true + DirectToLdsMXSB: true + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprMXSA: false + DirectToVgprMXSB: false + DirectToVgprSparseMetadata: false 
+ DtlPlusLdsBuf: 0 + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: false + ExpertSchedulingMode: 0 + ExtraLatencyForLR: 0 + ExtraMiLatencyLeft: -1 + ForceDisableShadowInit: false + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 32 + GlobalReadVectorWidthB: 32 + GlobalReadVectorWidthMXSA: 16 + GlobalReadVectorWidthMXSB: 1 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + KRingShift: false + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT512x32x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT8_2_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1 + LDSTrInst: false + LSCA: 256 + LSCB: 256 + LSCMXSA: 8 + LSCMXSB: 8 + LSPA: 32 + LSPB: 32 + LSPMXSA: 512 + LSPMXSB: 32 + LVCA: 8 + LVCB: 8 + LVCMXSA: 1 + LVCMXSB: 8 + LVPA: 1 + LVPB: 1 + LVPMXSA: 32 + LVPMXSB: 32 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsBlockSizePerPadMXSA: 0 + LdsBlockSizePerPadMXSB: 0 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 147968 + LdsInitCVgprs: false + LdsNumBytes: 147968 + 
LdsNumElementsAlignedA: 65536 + LdsNumElementsAlignedB: 4096 + LdsNumElementsAlignedMXSA: 4096 + LdsNumElementsAlignedMXSB: 256 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 73984 + LdsOffsetB: 69888 + LdsOffsetB_Blk: 143872 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMXSA: 65536 + LdsOffsetMXSA_Blk: 139520 + LdsOffsetMXSB: 69632 + LdsOffsetMXSB_Blk: 143616 + LdsOffsetMetadata: 69888 + LdsOffsetMetadata_Blk: 143872 + LdsPadA: 0 + LdsPadB: 0 + LdsPadMXSA: 0 + LdsPadMXSB: 0 + LdsPadMetadata: 0 + LocalReadVectorWidth: -1 + LocalReadVectorWidthA: 32 + LocalReadVectorWidthB: 32 + LocalReadVectorWidthMXSA: 1 + LocalReadVectorWidthMXSB: 1 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LocalWriteUseSgprMXSA: true + LocalWriteUseSgprMXSB: true + LoopIters: 2 + LoopUnroll: 256 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMXSA: 1 + MIInputPerThreadMXSB: 1 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [4, 1] + MIWaveTile: [8, 2] + MIWaveTileA: 8 + MIWaveTileB: 2 + MIWaveTileMXSA: 8 + MIWaveTileMXSB: 2 + MIWaveTileMetadata: 0 + MacroTile0: 512 + MacroTile1: 32 + MacroTileA: 512 + MacroTileB: 32 + MacroTileMXSA: 512 + MacroTileMXSB: 32 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + MinGRIncPerMfma: 3 + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonDTLTailLoopMXSA: true + NonDTLTailLoopMXSB: true + NonTemporal: -1 + NonTemporalA: 4 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 4 + NonTemporalE: 0 
+ NonTemporalMXSA: 0 + NonTemporalMXSB: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 64 + NumLdsBlk: 2 + NumLoadsA: 16 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsCoalescedMXSA: 1 + NumLoadsCoalescedMXSB: 1 + NumLoadsMXSA: 1 + NumLoadsMXSB: 1 + NumLoadsPerpendicularA: 16 + NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularMXSA: 1 + NumLoadsPerpendicularMXSB: 1 + NumThreads: 256 + NumTotalPackedLoadsA: -1 + NumTotalPackedLoadsB: -1 + NumTotalPackedLoadsMXSA: -1 + NumTotalPackedLoadsMXSB: -1 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGROverBarrier: 0 + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 42 + SolutionNameMin: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT512x32x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT8_2_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM16_WGMXCC2_WGMXCCGn1 + SourceSwap: false + SpaceFillingAlgo: [] + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: true + StoreSyncOpt: 0 + StoreVectorWidth: 4 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + 
SubGroupMXSA: 16 + SubGroupMXSB: 16 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: 0 + TailloopInNll: false + ThreadTile: [1, 1] + ThreadTile0: 32 + ThreadTile1: 2 + ThreadTileA: 32 + ThreadTileB: 2 + ThreadTileMXSA: 32 + ThreadTileMXSB: 2 + TransposeLDS: 1 + TransposeLDSMetadata: -1 + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMXSA: true + UnrollMajorLDSMXSB: true + Use64bShadowLimit: false + Use64bShadowLimitMX: false + UseCustomMainLoopSchedule: 0 + UseDirect32XEmulation: false + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: false + UseGeneralizedNLCOneA: false + UseGeneralizedNLCOneB: false + UseGeneralizedNLCOneMXSA: false + UseGeneralizedNLCOneMXSB: false + UseGeneralizedNLCOneMetadata: false + UseInstOffsetForGRO: 0 + UseMFMAF32XEmulation: false + UsePLRPack: 0 + UseSgprForGRO: -1 + UseSubtileImpl: true + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 1 + VectorWidthMXSA: 1 + VectorWidthMXSB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMXSA: 0 + WaveSeparateGlobalReadMXSB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [64, 4, 1] + WorkGroupMapping: 16 + WorkGroupMappingXCC: 2 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 256 + _DepthUA: 256 + _DepthUB: 256 + _DepthUMXSA: 8 + _DepthUMXSB: 8 + _DepthUMetadata: 256 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + enableLDSTrMXSA: false + enableLDSTrMXSB: false + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + tailLoopOptMXSA: false + tailLoopOptMXSB: false + - 1LDSBuffer: 0 + 
ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AdaptiveGemm: 0 + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1DivByMT1LowbitGT1: 0 + AssertFree1ElementMultiple: 1 + AssertKRingShiftTailWrapOnly: 0 + AssertSummationElementMultiple: 256 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BAddrInterleave: false + BaseName: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT32dv3nH_ug3zWfzXFJEJHJUvkpjXQjAQVml90QMZvwYvs= + BufferLoad: true + BufferStore: 1 + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 256 + DirectToLds: 1 + DirectToLdsA: true + DirectToLdsB: true + DirectToLdsMXSA: true + DirectToLdsMXSB: true + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprMXSA: false + DirectToVgprMXSB: false + DirectToVgprSparseMetadata: false + DtlPlusLdsBuf: 0 + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: false + ExpertSchedulingMode: 0 + ExtraLatencyForLR: 0 + ExtraMiLatencyLeft: -1 + ForceDisableShadowInit: false + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 32 + GlobalReadVectorWidthB: 32 + GlobalReadVectorWidthMXSA: 1 + GlobalReadVectorWidthMXSB: 4 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + KRingShift: false + Kernel: true + KernelLanguage: Assembly + KernelNameMin: 
Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT32x128x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT2_2_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_16_1 + LDSTrInst: false + LSCA: 256 + LSCB: 256 + LSCMXSA: 8 + LSCMXSB: 8 + LSPA: 32 + LSPB: 32 + LSPMXSA: 32 + LSPMXSB: 128 + LVCA: 8 + LVCB: 8 + LVCMXSA: 8 + LVCMXSB: 2 + LVPA: 1 + LVPB: 1 + LVPMXSA: 32 + LVPMXSB: 32 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsBlockSizePerPadMXSA: 0 + LdsBlockSizePerPadMXSB: 0 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 54528 + LdsInitCVgprs: false + LdsNumBytes: 54528 + LdsNumElementsAlignedA: 4096 + LdsNumElementsAlignedB: 16384 + LdsNumElementsAlignedMXSA: 256 + LdsNumElementsAlignedMXSB: 1024 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 32768 + LdsOffsetB: 5376 + LdsOffsetB_Blk: 38144 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMXSA: 4096 + LdsOffsetMXSA_Blk: 36864 + LdsOffsetMXSB: 4352 + LdsOffsetMXSB_Blk: 37120 + LdsOffsetMetadata: 5376 + LdsOffsetMetadata_Blk: 38144 + LdsPadA: 0 + LdsPadB: 0 + LdsPadMXSA: 0 + LdsPadMXSB: 0 + LdsPadMetadata: 0 + LocalReadVectorWidth: -1 + LocalReadVectorWidthA: 32 + LocalReadVectorWidthB: 32 + LocalReadVectorWidthMXSA: 1 + LocalReadVectorWidthMXSB: 1 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LocalWriteUseSgprMXSA: true + LocalWriteUseSgprMXSB: true + LoopIters: 2 + LoopUnroll: 256 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 
32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMXSA: 1 + MIInputPerThreadMXSB: 1 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [1, 4] + MIWaveTile: [2, 2] + MIWaveTileA: 2 + MIWaveTileB: 2 + MIWaveTileMXSA: 2 + MIWaveTileMXSB: 2 + MIWaveTileMetadata: 0 + MacroTile0: 32 + MacroTile1: 128 + MacroTileA: 32 + MacroTileB: 128 + MacroTileMXSA: 32 + MacroTileMXSB: 128 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + MinGRIncPerMfma: 3 + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonDTLTailLoopMXSA: true + NonDTLTailLoopMXSB: true + NonTemporal: -1 + NonTemporalA: 4 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 4 + NonTemporalE: 0 + NonTemporalMXSA: 0 + NonTemporalMXSB: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 16 + NumLdsBlk: 2 + NumLoadsA: 1 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsCoalescedMXSA: 1 + NumLoadsCoalescedMXSB: 1 + NumLoadsMXSA: 1 + NumLoadsMXSB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 4 + NumLoadsPerpendicularMXSA: 1 + NumLoadsPerpendicularMXSB: 1 + NumThreads: 256 + NumTotalPackedLoadsA: -1 + NumTotalPackedLoadsB: -1 + NumTotalPackedLoadsMXSA: -1 + NumTotalPackedLoadsMXSB: -1 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGROverBarrier: 0 + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 43 + SolutionNameMin: 
Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT32x128x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT2_2_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM16_WGMXCC2_WGMXCCGn1 + SourceSwap: false + SpaceFillingAlgo: [] + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 4 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 4 + SubGroup1: 64 + SubGroupA: 4 + SubGroupB: 64 + SubGroupMXSA: 4 + SubGroupMXSB: 64 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: 0 + TailloopInNll: false + ThreadTile: [1, 1] + ThreadTile0: 8 + ThreadTile1: 2 + ThreadTileA: 8 + ThreadTileB: 2 + ThreadTileMXSA: 8 + ThreadTileMXSB: 2 + TransposeLDS: 1 + TransposeLDSMetadata: -1 + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMXSA: true + UnrollMajorLDSMXSB: true + Use64bShadowLimit: false + Use64bShadowLimitMX: false + UseCustomMainLoopSchedule: 0 + UseDirect32XEmulation: false + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: false + UseGeneralizedNLCOneA: false + UseGeneralizedNLCOneB: false + UseGeneralizedNLCOneMXSA: false + UseGeneralizedNLCOneMXSB: false + UseGeneralizedNLCOneMetadata: false + UseInstOffsetForGRO: 0 + UseMFMAF32XEmulation: false + UsePLRPack: 0 + UseSgprForGRO: -1 + UseSubtileImpl: true + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 1 + 
VectorWidthMXSA: 1 + VectorWidthMXSB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMXSA: 0 + WaveSeparateGlobalReadMXSB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 16 + WorkGroupMappingXCC: 2 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 256 + _DepthUA: 256 + _DepthUB: 256 + _DepthUMXSA: 8 + _DepthUMXSB: 8 + _DepthUMetadata: 256 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + enableLDSTrMXSA: false + enableLDSTrMXSB: false + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + tailLoopOptMXSA: false + tailLoopOptMXSB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AdaptiveGemm: 0 + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1DivByMT1LowbitGT1: 0 + AssertFree1ElementMultiple: 1 + AssertKRingShiftTailWrapOnly: 0 + AssertSummationElementMultiple: 256 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BAddrInterleave: false + BaseName: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT96VdJ139l5ExTu5vrbSyr9YIDnqGsFY6rZMfGBgYGwyok= + BufferLoad: true + BufferStore: 1 + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 256 + DirectToLds: 1 + DirectToLdsA: true + DirectToLdsB: true + DirectToLdsMXSA: true + DirectToLdsMXSB: true + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprMXSA: false + DirectToVgprMXSB: false + DirectToVgprSparseMetadata: false + DtlPlusLdsBuf: 0 
+ EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: false + ExpertSchedulingMode: 0 + ExtraLatencyForLR: 0 + ExtraMiLatencyLeft: -1 + ForceDisableShadowInit: false + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 32 + GlobalReadVectorWidthB: 32 + GlobalReadVectorWidthMXSA: 3 + GlobalReadVectorWidthMXSB: 4 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + KRingShift: false + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT96x128x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT6_2_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_16_1 + LDSTrInst: false + LSCA: 256 + LSCB: 256 + LSCMXSA: 8 + LSCMXSB: 8 + LSPA: 32 + LSPB: 32 + LSPMXSA: 96 + LSPMXSB: 128 + LVCA: 8 + LVCB: 8 + LVCMXSA: 3 + LVCMXSB: 2 + LVPA: 1 + LVPB: 1 + LVPMXSA: 32 + LVPMXSB: 32 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsBlockSizePerPadMXSA: 0 + LdsBlockSizePerPadMXSB: 0 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 63232 + LdsInitCVgprs: false + LdsNumBytes: 63232 + LdsNumElementsAlignedA: 12288 + 
LdsNumElementsAlignedB: 16384 + LdsNumElementsAlignedMXSA: 768 + LdsNumElementsAlignedMXSB: 1024 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 32768 + LdsOffsetB: 14080 + LdsOffsetB_Blk: 46848 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMXSA: 12288 + LdsOffsetMXSA_Blk: 45056 + LdsOffsetMXSB: 13056 + LdsOffsetMXSB_Blk: 45824 + LdsOffsetMetadata: 14080 + LdsOffsetMetadata_Blk: 46848 + LdsPadA: 0 + LdsPadB: 0 + LdsPadMXSA: 0 + LdsPadMXSB: 0 + LdsPadMetadata: 0 + LocalReadVectorWidth: -1 + LocalReadVectorWidthA: 32 + LocalReadVectorWidthB: 32 + LocalReadVectorWidthMXSA: 1 + LocalReadVectorWidthMXSB: 1 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LocalWriteUseSgprMXSA: true + LocalWriteUseSgprMXSB: true + LoopIters: 2 + LoopUnroll: 256 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMXSA: 1 + MIInputPerThreadMXSB: 1 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [1, 4] + MIWaveTile: [6, 2] + MIWaveTileA: 6 + MIWaveTileB: 2 + MIWaveTileMXSA: 6 + MIWaveTileMXSB: 2 + MIWaveTileMetadata: 0 + MacroTile0: 96 + MacroTile1: 128 + MacroTileA: 96 + MacroTileB: 128 + MacroTileMXSA: 96 + MacroTileMXSB: 128 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + MinGRIncPerMfma: 3 + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonDTLTailLoopMXSA: true + NonDTLTailLoopMXSB: true + NonTemporal: -1 + NonTemporalA: 4 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 4 + NonTemporalE: 0 + NonTemporalMXSA: 0 + 
NonTemporalMXSB: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 48 + NumGlobalWriteVectorsPerThread: 48 + NumLdsBlk: 2 + NumLoadsA: 3 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsCoalescedMXSA: 1 + NumLoadsCoalescedMXSB: 1 + NumLoadsMXSA: 1 + NumLoadsMXSB: 1 + NumLoadsPerpendicularA: 3 + NumLoadsPerpendicularB: 4 + NumLoadsPerpendicularMXSA: 1 + NumLoadsPerpendicularMXSB: 1 + NumThreads: 256 + NumTotalPackedLoadsA: -1 + NumTotalPackedLoadsB: -1 + NumTotalPackedLoadsMXSA: -1 + NumTotalPackedLoadsMXSB: -1 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGROverBarrier: 0 + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 44 + SolutionNameMin: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT96x128x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT6_2_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM16_WGMXCC2_WGMXCCGn1 + SourceSwap: false + SpaceFillingAlgo: [] + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 4 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 4 + SubGroup1: 64 + SubGroupA: 4 + SubGroupB: 64 + SubGroupMXSA: 4 + SubGroupMXSB: 64 + 
SuppressNoLoadLoop: false + SwapGlobalReadOrder: 0 + TailloopInNll: false + ThreadTile: [1, 1] + ThreadTile0: 24 + ThreadTile1: 2 + ThreadTileA: 24 + ThreadTileB: 2 + ThreadTileMXSA: 24 + ThreadTileMXSB: 2 + TransposeLDS: 1 + TransposeLDSMetadata: -1 + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMXSA: true + UnrollMajorLDSMXSB: true + Use64bShadowLimit: false + Use64bShadowLimitMX: false + UseCustomMainLoopSchedule: 0 + UseDirect32XEmulation: false + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: false + UseGeneralizedNLCOneA: false + UseGeneralizedNLCOneB: false + UseGeneralizedNLCOneMXSA: false + UseGeneralizedNLCOneMXSB: false + UseGeneralizedNLCOneMetadata: false + UseInstOffsetForGRO: 0 + UseMFMAF32XEmulation: false + UsePLRPack: 0 + UseSgprForGRO: -1 + UseSubtileImpl: true + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 1 + VectorWidthMXSA: 1 + VectorWidthMXSB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMXSA: 0 + WaveSeparateGlobalReadMXSB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 16 + WorkGroupMappingXCC: 2 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 256 + _DepthUA: 256 + _DepthUB: 256 + _DepthUMXSA: 8 + _DepthUMXSB: 8 + _DepthUMetadata: 256 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + enableLDSTrMXSA: false + enableLDSTrMXSB: false + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + tailLoopOptMXSA: false + tailLoopOptMXSB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: false 
+ ActivationFused: true + AdaptiveGemm: 0 + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1DivByMT1LowbitGT1: 0 + AssertFree1ElementMultiple: 1 + AssertKRingShiftTailWrapOnly: 0 + AssertSummationElementMultiple: 256 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BAddrInterleave: false + BaseName: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT32pDTSO9Ei8iprYUZVwZ8MVK8XTu8S8ugrlkQJj7C1FJQ= + BufferLoad: true + BufferStore: 1 + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 256 + DirectToLds: 1 + DirectToLdsA: true + DirectToLdsB: true + DirectToLdsMXSA: true + DirectToLdsMXSB: true + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprMXSA: false + DirectToVgprMXSB: false + DirectToVgprSparseMetadata: false + DtlPlusLdsBuf: 0 + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: false + ExpertSchedulingMode: 0 + ExtraLatencyForLR: 0 + ExtraMiLatencyLeft: -1 + ForceDisableShadowInit: false + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 32 + GlobalReadVectorWidthB: 32 + GlobalReadVectorWidthMXSA: 1 + GlobalReadVectorWidthMXSB: 4 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + KRingShift: false + Kernel: true + KernelLanguage: Assembly + KernelNameMin: 
Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT32x256x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT2_4_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_16_1 + LDSTrInst: false + LSCA: 256 + LSCB: 256 + LSCMXSA: 8 + LSCMXSB: 8 + LSPA: 32 + LSPB: 32 + LSPMXSA: 32 + LSPMXSB: 128 + LVCA: 8 + LVCB: 8 + LVCMXSA: 8 + LVCMXSB: 2 + LVPA: 1 + LVPB: 1 + LVPMXSA: 32 + LVPMXSB: 32 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsBlockSizePerPadMXSA: 0 + LdsBlockSizePerPadMXSB: 0 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 104704 + LdsInitCVgprs: false + LdsNumBytes: 104704 + LdsNumElementsAlignedA: 4096 + LdsNumElementsAlignedB: 32768 + LdsNumElementsAlignedMXSA: 256 + LdsNumElementsAlignedMXSB: 2048 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 6400 + LdsOffsetB_Blk: 71936 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMXSA: 4096 + LdsOffsetMXSA_Blk: 69632 + LdsOffsetMXSB: 4352 + LdsOffsetMXSB_Blk: 69888 + LdsOffsetMetadata: 6400 + LdsOffsetMetadata_Blk: 71936 + LdsPadA: 0 + LdsPadB: 0 + LdsPadMXSA: 0 + LdsPadMXSB: 0 + LdsPadMetadata: 0 + LocalReadVectorWidth: -1 + LocalReadVectorWidthA: 32 + LocalReadVectorWidthB: 32 + LocalReadVectorWidthMXSA: 1 + LocalReadVectorWidthMXSB: 1 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LocalWriteUseSgprMXSA: true + LocalWriteUseSgprMXSB: true + LoopIters: 2 + LoopUnroll: 256 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + 
MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMXSA: 1 + MIInputPerThreadMXSB: 1 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [1, 4] + MIWaveTile: [2, 4] + MIWaveTileA: 2 + MIWaveTileB: 4 + MIWaveTileMXSA: 2 + MIWaveTileMXSB: 4 + MIWaveTileMetadata: 0 + MacroTile0: 32 + MacroTile1: 256 + MacroTileA: 32 + MacroTileB: 256 + MacroTileMXSA: 32 + MacroTileMXSB: 256 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + MinGRIncPerMfma: 3 + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonDTLTailLoopMXSA: true + NonDTLTailLoopMXSB: true + NonTemporal: -1 + NonTemporalA: 4 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 4 + NonTemporalE: 0 + NonTemporalMXSA: 0 + NonTemporalMXSB: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 32 + NumLdsBlk: 2 + NumLoadsA: 1 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsCoalescedMXSA: 1 + NumLoadsCoalescedMXSB: 1 + NumLoadsMXSA: 1 + NumLoadsMXSB: 2 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 8 + NumLoadsPerpendicularMXSA: 1 + NumLoadsPerpendicularMXSB: 2 + NumThreads: 256 + NumTotalPackedLoadsA: -1 + NumTotalPackedLoadsB: -1 + NumTotalPackedLoadsMXSA: -1 + NumTotalPackedLoadsMXSB: -1 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGROverBarrier: 0 + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 45 + SolutionNameMin: 
Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT32x256x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT2_4_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM16_WGMXCC2_WGMXCCGn1 + SourceSwap: false + SpaceFillingAlgo: [] + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 4 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 4 + SubGroup1: 64 + SubGroupA: 4 + SubGroupB: 64 + SubGroupMXSA: 4 + SubGroupMXSB: 64 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: 0 + TailloopInNll: false + ThreadTile: [1, 1] + ThreadTile0: 8 + ThreadTile1: 4 + ThreadTileA: 8 + ThreadTileB: 4 + ThreadTileMXSA: 8 + ThreadTileMXSB: 4 + TransposeLDS: 1 + TransposeLDSMetadata: -1 + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMXSA: true + UnrollMajorLDSMXSB: true + Use64bShadowLimit: false + Use64bShadowLimitMX: false + UseCustomMainLoopSchedule: 0 + UseDirect32XEmulation: false + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: false + UseGeneralizedNLCOneA: false + UseGeneralizedNLCOneB: false + UseGeneralizedNLCOneMXSA: false + UseGeneralizedNLCOneMXSB: false + UseGeneralizedNLCOneMetadata: false + UseInstOffsetForGRO: 0 + UseMFMAF32XEmulation: false + UsePLRPack: 0 + UseSgprForGRO: -1 + UseSubtileImpl: true + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 1 + 
VectorWidthMXSA: 1 + VectorWidthMXSB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMXSA: 0 + WaveSeparateGlobalReadMXSB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 16 + WorkGroupMappingXCC: 2 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 256 + _DepthUA: 256 + _DepthUB: 256 + _DepthUMXSA: 8 + _DepthUMXSB: 8 + _DepthUMetadata: 256 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + enableLDSTrMXSA: false + enableLDSTrMXSB: false + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + tailLoopOptMXSA: false + tailLoopOptMXSB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AdaptiveGemm: 0 + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1DivByMT1LowbitGT1: 0 + AssertFree1ElementMultiple: 1 + AssertKRingShiftTailWrapOnly: 0 + AssertSummationElementMultiple: 256 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BAddrInterleave: false + BaseName: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT96Cj2Z80w1PkWDv8Q4NtOmErGoRYnlYoT1xjNw40Qn6y8= + BufferLoad: true + BufferStore: 1 + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 256 + DirectToLds: 1 + DirectToLdsA: true + DirectToLdsB: true + DirectToLdsMXSA: true + DirectToLdsMXSB: true + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprMXSA: false + DirectToVgprMXSB: false + DirectToVgprSparseMetadata: false + DtlPlusLdsBuf: 0 
+ EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: false + ExpertSchedulingMode: 0 + ExtraLatencyForLR: 0 + ExtraMiLatencyLeft: -1 + ForceDisableShadowInit: false + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 32 + GlobalReadVectorWidthB: 32 + GlobalReadVectorWidthMXSA: 3 + GlobalReadVectorWidthMXSB: 4 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + KRingShift: false + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT96x256x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT6_4_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_16_1 + LDSTrInst: false + LSCA: 256 + LSCB: 256 + LSCMXSA: 8 + LSCMXSB: 8 + LSPA: 32 + LSPB: 32 + LSPMXSA: 96 + LSPMXSB: 128 + LVCA: 8 + LVCB: 8 + LVCMXSA: 3 + LVCMXSB: 2 + LVPA: 1 + LVPB: 1 + LVPMXSA: 32 + LVPMXSB: 32 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsBlockSizePerPadMXSA: 0 + LdsBlockSizePerPadMXSB: 0 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 113408 + LdsInitCVgprs: false + LdsNumBytes: 113408 + LdsNumElementsAlignedA: 12288 + 
LdsNumElementsAlignedB: 32768 + LdsNumElementsAlignedMXSA: 768 + LdsNumElementsAlignedMXSB: 2048 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 15104 + LdsOffsetB_Blk: 80640 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMXSA: 12288 + LdsOffsetMXSA_Blk: 77824 + LdsOffsetMXSB: 13056 + LdsOffsetMXSB_Blk: 78592 + LdsOffsetMetadata: 15104 + LdsOffsetMetadata_Blk: 80640 + LdsPadA: 0 + LdsPadB: 0 + LdsPadMXSA: 0 + LdsPadMXSB: 0 + LdsPadMetadata: 0 + LocalReadVectorWidth: -1 + LocalReadVectorWidthA: 32 + LocalReadVectorWidthB: 32 + LocalReadVectorWidthMXSA: 1 + LocalReadVectorWidthMXSB: 1 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LocalWriteUseSgprMXSA: true + LocalWriteUseSgprMXSB: true + LoopIters: 2 + LoopUnroll: 256 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMXSA: 1 + MIInputPerThreadMXSB: 1 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [1, 4] + MIWaveTile: [6, 4] + MIWaveTileA: 6 + MIWaveTileB: 4 + MIWaveTileMXSA: 6 + MIWaveTileMXSB: 4 + MIWaveTileMetadata: 0 + MacroTile0: 96 + MacroTile1: 256 + MacroTileA: 96 + MacroTileB: 256 + MacroTileMXSA: 96 + MacroTileMXSB: 256 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + MinGRIncPerMfma: 3 + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonDTLTailLoopMXSA: true + NonDTLTailLoopMXSB: true + NonTemporal: -1 + NonTemporalA: 4 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 4 + NonTemporalE: 0 + NonTemporalMXSA: 0 + 
NonTemporalMXSB: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 96 + NumGlobalWriteVectorsPerThread: 96 + NumLdsBlk: 2 + NumLoadsA: 3 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsCoalescedMXSA: 1 + NumLoadsCoalescedMXSB: 1 + NumLoadsMXSA: 1 + NumLoadsMXSB: 2 + NumLoadsPerpendicularA: 3 + NumLoadsPerpendicularB: 8 + NumLoadsPerpendicularMXSA: 1 + NumLoadsPerpendicularMXSB: 2 + NumThreads: 256 + NumTotalPackedLoadsA: -1 + NumTotalPackedLoadsB: -1 + NumTotalPackedLoadsMXSA: -1 + NumTotalPackedLoadsMXSB: -1 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGROverBarrier: 0 + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 46 + SolutionNameMin: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT96x256x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT6_4_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM16_WGMXCC2_WGMXCCGn1 + SourceSwap: false + SpaceFillingAlgo: [] + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 4 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 4 + SubGroup1: 64 + SubGroupA: 4 + SubGroupB: 64 + SubGroupMXSA: 4 + SubGroupMXSB: 64 + 
SuppressNoLoadLoop: false + SwapGlobalReadOrder: 0 + TailloopInNll: false + ThreadTile: [1, 1] + ThreadTile0: 24 + ThreadTile1: 4 + ThreadTileA: 24 + ThreadTileB: 4 + ThreadTileMXSA: 24 + ThreadTileMXSB: 4 + TransposeLDS: 1 + TransposeLDSMetadata: -1 + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMXSA: true + UnrollMajorLDSMXSB: true + Use64bShadowLimit: false + Use64bShadowLimitMX: false + UseCustomMainLoopSchedule: 0 + UseDirect32XEmulation: false + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: false + UseGeneralizedNLCOneA: false + UseGeneralizedNLCOneB: false + UseGeneralizedNLCOneMXSA: false + UseGeneralizedNLCOneMXSB: false + UseGeneralizedNLCOneMetadata: false + UseInstOffsetForGRO: 0 + UseMFMAF32XEmulation: false + UsePLRPack: 0 + UseSgprForGRO: -1 + UseSubtileImpl: true + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 1 + VectorWidthMXSA: 1 + VectorWidthMXSB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMXSA: 0 + WaveSeparateGlobalReadMXSB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 16 + WorkGroupMappingXCC: 2 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 256 + _DepthUA: 256 + _DepthUB: 256 + _DepthUMXSA: 8 + _DepthUMXSB: 8 + _DepthUMetadata: 256 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + enableLDSTrMXSA: false + enableLDSTrMXSB: false + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + tailLoopOptMXSA: false + tailLoopOptMXSB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: false 
+ ActivationFused: true + AdaptiveGemm: 0 + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1DivByMT1LowbitGT1: 0 + AssertFree1ElementMultiple: 1 + AssertKRingShiftTailWrapOnly: 0 + AssertSummationElementMultiple: 256 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BAddrInterleave: false + BaseName: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT16rkdIx0xJ9AdmxSLTSuPL_HwvXURN3d9JBsT0i2N2r4A= + BufferLoad: true + BufferStore: 1 + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 256 + DirectToLds: 1 + DirectToLdsA: true + DirectToLdsB: true + DirectToLdsMXSA: true + DirectToLdsMXSB: true + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprMXSA: false + DirectToVgprMXSB: false + DirectToVgprSparseMetadata: false + DtlPlusLdsBuf: 0 + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: false + ExpertSchedulingMode: 0 + ExtraLatencyForLR: 0 + ExtraMiLatencyLeft: -1 + ForceDisableShadowInit: false + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 32 + GlobalReadVectorWidthB: 32 + GlobalReadVectorWidthMXSA: 5 + GlobalReadVectorWidthMXSB: 4 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + KRingShift: false + Kernel: true + KernelLanguage: Assembly + KernelNameMin: 
Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT160x256x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT10_4_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_16_1 + LDSTrInst: false + LSCA: 256 + LSCB: 256 + LSCMXSA: 8 + LSCMXSB: 8 + LSPA: 32 + LSPB: 32 + LSPMXSA: 160 + LSPMXSB: 128 + LVCA: 8 + LVCB: 8 + LVCMXSA: 2 + LVCMXSB: 2 + LVPA: 1 + LVPB: 1 + LVPMXSA: 32 + LVPMXSB: 32 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsBlockSizePerPadMXSA: 0 + LdsBlockSizePerPadMXSB: 0 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 122112 + LdsInitCVgprs: false + LdsNumBytes: 122112 + LdsNumElementsAlignedA: 20480 + LdsNumElementsAlignedB: 32768 + LdsNumElementsAlignedMXSA: 1280 + LdsNumElementsAlignedMXSB: 2048 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 23808 + LdsOffsetB_Blk: 89344 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMXSA: 20480 + LdsOffsetMXSA_Blk: 86016 + LdsOffsetMXSB: 21760 + LdsOffsetMXSB_Blk: 87296 + LdsOffsetMetadata: 23808 + LdsOffsetMetadata_Blk: 89344 + LdsPadA: 0 + LdsPadB: 0 + LdsPadMXSA: 0 + LdsPadMXSB: 0 + LdsPadMetadata: 0 + LocalReadVectorWidth: -1 + LocalReadVectorWidthA: 32 + LocalReadVectorWidthB: 32 + LocalReadVectorWidthMXSA: 1 + LocalReadVectorWidthMXSB: 1 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LocalWriteUseSgprMXSA: true + LocalWriteUseSgprMXSB: true + LoopIters: 2 + LoopUnroll: 256 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + 
MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMXSA: 1 + MIInputPerThreadMXSB: 1 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [1, 4] + MIWaveTile: [10, 4] + MIWaveTileA: 10 + MIWaveTileB: 4 + MIWaveTileMXSA: 10 + MIWaveTileMXSB: 4 + MIWaveTileMetadata: 0 + MacroTile0: 160 + MacroTile1: 256 + MacroTileA: 160 + MacroTileB: 256 + MacroTileMXSA: 160 + MacroTileMXSB: 256 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + MinGRIncPerMfma: 3 + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonDTLTailLoopMXSA: true + NonDTLTailLoopMXSB: true + NonTemporal: -1 + NonTemporalA: 4 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 4 + NonTemporalE: 0 + NonTemporalMXSA: 0 + NonTemporalMXSB: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 160 + NumGlobalWriteVectorsPerThread: 160 + NumLdsBlk: 2 + NumLoadsA: 5 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsCoalescedMXSA: 1 + NumLoadsCoalescedMXSB: 1 + NumLoadsMXSA: 1 + NumLoadsMXSB: 2 + NumLoadsPerpendicularA: 5 + NumLoadsPerpendicularB: 8 + NumLoadsPerpendicularMXSA: 1 + NumLoadsPerpendicularMXSB: 2 + NumThreads: 256 + NumTotalPackedLoadsA: -1 + NumTotalPackedLoadsB: -1 + NumTotalPackedLoadsMXSA: -1 + NumTotalPackedLoadsMXSB: -1 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGROverBarrier: 0 + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 47 + 
SolutionNameMin: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT160x256x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT10_4_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM16_WGMXCC2_WGMXCCGn1 + SourceSwap: false + SpaceFillingAlgo: [] + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 4 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 4 + SubGroup1: 64 + SubGroupA: 4 + SubGroupB: 64 + SubGroupMXSA: 4 + SubGroupMXSB: 64 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: 0 + TailloopInNll: false + ThreadTile: [1, 1] + ThreadTile0: 40 + ThreadTile1: 4 + ThreadTileA: 40 + ThreadTileB: 4 + ThreadTileMXSA: 40 + ThreadTileMXSB: 4 + TransposeLDS: 1 + TransposeLDSMetadata: -1 + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMXSA: true + UnrollMajorLDSMXSB: true + Use64bShadowLimit: false + Use64bShadowLimitMX: false + UseCustomMainLoopSchedule: 0 + UseDirect32XEmulation: false + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: false + UseGeneralizedNLCOneA: false + UseGeneralizedNLCOneB: false + UseGeneralizedNLCOneMXSA: false + UseGeneralizedNLCOneMXSB: false + UseGeneralizedNLCOneMetadata: false + UseInstOffsetForGRO: 0 + UseMFMAF32XEmulation: false + UsePLRPack: 0 + UseSgprForGRO: -1 + UseSubtileImpl: true + Valid: true + VectorStore: -1 + VectorWidthA: 1 
+ VectorWidthB: 1 + VectorWidthMXSA: 1 + VectorWidthMXSB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMXSA: 0 + WaveSeparateGlobalReadMXSB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 16 + WorkGroupMappingXCC: 2 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 256 + _DepthUA: 256 + _DepthUB: 256 + _DepthUMXSA: 8 + _DepthUMXSB: 8 + _DepthUMetadata: 256 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + enableLDSTrMXSA: false + enableLDSTrMXSB: false + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + tailLoopOptMXSA: false + tailLoopOptMXSB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AdaptiveGemm: 0 + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1DivByMT1LowbitGT1: 0 + AssertFree1ElementMultiple: 1 + AssertKRingShiftTailWrapOnly: 0 + AssertSummationElementMultiple: 256 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BAddrInterleave: false + BaseName: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT28HX9AX04H5gx2Hnt9CFkwE4Io5BNr8Him7sxPkQ6FxSI= + BufferLoad: true + BufferStore: 1 + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 256 + DirectToLds: 1 + DirectToLdsA: true + DirectToLdsB: true + DirectToLdsMXSA: true + DirectToLdsMXSB: true + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprMXSA: false + DirectToVgprMXSB: false + DirectToVgprSparseMetadata: 
false + DtlPlusLdsBuf: 0 + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: false + ExpertSchedulingMode: 0 + ExtraLatencyForLR: 0 + ExtraMiLatencyLeft: -1 + ForceDisableShadowInit: false + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 32 + GlobalReadVectorWidthB: 32 + GlobalReadVectorWidthMXSA: 9 + GlobalReadVectorWidthMXSB: 4 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + KRingShift: false + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT288x256x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT18_4_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_16_1 + LDSTrInst: false + LSCA: 256 + LSCB: 256 + LSCMXSA: 8 + LSCMXSB: 8 + LSPA: 32 + LSPB: 32 + LSPMXSA: 288 + LSPMXSB: 128 + LVCA: 8 + LVCB: 8 + LVCMXSA: 1 + LVCMXSB: 2 + LVPA: 1 + LVPB: 1 + LVPMXSA: 32 + LVPMXSB: 32 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsBlockSizePerPadMXSA: 0 + LdsBlockSizePerPadMXSB: 0 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 147968 + LdsInitCVgprs: false + LdsNumBytes: 147968 + 
LdsNumElementsAlignedA: 36864 + LdsNumElementsAlignedB: 32768 + LdsNumElementsAlignedMXSA: 2304 + LdsNumElementsAlignedMXSB: 2048 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 73984 + LdsOffsetB: 41216 + LdsOffsetB_Blk: 115200 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMXSA: 36864 + LdsOffsetMXSA_Blk: 110848 + LdsOffsetMXSB: 39168 + LdsOffsetMXSB_Blk: 113152 + LdsOffsetMetadata: 41216 + LdsOffsetMetadata_Blk: 115200 + LdsPadA: 0 + LdsPadB: 0 + LdsPadMXSA: 0 + LdsPadMXSB: 0 + LdsPadMetadata: 0 + LocalReadVectorWidth: -1 + LocalReadVectorWidthA: 32 + LocalReadVectorWidthB: 32 + LocalReadVectorWidthMXSA: 1 + LocalReadVectorWidthMXSB: 1 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LocalWriteUseSgprMXSA: true + LocalWriteUseSgprMXSB: true + LoopIters: 2 + LoopUnroll: 256 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMXSA: 1 + MIInputPerThreadMXSB: 1 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [1, 4] + MIWaveTile: [18, 4] + MIWaveTileA: 18 + MIWaveTileB: 4 + MIWaveTileMXSA: 18 + MIWaveTileMXSB: 4 + MIWaveTileMetadata: 0 + MacroTile0: 288 + MacroTile1: 256 + MacroTileA: 288 + MacroTileB: 256 + MacroTileMXSA: 288 + MacroTileMXSB: 256 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + MinGRIncPerMfma: 3 + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonDTLTailLoopMXSA: true + NonDTLTailLoopMXSB: true + NonTemporal: -1 + NonTemporalA: 4 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 4 + 
NonTemporalE: 0 + NonTemporalMXSA: 0 + NonTemporalMXSB: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 288 + NumGlobalWriteVectorsPerThread: 288 + NumLdsBlk: 2 + NumLoadsA: 9 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsCoalescedMXSA: 1 + NumLoadsCoalescedMXSB: 1 + NumLoadsMXSA: 1 + NumLoadsMXSB: 2 + NumLoadsPerpendicularA: 9 + NumLoadsPerpendicularB: 8 + NumLoadsPerpendicularMXSA: 1 + NumLoadsPerpendicularMXSB: 2 + NumThreads: 256 + NumTotalPackedLoadsA: -1 + NumTotalPackedLoadsB: -1 + NumTotalPackedLoadsMXSA: -1 + NumTotalPackedLoadsMXSB: -1 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGROverBarrier: 0 + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 48 + SolutionNameMin: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT288x256x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT18_4_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM16_WGMXCC2_WGMXCCGn1 + SourceSwap: false + SpaceFillingAlgo: [] + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: true + StoreSyncOpt: 0 + StoreVectorWidth: 4 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 4 + SubGroup1: 64 + SubGroupA: 4 + SubGroupB: 
64 + SubGroupMXSA: 4 + SubGroupMXSB: 64 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: 0 + TailloopInNll: false + ThreadTile: [1, 1] + ThreadTile0: 72 + ThreadTile1: 4 + ThreadTileA: 72 + ThreadTileB: 4 + ThreadTileMXSA: 72 + ThreadTileMXSB: 4 + TransposeLDS: 1 + TransposeLDSMetadata: -1 + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMXSA: true + UnrollMajorLDSMXSB: true + Use64bShadowLimit: false + Use64bShadowLimitMX: false + UseCustomMainLoopSchedule: 0 + UseDirect32XEmulation: false + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: false + UseGeneralizedNLCOneA: false + UseGeneralizedNLCOneB: false + UseGeneralizedNLCOneMXSA: false + UseGeneralizedNLCOneMXSB: false + UseGeneralizedNLCOneMetadata: false + UseInstOffsetForGRO: 0 + UseMFMAF32XEmulation: false + UsePLRPack: 0 + UseSgprForGRO: -1 + UseSubtileImpl: true + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 1 + VectorWidthMXSA: 1 + VectorWidthMXSB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMXSA: 0 + WaveSeparateGlobalReadMXSB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 16 + WorkGroupMappingXCC: 2 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 256 + _DepthUA: 256 + _DepthUB: 256 + _DepthUMXSA: 8 + _DepthUMXSB: 8 + _DepthUMetadata: 256 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + enableLDSTrMXSA: false + enableLDSTrMXSB: false + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + tailLoopOptMXSA: false + tailLoopOptMXSB: false + - 1LDSBuffer: 0 + 
ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AdaptiveGemm: 0 + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1DivByMT1LowbitGT1: 0 + AssertFree1ElementMultiple: 1 + AssertKRingShiftTailWrapOnly: 0 + AssertSummationElementMultiple: 256 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BAddrInterleave: false + BaseName: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT323cjBEjqTj-Mu9ZF1RsqJ73WqkJOeA3kpk9e2rgOyJEQ= + BufferLoad: true + BufferStore: 1 + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 256 + DirectToLds: 1 + DirectToLdsA: true + DirectToLdsB: true + DirectToLdsMXSA: true + DirectToLdsMXSB: true + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprMXSA: false + DirectToVgprMXSB: false + DirectToVgprSparseMetadata: false + DtlPlusLdsBuf: 0 + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: false + ExpertSchedulingMode: 0 + ExtraLatencyForLR: 0 + ExtraMiLatencyLeft: -1 + ForceDisableShadowInit: false + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 32 + GlobalReadVectorWidthB: 32 + GlobalReadVectorWidthMXSA: 1 + GlobalReadVectorWidthMXSB: 12 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + KRingShift: false + Kernel: true + KernelLanguage: Assembly + KernelNameMin: 
Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT32x384x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT2_6_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_16_1 + LDSTrInst: false + LSCA: 256 + LSCB: 256 + LSCMXSA: 8 + LSCMXSB: 8 + LSPA: 32 + LSPB: 32 + LSPMXSA: 32 + LSPMXSB: 384 + LVCA: 8 + LVCB: 8 + LVCMXSA: 8 + LVCMXSB: 1 + LVPA: 1 + LVPB: 1 + LVPMXSA: 32 + LVPMXSB: 32 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsBlockSizePerPadMXSA: 0 + LdsBlockSizePerPadMXSB: 0 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 122112 + LdsInitCVgprs: false + LdsNumBytes: 122112 + LdsNumElementsAlignedA: 4096 + LdsNumElementsAlignedB: 49152 + LdsNumElementsAlignedMXSA: 256 + LdsNumElementsAlignedMXSB: 3072 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 7424 + LdsOffsetB_Blk: 72960 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMXSA: 4096 + LdsOffsetMXSA_Blk: 69632 + LdsOffsetMXSB: 4352 + LdsOffsetMXSB_Blk: 69888 + LdsOffsetMetadata: 7424 + LdsOffsetMetadata_Blk: 72960 + LdsPadA: 0 + LdsPadB: 0 + LdsPadMXSA: 0 + LdsPadMXSB: 0 + LdsPadMetadata: 0 + LocalReadVectorWidth: -1 + LocalReadVectorWidthA: 32 + LocalReadVectorWidthB: 32 + LocalReadVectorWidthMXSA: 1 + LocalReadVectorWidthMXSB: 1 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LocalWriteUseSgprMXSA: true + LocalWriteUseSgprMXSB: true + LoopIters: 2 + LoopUnroll: 256 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + 
MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMXSA: 1 + MIInputPerThreadMXSB: 1 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [1, 4] + MIWaveTile: [2, 6] + MIWaveTileA: 2 + MIWaveTileB: 6 + MIWaveTileMXSA: 2 + MIWaveTileMXSB: 6 + MIWaveTileMetadata: 0 + MacroTile0: 32 + MacroTile1: 384 + MacroTileA: 32 + MacroTileB: 384 + MacroTileMXSA: 32 + MacroTileMXSB: 384 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + MinGRIncPerMfma: 3 + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonDTLTailLoopMXSA: true + NonDTLTailLoopMXSB: true + NonTemporal: -1 + NonTemporalA: 4 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 4 + NonTemporalE: 0 + NonTemporalMXSA: 0 + NonTemporalMXSB: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 48 + NumGlobalWriteVectorsPerThread: 48 + NumLdsBlk: 2 + NumLoadsA: 1 + NumLoadsB: 12 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsCoalescedMXSA: 1 + NumLoadsCoalescedMXSB: 1 + NumLoadsMXSA: 1 + NumLoadsMXSB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 12 + NumLoadsPerpendicularMXSA: 1 + NumLoadsPerpendicularMXSB: 1 + NumThreads: 256 + NumTotalPackedLoadsA: -1 + NumTotalPackedLoadsB: -1 + NumTotalPackedLoadsMXSA: -1 + NumTotalPackedLoadsMXSB: -1 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGROverBarrier: 0 + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 49 + SolutionNameMin: 
Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT32x384x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT2_6_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM16_WGMXCC2_WGMXCCGn1 + SourceSwap: false + SpaceFillingAlgo: [] + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 4 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 4 + SubGroup1: 64 + SubGroupA: 4 + SubGroupB: 64 + SubGroupMXSA: 4 + SubGroupMXSB: 64 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: 0 + TailloopInNll: false + ThreadTile: [1, 1] + ThreadTile0: 8 + ThreadTile1: 6 + ThreadTileA: 8 + ThreadTileB: 6 + ThreadTileMXSA: 8 + ThreadTileMXSB: 6 + TransposeLDS: 1 + TransposeLDSMetadata: -1 + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMXSA: true + UnrollMajorLDSMXSB: true + Use64bShadowLimit: false + Use64bShadowLimitMX: false + UseCustomMainLoopSchedule: 0 + UseDirect32XEmulation: false + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: false + UseGeneralizedNLCOneA: false + UseGeneralizedNLCOneB: false + UseGeneralizedNLCOneMXSA: false + UseGeneralizedNLCOneMXSB: false + UseGeneralizedNLCOneMetadata: false + UseInstOffsetForGRO: 0 + UseMFMAF32XEmulation: false + UsePLRPack: 0 + UseSgprForGRO: -1 + UseSubtileImpl: true + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 1 + 
VectorWidthMXSA: 1 + VectorWidthMXSB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMXSA: 0 + WaveSeparateGlobalReadMXSB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 16 + WorkGroupMappingXCC: 2 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 256 + _DepthUA: 256 + _DepthUB: 256 + _DepthUMXSA: 8 + _DepthUMXSB: 8 + _DepthUMetadata: 256 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + enableLDSTrMXSA: false + enableLDSTrMXSB: false + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + tailLoopOptMXSA: false + tailLoopOptMXSB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AdaptiveGemm: 0 + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1DivByMT1LowbitGT1: 0 + AssertFree1ElementMultiple: 1 + AssertKRingShiftTailWrapOnly: 0 + AssertSummationElementMultiple: 256 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BAddrInterleave: false + BaseName: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT32k1L8EqrYflL4kvotLP49gssYPXLs5aDdnaskyCeVRw4= + BufferLoad: true + BufferStore: 1 + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 256 + DirectToLds: 1 + DirectToLdsA: true + DirectToLdsB: true + DirectToLdsMXSA: true + DirectToLdsMXSB: true + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprMXSA: false + DirectToVgprMXSB: false + DirectToVgprSparseMetadata: false + DtlPlusLdsBuf: 0 
+ EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: false + ExpertSchedulingMode: 0 + ExtraLatencyForLR: 0 + ExtraMiLatencyLeft: -1 + ForceDisableShadowInit: false + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 32 + GlobalReadVectorWidthB: 32 + GlobalReadVectorWidthMXSA: 1 + GlobalReadVectorWidthMXSB: 16 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + KRingShift: false + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT32x512x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT2_8_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_16_1 + LDSTrInst: false + LSCA: 256 + LSCB: 256 + LSCMXSA: 8 + LSCMXSB: 8 + LSPA: 32 + LSPB: 32 + LSPMXSA: 32 + LSPMXSB: 512 + LVCA: 8 + LVCB: 8 + LVCMXSA: 8 + LVCMXSB: 1 + LVPA: 1 + LVPB: 1 + LVPMXSA: 32 + LVPMXSB: 32 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsBlockSizePerPadMXSA: 0 + LdsBlockSizePerPadMXSB: 0 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 147968 + LdsInitCVgprs: false + LdsNumBytes: 147968 + LdsNumElementsAlignedA: 4096 + 
LdsNumElementsAlignedB: 65536 + LdsNumElementsAlignedMXSA: 256 + LdsNumElementsAlignedMXSB: 4096 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 73984 + LdsOffsetB: 8448 + LdsOffsetB_Blk: 82432 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMXSA: 4096 + LdsOffsetMXSA_Blk: 78080 + LdsOffsetMXSB: 4352 + LdsOffsetMXSB_Blk: 78336 + LdsOffsetMetadata: 8448 + LdsOffsetMetadata_Blk: 82432 + LdsPadA: 0 + LdsPadB: 0 + LdsPadMXSA: 0 + LdsPadMXSB: 0 + LdsPadMetadata: 0 + LocalReadVectorWidth: -1 + LocalReadVectorWidthA: 32 + LocalReadVectorWidthB: 32 + LocalReadVectorWidthMXSA: 1 + LocalReadVectorWidthMXSB: 1 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LocalWriteUseSgprMXSA: true + LocalWriteUseSgprMXSB: true + LoopIters: 2 + LoopUnroll: 256 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMXSA: 1 + MIInputPerThreadMXSB: 1 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [1, 4] + MIWaveTile: [2, 8] + MIWaveTileA: 2 + MIWaveTileB: 8 + MIWaveTileMXSA: 2 + MIWaveTileMXSB: 8 + MIWaveTileMetadata: 0 + MacroTile0: 32 + MacroTile1: 512 + MacroTileA: 32 + MacroTileB: 512 + MacroTileMXSA: 32 + MacroTileMXSB: 512 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + MinGRIncPerMfma: 3 + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonDTLTailLoopMXSA: true + NonDTLTailLoopMXSB: true + NonTemporal: -1 + NonTemporalA: 4 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 4 + NonTemporalE: 0 + NonTemporalMXSA: 0 + 
NonTemporalMXSB: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 64 + NumLdsBlk: 2 + NumLoadsA: 1 + NumLoadsB: 16 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsCoalescedMXSA: 1 + NumLoadsCoalescedMXSB: 1 + NumLoadsMXSA: 1 + NumLoadsMXSB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 16 + NumLoadsPerpendicularMXSA: 1 + NumLoadsPerpendicularMXSB: 1 + NumThreads: 256 + NumTotalPackedLoadsA: -1 + NumTotalPackedLoadsB: -1 + NumTotalPackedLoadsMXSA: -1 + NumTotalPackedLoadsMXSB: -1 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGROverBarrier: 0 + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 50 + SolutionNameMin: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT32x512x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT2_8_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM16_WGMXCC2_WGMXCCGn1 + SourceSwap: false + SpaceFillingAlgo: [] + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: true + StoreSyncOpt: 0 + StoreVectorWidth: 4 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 4 + SubGroup1: 64 + SubGroupA: 4 + SubGroupB: 64 + SubGroupMXSA: 4 + SubGroupMXSB: 64 
+ SuppressNoLoadLoop: false + SwapGlobalReadOrder: 0 + TailloopInNll: false + ThreadTile: [1, 1] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + ThreadTileMXSA: 8 + ThreadTileMXSB: 8 + TransposeLDS: 1 + TransposeLDSMetadata: -1 + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMXSA: true + UnrollMajorLDSMXSB: true + Use64bShadowLimit: false + Use64bShadowLimitMX: false + UseCustomMainLoopSchedule: 0 + UseDirect32XEmulation: false + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: false + UseGeneralizedNLCOneA: false + UseGeneralizedNLCOneB: false + UseGeneralizedNLCOneMXSA: false + UseGeneralizedNLCOneMXSB: false + UseGeneralizedNLCOneMetadata: false + UseInstOffsetForGRO: 0 + UseMFMAF32XEmulation: false + UsePLRPack: 0 + UseSgprForGRO: -1 + UseSubtileImpl: true + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 1 + VectorWidthMXSA: 1 + VectorWidthMXSB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMXSA: 0 + WaveSeparateGlobalReadMXSB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 16 + WorkGroupMappingXCC: 2 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 256 + _DepthUA: 256 + _DepthUB: 256 + _DepthUMXSA: 8 + _DepthUMXSB: 8 + _DepthUMetadata: 256 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + enableLDSTrMXSA: false + enableLDSTrMXSB: false + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + tailLoopOptMXSA: false + tailLoopOptMXSB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: false 
+ ActivationFused: true + AdaptiveGemm: 0 + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1DivByMT1LowbitGT1: 0 + AssertFree1ElementMultiple: 1 + AssertKRingShiftTailWrapOnly: 0 + AssertSummationElementMultiple: 256 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BAddrInterleave: false + BaseName: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT9680-vsketBKOwATFxdutsK5i_NcnvLOonACpwOS25BUY= + BufferLoad: true + BufferStore: 1 + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 256 + DirectToLds: 1 + DirectToLdsA: true + DirectToLdsB: true + DirectToLdsMXSA: true + DirectToLdsMXSB: true + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprMXSA: false + DirectToVgprMXSB: false + DirectToVgprSparseMetadata: false + DtlPlusLdsBuf: 0 + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: false + ExpertSchedulingMode: 0 + ExtraLatencyForLR: 0 + ExtraMiLatencyLeft: -1 + ForceDisableShadowInit: false + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 32 + GlobalReadVectorWidthB: 32 + GlobalReadVectorWidthMXSA: 3 + GlobalReadVectorWidthMXSB: 12 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + KRingShift: false + Kernel: true + KernelLanguage: Assembly + KernelNameMin: 
Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT96x384x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT6_6_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_16_1 + LDSTrInst: false + LSCA: 256 + LSCB: 256 + LSCMXSA: 8 + LSCMXSB: 8 + LSPA: 32 + LSPB: 32 + LSPMXSA: 96 + LSPMXSB: 384 + LVCA: 8 + LVCB: 8 + LVCMXSA: 3 + LVCMXSB: 1 + LVPA: 1 + LVPB: 1 + LVPMXSA: 32 + LVPMXSB: 32 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsBlockSizePerPadMXSA: 0 + LdsBlockSizePerPadMXSB: 0 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 130816 + LdsInitCVgprs: false + LdsNumBytes: 130816 + LdsNumElementsAlignedA: 12288 + LdsNumElementsAlignedB: 49152 + LdsNumElementsAlignedMXSA: 768 + LdsNumElementsAlignedMXSB: 3072 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 16128 + LdsOffsetB_Blk: 81664 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMXSA: 12288 + LdsOffsetMXSA_Blk: 77824 + LdsOffsetMXSB: 13056 + LdsOffsetMXSB_Blk: 78592 + LdsOffsetMetadata: 16128 + LdsOffsetMetadata_Blk: 81664 + LdsPadA: 0 + LdsPadB: 0 + LdsPadMXSA: 0 + LdsPadMXSB: 0 + LdsPadMetadata: 0 + LocalReadVectorWidth: -1 + LocalReadVectorWidthA: 32 + LocalReadVectorWidthB: 32 + LocalReadVectorWidthMXSA: 1 + LocalReadVectorWidthMXSB: 1 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LocalWriteUseSgprMXSA: true + LocalWriteUseSgprMXSB: true + LoopIters: 2 + LoopUnroll: 256 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + 
MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMXSA: 1 + MIInputPerThreadMXSB: 1 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [1, 4] + MIWaveTile: [6, 6] + MIWaveTileA: 6 + MIWaveTileB: 6 + MIWaveTileMXSA: 6 + MIWaveTileMXSB: 6 + MIWaveTileMetadata: 0 + MacroTile0: 96 + MacroTile1: 384 + MacroTileA: 96 + MacroTileB: 384 + MacroTileMXSA: 96 + MacroTileMXSB: 384 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + MinGRIncPerMfma: 3 + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonDTLTailLoopMXSA: true + NonDTLTailLoopMXSB: true + NonTemporal: -1 + NonTemporalA: 4 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 4 + NonTemporalE: 0 + NonTemporalMXSA: 0 + NonTemporalMXSB: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 144 + NumGlobalWriteVectorsPerThread: 144 + NumLdsBlk: 2 + NumLoadsA: 3 + NumLoadsB: 12 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsCoalescedMXSA: 1 + NumLoadsCoalescedMXSB: 1 + NumLoadsMXSA: 1 + NumLoadsMXSB: 1 + NumLoadsPerpendicularA: 3 + NumLoadsPerpendicularB: 12 + NumLoadsPerpendicularMXSA: 1 + NumLoadsPerpendicularMXSB: 1 + NumThreads: 256 + NumTotalPackedLoadsA: -1 + NumTotalPackedLoadsB: -1 + NumTotalPackedLoadsMXSA: -1 + NumTotalPackedLoadsMXSB: -1 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGROverBarrier: 0 + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 51 + 
SolutionNameMin: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT96x384x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT6_6_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM16_WGMXCC2_WGMXCCGn1 + SourceSwap: false + SpaceFillingAlgo: [] + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 4 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 4 + SubGroup1: 64 + SubGroupA: 4 + SubGroupB: 64 + SubGroupMXSA: 4 + SubGroupMXSB: 64 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: 0 + TailloopInNll: false + ThreadTile: [1, 1] + ThreadTile0: 24 + ThreadTile1: 6 + ThreadTileA: 24 + ThreadTileB: 6 + ThreadTileMXSA: 24 + ThreadTileMXSB: 6 + TransposeLDS: 1 + TransposeLDSMetadata: -1 + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMXSA: true + UnrollMajorLDSMXSB: true + Use64bShadowLimit: false + Use64bShadowLimitMX: false + UseCustomMainLoopSchedule: 0 + UseDirect32XEmulation: false + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: false + UseGeneralizedNLCOneA: false + UseGeneralizedNLCOneB: false + UseGeneralizedNLCOneMXSA: false + UseGeneralizedNLCOneMXSB: false + UseGeneralizedNLCOneMetadata: false + UseInstOffsetForGRO: 0 + UseMFMAF32XEmulation: false + UsePLRPack: 0 + UseSgprForGRO: -1 + UseSubtileImpl: true + Valid: true + VectorStore: -1 + VectorWidthA: 1 + 
VectorWidthB: 1 + VectorWidthMXSA: 1 + VectorWidthMXSB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMXSA: 0 + WaveSeparateGlobalReadMXSB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 16 + WorkGroupMappingXCC: 2 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 256 + _DepthUA: 256 + _DepthUB: 256 + _DepthUMXSA: 8 + _DepthUMXSB: 8 + _DepthUMetadata: 256 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + enableLDSTrMXSA: false + enableLDSTrMXSB: false + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + tailLoopOptMXSA: false + tailLoopOptMXSB: false +- null +- null +- null +- null +- DeviceEfficiency +- Prediction diff --git a/projects/hipblaslt/library/src/amd_detail/rocblaslt/src/Tensile/Logic/asm_full/gfx950/gfx950/Origami/Origami_nta4/gfx950_Cijk_Alik_Bljk_F4HS_MXA32_MXB32_BH_BiasSH_HAS_SAB_SAV_UserArgs.yaml b/projects/hipblaslt/library/src/amd_detail/rocblaslt/src/Tensile/Logic/asm_full/gfx950/gfx950/Origami/Origami_nta4/gfx950_Cijk_Alik_Bljk_F4HS_MXA32_MXB32_BH_BiasSH_HAS_SAB_SAV_UserArgs.yaml new file mode 100644 index 00000000000..dd56492fd9a --- /dev/null +++ b/projects/hipblaslt/library/src/amd_detail/rocblaslt/src/Tensile/Logic/asm_full/gfx950/gfx950/Origami/Origami_nta4/gfx950_Cijk_Alik_Bljk_F4HS_MXA32_MXB32_BH_BiasSH_HAS_SAB_SAV_UserArgs.yaml @@ -0,0 +1,16950 @@ +- {MinimumRequiredVersion: 5.0.0} +- gfx950 +- gfx950 +- [Device 75a0] +- Activation: true + ActivationComputeDataType: 0 + ActivationNoGuard: false + ActivationType: hipblaslt_all + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + 
BetaOnlyUseBias: false + BiasDataTypeList: [0, 4] + BiasSrc: D + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 21 + DataTypeA: 21 + DataTypeAmaxD: 0 + DataTypeB: 21 + DataTypeE: 4 + DataTypeMXSA: 22 + DataTypeMXSB: 22 + DestDataType: 4 + F32XdlMathOp: 0 + Gradient: false + GroupedGemm: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index01MXSA: 0 + Index01MXSB: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexAssignmentsMXSA: [3, 0, 2] + IndexAssignmentsMXSB: [3, 1, 2] + IndexAssignmentsMetadata: [3, 0, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndexUnrollM: 0 + IndexUnrollMXSA: 0 + IndexUnrollMXSB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MXBlockA: 32 + MXBlockB: 32 + MacDataTypeA: 21 + MacDataTypeB: 21 + MirrorDimsA: [] + MirrorDimsB: [] + MirrorDimsMXSA: [] + MirrorDimsMXSB: [] + MirrorDimsMetadata: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + OutputAmaxD: false + SetConstStrideA: [] + SetConstStrideB: [] + SetConstStrideBias: [] + SilentHighPrecisionAccumulate: false + Sparse: 0 + StochasticRounding: false + StridedBatched: true + SupportUserArgs: true + SwizzleTensorA: false + SwizzleTensorB: false + TLUA: false + TLUB: false + TLUMXSA: false + TLUMXSB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: 1 + TransposeB: 0 + UseBeta: true + UseBias: 1 + UseE: false + UseInitialStridesAB: false + UseInitialStridesCD: false + UseScaleAB: Scalar + UseScaleAlphaVec: 1 + UseScaleCD: false +- - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AdaptiveGemm: 0 + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1DivByMT1LowbitGT1: 0 + 
AssertFree1ElementMultiple: 1 + AssertKRingShiftTailWrapOnly: 0 + AssertSummationElementMultiple: 256 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BAddrInterleave: false + BaseName: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT64hIsnhFT2L5uw_T8ryWf3mri6PtKpwJ1E97MC3rNtxpw= + BufferLoad: true + BufferStore: 1 + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 256 + DirectToLds: 1 + DirectToLdsA: true + DirectToLdsB: true + DirectToLdsMXSA: true + DirectToLdsMXSB: true + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprMXSA: false + DirectToVgprMXSB: false + DirectToVgprSparseMetadata: false + DtlPlusLdsBuf: 0 + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: false + ExpertSchedulingMode: 0 + ExtraLatencyForLR: 0 + ExtraMiLatencyLeft: -1 + ForceDisableShadowInit: false + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 32 + GlobalReadVectorWidthB: 32 + GlobalReadVectorWidthMXSA: 2 + GlobalReadVectorWidthMXSB: 2 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + KRingShift: false + Kernel: true + KernelLanguage: Assembly + KernelNameMin: 
Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT64x64x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT2_2_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 256 + LSCB: 256 + LSCMXSA: 8 + LSCMXSB: 8 + LSPA: 32 + LSPB: 32 + LSPMXSA: 64 + LSPMXSB: 64 + LVCA: 8 + LVCB: 8 + LVCMXSA: 4 + LVCMXSB: 4 + LVPA: 1 + LVPB: 1 + LVPMXSA: 32 + LVPMXSB: 32 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsBlockSizePerPadMXSA: 0 + LdsBlockSizePerPadMXSB: 0 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 50176 + LdsInitCVgprs: false + LdsNumBytes: 50176 + LdsNumElementsAlignedA: 8192 + LdsNumElementsAlignedB: 8192 + LdsNumElementsAlignedMXSA: 512 + LdsNumElementsAlignedMXSB: 512 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 32768 + LdsOffsetB: 9216 + LdsOffsetB_Blk: 41984 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMXSA: 8192 + LdsOffsetMXSA_Blk: 40960 + LdsOffsetMXSB: 8704 + LdsOffsetMXSB_Blk: 41472 + LdsOffsetMetadata: 9216 + LdsOffsetMetadata_Blk: 41984 + LdsPadA: 0 + LdsPadB: 0 + LdsPadMXSA: 0 + LdsPadMXSB: 0 + LdsPadMetadata: 0 + LocalReadVectorWidth: -1 + LocalReadVectorWidthA: 32 + LocalReadVectorWidthB: 32 + LocalReadVectorWidthMXSA: 1 + LocalReadVectorWidthMXSB: 1 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LocalWriteUseSgprMXSA: true + LocalWriteUseSgprMXSB: true + LoopIters: 2 + LoopUnroll: 256 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 32 + 
MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMXSA: 1 + MIInputPerThreadMXSB: 1 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [2, 2] + MIWaveTileA: 2 + MIWaveTileB: 2 + MIWaveTileMXSA: 2 + MIWaveTileMXSB: 2 + MIWaveTileMetadata: 0 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileMXSA: 64 + MacroTileMXSB: 64 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + MinGRIncPerMfma: 3 + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonDTLTailLoopMXSA: true + NonDTLTailLoopMXSB: true + NonTemporal: -1 + NonTemporalA: 4 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 4 + NonTemporalE: 0 + NonTemporalMXSA: 0 + NonTemporalMXSB: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 16 + NumLdsBlk: 2 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsCoalescedMXSA: 1 + NumLoadsCoalescedMXSB: 1 + NumLoadsMXSA: 1 + NumLoadsMXSB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularMXSA: 1 + NumLoadsPerpendicularMXSB: 1 + NumThreads: 256 + NumTotalPackedLoadsA: -1 + NumTotalPackedLoadsB: -1 + NumTotalPackedLoadsMXSA: -1 + NumTotalPackedLoadsMXSB: -1 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGROverBarrier: 0 + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 0 + SolutionNameMin: 
Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT64x64x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT2_2_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM16_WGMXCC2_WGMXCCGn1 + SourceSwap: false + SpaceFillingAlgo: [] + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 4 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SubGroupMXSA: 8 + SubGroupMXSB: 32 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: 0 + TailloopInNll: false + ThreadTile: [1, 1] + ThreadTile0: 8 + ThreadTile1: 2 + ThreadTileA: 8 + ThreadTileB: 2 + ThreadTileMXSA: 8 + ThreadTileMXSB: 2 + TransposeLDS: 1 + TransposeLDSMetadata: -1 + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMXSA: true + UnrollMajorLDSMXSB: true + Use64bShadowLimit: false + Use64bShadowLimitMX: false + UseCustomMainLoopSchedule: 0 + UseDirect32XEmulation: false + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: false + UseGeneralizedNLCOneA: false + UseGeneralizedNLCOneB: false + UseGeneralizedNLCOneMXSA: false + UseGeneralizedNLCOneMXSB: false + UseGeneralizedNLCOneMetadata: false + UseInstOffsetForGRO: 0 + UseMFMAF32XEmulation: false + UsePLRPack: 0 + UseSgprForGRO: -1 + UseSubtileImpl: true + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 1 + 
VectorWidthMXSA: 1 + VectorWidthMXSB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMXSA: 0 + WaveSeparateGlobalReadMXSB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 16 + WorkGroupMappingXCC: 2 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 256 + _DepthUA: 256 + _DepthUB: 256 + _DepthUMXSA: 8 + _DepthUMXSB: 8 + _DepthUMetadata: 256 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + enableLDSTrMXSA: false + enableLDSTrMXSB: false + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + tailLoopOptMXSA: false + tailLoopOptMXSB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AdaptiveGemm: 0 + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1DivByMT1LowbitGT1: 0 + AssertFree1ElementMultiple: 1 + AssertKRingShiftTailWrapOnly: 0 + AssertSummationElementMultiple: 256 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BAddrInterleave: false + BaseName: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT128AMjoHHGnVSJ1fjvuIJeKPVtejro7lpRL6TixtWnDac= + BufferLoad: true + BufferStore: 1 + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 256 + DirectToLds: 1 + DirectToLdsA: true + DirectToLdsB: true + DirectToLdsMXSA: true + DirectToLdsMXSB: true + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprMXSA: false + DirectToVgprMXSB: false + DirectToVgprSparseMetadata: false + DtlPlusLdsBuf: 0 
+ EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: false + ExpertSchedulingMode: 0 + ExtraLatencyForLR: 0 + ExtraMiLatencyLeft: -1 + ForceDisableShadowInit: false + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 32 + GlobalReadVectorWidthB: 32 + GlobalReadVectorWidthMXSA: 4 + GlobalReadVectorWidthMXSB: 4 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + KRingShift: false + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT128x128x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT4_4_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 256 + LSCB: 256 + LSCMXSA: 8 + LSCMXSB: 8 + LSPA: 32 + LSPB: 32 + LSPMXSA: 128 + LSPMXSB: 128 + LVCA: 8 + LVCB: 8 + LVCMXSA: 2 + LVCMXSB: 2 + LVPA: 1 + LVPB: 1 + LVPMXSA: 32 + LVPMXSB: 32 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsBlockSizePerPadMXSA: 0 + LdsBlockSizePerPadMXSB: 0 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 100352 + LdsInitCVgprs: false + LdsNumBytes: 100352 + LdsNumElementsAlignedA: 16384 + 
LdsNumElementsAlignedB: 16384 + LdsNumElementsAlignedMXSA: 1024 + LdsNumElementsAlignedMXSB: 1024 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 18432 + LdsOffsetB_Blk: 83968 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMXSA: 16384 + LdsOffsetMXSA_Blk: 81920 + LdsOffsetMXSB: 17408 + LdsOffsetMXSB_Blk: 82944 + LdsOffsetMetadata: 18432 + LdsOffsetMetadata_Blk: 83968 + LdsPadA: 0 + LdsPadB: 0 + LdsPadMXSA: 0 + LdsPadMXSB: 0 + LdsPadMetadata: 0 + LocalReadVectorWidth: -1 + LocalReadVectorWidthA: 32 + LocalReadVectorWidthB: 32 + LocalReadVectorWidthMXSA: 1 + LocalReadVectorWidthMXSB: 1 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LocalWriteUseSgprMXSA: true + LocalWriteUseSgprMXSB: true + LoopIters: 2 + LoopUnroll: 256 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMXSA: 1 + MIInputPerThreadMXSB: 1 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MIWaveTileMXSA: 4 + MIWaveTileMXSB: 4 + MIWaveTileMetadata: 0 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileMXSA: 128 + MacroTileMXSB: 128 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + MinGRIncPerMfma: 3 + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonDTLTailLoopMXSA: true + NonDTLTailLoopMXSB: true + NonTemporal: -1 + NonTemporalA: 4 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 4 + NonTemporalE: 0 + NonTemporalMXSA: 0 + 
NonTemporalMXSB: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 64 + NumLdsBlk: 2 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsCoalescedMXSA: 1 + NumLoadsCoalescedMXSB: 1 + NumLoadsMXSA: 1 + NumLoadsMXSB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumLoadsPerpendicularMXSA: 1 + NumLoadsPerpendicularMXSB: 1 + NumThreads: 256 + NumTotalPackedLoadsA: -1 + NumTotalPackedLoadsB: -1 + NumTotalPackedLoadsMXSA: -1 + NumTotalPackedLoadsMXSB: -1 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGROverBarrier: 0 + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 1 + SolutionNameMin: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT128x128x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT4_4_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM16_WGMXCC2_WGMXCCGn1 + SourceSwap: false + SpaceFillingAlgo: [] + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 4 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SubGroupMXSA: 8 + SubGroupMXSB: 32 + 
SuppressNoLoadLoop: false + SwapGlobalReadOrder: 0 + TailloopInNll: false + ThreadTile: [1, 1] + ThreadTile0: 16 + ThreadTile1: 4 + ThreadTileA: 16 + ThreadTileB: 4 + ThreadTileMXSA: 16 + ThreadTileMXSB: 4 + TransposeLDS: 1 + TransposeLDSMetadata: -1 + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMXSA: true + UnrollMajorLDSMXSB: true + Use64bShadowLimit: false + Use64bShadowLimitMX: false + UseCustomMainLoopSchedule: 0 + UseDirect32XEmulation: false + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: false + UseGeneralizedNLCOneA: false + UseGeneralizedNLCOneB: false + UseGeneralizedNLCOneMXSA: false + UseGeneralizedNLCOneMXSB: false + UseGeneralizedNLCOneMetadata: false + UseInstOffsetForGRO: 0 + UseMFMAF32XEmulation: false + UsePLRPack: 0 + UseSgprForGRO: -1 + UseSubtileImpl: true + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 1 + VectorWidthMXSA: 1 + VectorWidthMXSB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMXSA: 0 + WaveSeparateGlobalReadMXSB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 16 + WorkGroupMappingXCC: 2 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 256 + _DepthUA: 256 + _DepthUB: 256 + _DepthUMXSA: 8 + _DepthUMXSB: 8 + _DepthUMetadata: 256 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + enableLDSTrMXSA: false + enableLDSTrMXSB: false + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + tailLoopOptMXSA: false + tailLoopOptMXSB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: false 
+ ActivationFused: true + AdaptiveGemm: 0 + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1DivByMT1LowbitGT1: 0 + AssertFree1ElementMultiple: 1 + AssertKRingShiftTailWrapOnly: 0 + AssertSummationElementMultiple: 256 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BAddrInterleave: false + BaseName: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT19tVFE7Mochg2Q71HzQS4tIGctt0Td3p2SmmxHjjWSYTc= + BufferLoad: true + BufferStore: 1 + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 256 + DirectToLds: 1 + DirectToLdsA: true + DirectToLdsB: true + DirectToLdsMXSA: true + DirectToLdsMXSB: true + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprMXSA: false + DirectToVgprMXSB: false + DirectToVgprSparseMetadata: false + DtlPlusLdsBuf: 0 + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: false + ExpertSchedulingMode: 0 + ExtraLatencyForLR: 0 + ExtraMiLatencyLeft: -1 + ForceDisableShadowInit: false + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 32 + GlobalReadVectorWidthB: 32 + GlobalReadVectorWidthMXSA: 6 + GlobalReadVectorWidthMXSB: 6 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + KRingShift: false + Kernel: true + KernelLanguage: Assembly + KernelNameMin: 
Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT192x192x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT6_6_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 256 + LSCB: 256 + LSCMXSA: 8 + LSCMXSB: 8 + LSPA: 32 + LSPB: 32 + LSPMXSA: 192 + LSPMXSB: 192 + LVCA: 8 + LVCB: 8 + LVCMXSA: 2 + LVCMXSB: 2 + LVPA: 1 + LVPB: 1 + LVPMXSA: 32 + LVPMXSB: 32 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsBlockSizePerPadMXSA: 0 + LdsBlockSizePerPadMXSB: 0 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 117760 + LdsInitCVgprs: false + LdsNumBytes: 117760 + LdsNumElementsAlignedA: 24576 + LdsNumElementsAlignedB: 24576 + LdsNumElementsAlignedMXSA: 1536 + LdsNumElementsAlignedMXSB: 1536 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 27648 + LdsOffsetB_Blk: 93184 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMXSA: 24576 + LdsOffsetMXSA_Blk: 90112 + LdsOffsetMXSB: 26112 + LdsOffsetMXSB_Blk: 91648 + LdsOffsetMetadata: 27648 + LdsOffsetMetadata_Blk: 93184 + LdsPadA: 0 + LdsPadB: 0 + LdsPadMXSA: 0 + LdsPadMXSB: 0 + LdsPadMetadata: 0 + LocalReadVectorWidth: -1 + LocalReadVectorWidthA: 32 + LocalReadVectorWidthB: 32 + LocalReadVectorWidthMXSA: 1 + LocalReadVectorWidthMXSB: 1 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LocalWriteUseSgprMXSA: true + LocalWriteUseSgprMXSB: true + LoopIters: 2 + LoopUnroll: 256 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + 
MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMXSA: 1 + MIInputPerThreadMXSB: 1 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [6, 6] + MIWaveTileA: 6 + MIWaveTileB: 6 + MIWaveTileMXSA: 6 + MIWaveTileMXSB: 6 + MIWaveTileMetadata: 0 + MacroTile0: 192 + MacroTile1: 192 + MacroTileA: 192 + MacroTileB: 192 + MacroTileMXSA: 192 + MacroTileMXSB: 192 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + MinGRIncPerMfma: 3 + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonDTLTailLoopMXSA: true + NonDTLTailLoopMXSB: true + NonTemporal: -1 + NonTemporalA: 4 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 4 + NonTemporalE: 0 + NonTemporalMXSA: 0 + NonTemporalMXSB: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 144 + NumGlobalWriteVectorsPerThread: 144 + NumLdsBlk: 2 + NumLoadsA: 6 + NumLoadsB: 6 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsCoalescedMXSA: 1 + NumLoadsCoalescedMXSB: 1 + NumLoadsMXSA: 1 + NumLoadsMXSB: 1 + NumLoadsPerpendicularA: 6 + NumLoadsPerpendicularB: 6 + NumLoadsPerpendicularMXSA: 1 + NumLoadsPerpendicularMXSB: 1 + NumThreads: 256 + NumTotalPackedLoadsA: -1 + NumTotalPackedLoadsB: -1 + NumTotalPackedLoadsMXSA: -1 + NumTotalPackedLoadsMXSB: -1 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGROverBarrier: 0 + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 2 + 
SolutionNameMin: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT192x192x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT6_6_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM16_WGMXCC2_WGMXCCGn1 + SourceSwap: false + SpaceFillingAlgo: [] + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 4 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SubGroupMXSA: 8 + SubGroupMXSB: 32 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: 0 + TailloopInNll: false + ThreadTile: [1, 1] + ThreadTile0: 24 + ThreadTile1: 6 + ThreadTileA: 24 + ThreadTileB: 6 + ThreadTileMXSA: 24 + ThreadTileMXSB: 6 + TransposeLDS: 1 + TransposeLDSMetadata: -1 + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMXSA: true + UnrollMajorLDSMXSB: true + Use64bShadowLimit: false + Use64bShadowLimitMX: false + UseCustomMainLoopSchedule: 0 + UseDirect32XEmulation: false + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: false + UseGeneralizedNLCOneA: false + UseGeneralizedNLCOneB: false + UseGeneralizedNLCOneMXSA: false + UseGeneralizedNLCOneMXSB: false + UseGeneralizedNLCOneMetadata: false + UseInstOffsetForGRO: 0 + UseMFMAF32XEmulation: false + UsePLRPack: 0 + UseSgprForGRO: -1 + UseSubtileImpl: true + Valid: true + VectorStore: -1 + VectorWidthA: 1 + 
VectorWidthB: 1 + VectorWidthMXSA: 1 + VectorWidthMXSB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMXSA: 0 + WaveSeparateGlobalReadMXSB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 16 + WorkGroupMappingXCC: 2 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 256 + _DepthUA: 256 + _DepthUB: 256 + _DepthUMXSA: 8 + _DepthUMXSB: 8 + _DepthUMetadata: 256 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + enableLDSTrMXSA: false + enableLDSTrMXSB: false + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + tailLoopOptMXSA: false + tailLoopOptMXSB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AdaptiveGemm: 0 + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1DivByMT1LowbitGT1: 0 + AssertFree1ElementMultiple: 1 + AssertKRingShiftTailWrapOnly: 0 + AssertSummationElementMultiple: 256 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BAddrInterleave: false + BaseName: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT25YgPR4XPaMfFjkFVdEJNxrCtJ2m9IQRE3ggDN5hyNKfg= + BufferLoad: true + BufferStore: 1 + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 256 + DirectToLds: 1 + DirectToLdsA: true + DirectToLdsB: true + DirectToLdsMXSA: true + DirectToLdsMXSB: true + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprMXSA: false + DirectToVgprMXSB: false + DirectToVgprSparseMetadata: false + 
DtlPlusLdsBuf: 0 + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: false + ExpertSchedulingMode: 0 + ExtraLatencyForLR: 0 + ExtraMiLatencyLeft: -1 + ForceDisableShadowInit: false + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 32 + GlobalReadVectorWidthB: 32 + GlobalReadVectorWidthMXSA: 4 + GlobalReadVectorWidthMXSB: 4 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + KRingShift: false + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT256x256x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT8_8_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 256 + LSCB: 256 + LSCMXSA: 8 + LSCMXSB: 8 + LSPA: 32 + LSPB: 32 + LSPMXSA: 128 + LSPMXSB: 128 + LVCA: 8 + LVCB: 8 + LVCMXSA: 2 + LVCMXSB: 2 + LVPA: 1 + LVPB: 1 + LVPMXSA: 32 + LVPMXSB: 32 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsBlockSizePerPadMXSA: 0 + LdsBlockSizePerPadMXSB: 0 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 139264 + LdsInitCVgprs: false + LdsNumBytes: 139264 + 
LdsNumElementsAlignedA: 32768 + LdsNumElementsAlignedB: 32768 + LdsNumElementsAlignedMXSA: 2048 + LdsNumElementsAlignedMXSB: 2048 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 69632 + LdsOffsetB: 36864 + LdsOffsetB_Blk: 106496 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMXSA: 32768 + LdsOffsetMXSA_Blk: 102400 + LdsOffsetMXSB: 34816 + LdsOffsetMXSB_Blk: 104448 + LdsOffsetMetadata: 36864 + LdsOffsetMetadata_Blk: 106496 + LdsPadA: 0 + LdsPadB: 0 + LdsPadMXSA: 0 + LdsPadMXSB: 0 + LdsPadMetadata: 0 + LocalReadVectorWidth: -1 + LocalReadVectorWidthA: 32 + LocalReadVectorWidthB: 32 + LocalReadVectorWidthMXSA: 1 + LocalReadVectorWidthMXSB: 1 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LocalWriteUseSgprMXSA: true + LocalWriteUseSgprMXSB: true + LoopIters: 2 + LoopUnroll: 256 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMXSA: 1 + MIInputPerThreadMXSB: 1 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [8, 8] + MIWaveTileA: 8 + MIWaveTileB: 8 + MIWaveTileMXSA: 8 + MIWaveTileMXSB: 8 + MIWaveTileMetadata: 0 + MacroTile0: 256 + MacroTile1: 256 + MacroTileA: 256 + MacroTileB: 256 + MacroTileMXSA: 256 + MacroTileMXSB: 256 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + MinGRIncPerMfma: 3 + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonDTLTailLoopMXSA: true + NonDTLTailLoopMXSB: true + NonTemporal: -1 + NonTemporalA: 4 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 4 + 
NonTemporalE: 0 + NonTemporalMXSA: 0 + NonTemporalMXSB: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 256 + NumGlobalWriteVectorsPerThread: 256 + NumLdsBlk: 2 + NumLoadsA: 8 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsCoalescedMXSA: 1 + NumLoadsCoalescedMXSB: 1 + NumLoadsMXSA: 2 + NumLoadsMXSB: 2 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 8 + NumLoadsPerpendicularMXSA: 2 + NumLoadsPerpendicularMXSB: 2 + NumThreads: 256 + NumTotalPackedLoadsA: -1 + NumTotalPackedLoadsB: -1 + NumTotalPackedLoadsMXSA: -1 + NumTotalPackedLoadsMXSB: -1 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGROverBarrier: 0 + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 3 + SolutionNameMin: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT256x256x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT8_8_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM16_WGMXCC2_WGMXCCGn1 + SourceSwap: false + SpaceFillingAlgo: [] + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: true + StoreSyncOpt: 0 + StoreVectorWidth: 4 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 
+ SubGroupMXSA: 8 + SubGroupMXSB: 32 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: 0 + TailloopInNll: false + ThreadTile: [1, 1] + ThreadTile0: 32 + ThreadTile1: 8 + ThreadTileA: 32 + ThreadTileB: 8 + ThreadTileMXSA: 32 + ThreadTileMXSB: 8 + TransposeLDS: 1 + TransposeLDSMetadata: -1 + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMXSA: true + UnrollMajorLDSMXSB: true + Use64bShadowLimit: false + Use64bShadowLimitMX: false + UseCustomMainLoopSchedule: 0 + UseDirect32XEmulation: false + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: false + UseGeneralizedNLCOneA: false + UseGeneralizedNLCOneB: false + UseGeneralizedNLCOneMXSA: false + UseGeneralizedNLCOneMXSB: false + UseGeneralizedNLCOneMetadata: false + UseInstOffsetForGRO: 0 + UseMFMAF32XEmulation: false + UsePLRPack: 0 + UseSgprForGRO: -1 + UseSubtileImpl: true + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 1 + VectorWidthMXSA: 1 + VectorWidthMXSB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMXSA: 0 + WaveSeparateGlobalReadMXSB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 16 + WorkGroupMappingXCC: 2 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 256 + _DepthUA: 256 + _DepthUB: 256 + _DepthUMXSA: 8 + _DepthUMXSB: 8 + _DepthUMetadata: 256 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + enableLDSTrMXSA: false + enableLDSTrMXSB: false + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + tailLoopOptMXSA: false + tailLoopOptMXSB: false + - 1LDSBuffer: 0 + 
ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AdaptiveGemm: 0 + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1DivByMT1LowbitGT1: 0 + AssertFree1ElementMultiple: 1 + AssertKRingShiftTailWrapOnly: 0 + AssertSummationElementMultiple: 256 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BAddrInterleave: false + BaseName: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT64yR3-UwlktRfU59ACgTfmpcg_Yk6f2Lb1knA4QU-BsnY= + BufferLoad: true + BufferStore: 1 + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 256 + DirectToLds: 1 + DirectToLdsA: true + DirectToLdsB: true + DirectToLdsMXSA: true + DirectToLdsMXSB: true + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprMXSA: false + DirectToVgprMXSB: false + DirectToVgprSparseMetadata: false + DtlPlusLdsBuf: 0 + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: false + ExpertSchedulingMode: 0 + ExtraLatencyForLR: 0 + ExtraMiLatencyLeft: -1 + ForceDisableShadowInit: false + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 32 + GlobalReadVectorWidthB: 32 + GlobalReadVectorWidthMXSA: 2 + GlobalReadVectorWidthMXSB: 4 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + KRingShift: false + Kernel: true + KernelLanguage: Assembly + KernelNameMin: 
Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT64x128x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT2_4_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 256 + LSCB: 256 + LSCMXSA: 8 + LSCMXSB: 8 + LSPA: 32 + LSPB: 32 + LSPMXSA: 64 + LSPMXSB: 128 + LVCA: 8 + LVCB: 8 + LVCMXSA: 4 + LVCMXSB: 2 + LVPA: 1 + LVPB: 1 + LVPMXSA: 32 + LVPMXSB: 32 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsBlockSizePerPadMXSA: 0 + LdsBlockSizePerPadMXSB: 0 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 58880 + LdsInitCVgprs: false + LdsNumBytes: 58880 + LdsNumElementsAlignedA: 8192 + LdsNumElementsAlignedB: 16384 + LdsNumElementsAlignedMXSA: 512 + LdsNumElementsAlignedMXSB: 1024 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 32768 + LdsOffsetB: 9728 + LdsOffsetB_Blk: 42496 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMXSA: 8192 + LdsOffsetMXSA_Blk: 40960 + LdsOffsetMXSB: 8704 + LdsOffsetMXSB_Blk: 41472 + LdsOffsetMetadata: 9728 + LdsOffsetMetadata_Blk: 42496 + LdsPadA: 0 + LdsPadB: 0 + LdsPadMXSA: 0 + LdsPadMXSB: 0 + LdsPadMetadata: 0 + LocalReadVectorWidth: -1 + LocalReadVectorWidthA: 32 + LocalReadVectorWidthB: 32 + LocalReadVectorWidthMXSA: 1 + LocalReadVectorWidthMXSB: 1 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LocalWriteUseSgprMXSA: true + LocalWriteUseSgprMXSB: true + LoopIters: 2 + LoopUnroll: 256 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 
32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMXSA: 1 + MIInputPerThreadMXSB: 1 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [2, 4] + MIWaveTileA: 2 + MIWaveTileB: 4 + MIWaveTileMXSA: 2 + MIWaveTileMXSB: 4 + MIWaveTileMetadata: 0 + MacroTile0: 64 + MacroTile1: 128 + MacroTileA: 64 + MacroTileB: 128 + MacroTileMXSA: 64 + MacroTileMXSB: 128 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + MinGRIncPerMfma: 3 + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonDTLTailLoopMXSA: true + NonDTLTailLoopMXSB: true + NonTemporal: -1 + NonTemporalA: 4 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 4 + NonTemporalE: 0 + NonTemporalMXSA: 0 + NonTemporalMXSB: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 32 + NumLdsBlk: 2 + NumLoadsA: 2 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsCoalescedMXSA: 1 + NumLoadsCoalescedMXSB: 1 + NumLoadsMXSA: 1 + NumLoadsMXSB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 4 + NumLoadsPerpendicularMXSA: 1 + NumLoadsPerpendicularMXSB: 1 + NumThreads: 256 + NumTotalPackedLoadsA: -1 + NumTotalPackedLoadsB: -1 + NumTotalPackedLoadsMXSA: -1 + NumTotalPackedLoadsMXSB: -1 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGROverBarrier: 0 + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 4 + SolutionNameMin: 
Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT64x128x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT2_4_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM16_WGMXCC2_WGMXCCGn1 + SourceSwap: false + SpaceFillingAlgo: [] + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 4 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SubGroupMXSA: 8 + SubGroupMXSB: 32 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: 0 + TailloopInNll: false + ThreadTile: [1, 1] + ThreadTile0: 8 + ThreadTile1: 4 + ThreadTileA: 8 + ThreadTileB: 4 + ThreadTileMXSA: 8 + ThreadTileMXSB: 4 + TransposeLDS: 1 + TransposeLDSMetadata: -1 + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMXSA: true + UnrollMajorLDSMXSB: true + Use64bShadowLimit: false + Use64bShadowLimitMX: false + UseCustomMainLoopSchedule: 0 + UseDirect32XEmulation: false + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: false + UseGeneralizedNLCOneA: false + UseGeneralizedNLCOneB: false + UseGeneralizedNLCOneMXSA: false + UseGeneralizedNLCOneMXSB: false + UseGeneralizedNLCOneMetadata: false + UseInstOffsetForGRO: 0 + UseMFMAF32XEmulation: false + UsePLRPack: 0 + UseSgprForGRO: -1 + UseSubtileImpl: true + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 1 + 
VectorWidthMXSA: 1 + VectorWidthMXSB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMXSA: 0 + WaveSeparateGlobalReadMXSB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 16 + WorkGroupMappingXCC: 2 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 256 + _DepthUA: 256 + _DepthUB: 256 + _DepthUMXSA: 8 + _DepthUMXSB: 8 + _DepthUMetadata: 256 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + enableLDSTrMXSA: false + enableLDSTrMXSB: false + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + tailLoopOptMXSA: false + tailLoopOptMXSB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AdaptiveGemm: 0 + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1DivByMT1LowbitGT1: 0 + AssertFree1ElementMultiple: 1 + AssertKRingShiftTailWrapOnly: 0 + AssertSummationElementMultiple: 256 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BAddrInterleave: false + BaseName: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT12ZclQEHdRRO1sBJL3TwidM33yIt-WYnDtDgY9P9JYRX4= + BufferLoad: true + BufferStore: 1 + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 256 + DirectToLds: 1 + DirectToLdsA: true + DirectToLdsB: true + DirectToLdsMXSA: true + DirectToLdsMXSB: true + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprMXSA: false + DirectToVgprMXSB: false + DirectToVgprSparseMetadata: false + DtlPlusLdsBuf: 0 
+ EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: false + ExpertSchedulingMode: 0 + ExtraLatencyForLR: 0 + ExtraMiLatencyLeft: -1 + ForceDisableShadowInit: false + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 32 + GlobalReadVectorWidthB: 32 + GlobalReadVectorWidthMXSA: 4 + GlobalReadVectorWidthMXSB: 2 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + KRingShift: false + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT128x64x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT4_2_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 256 + LSCB: 256 + LSCMXSA: 8 + LSCMXSB: 8 + LSPA: 32 + LSPB: 32 + LSPMXSA: 128 + LSPMXSB: 64 + LVCA: 8 + LVCB: 8 + LVCMXSA: 2 + LVCMXSB: 4 + LVPA: 1 + LVPB: 1 + LVPMXSA: 32 + LVPMXSB: 32 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsBlockSizePerPadMXSA: 0 + LdsBlockSizePerPadMXSB: 0 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 58880 + LdsInitCVgprs: false + LdsNumBytes: 58880 + LdsNumElementsAlignedA: 16384 + 
LdsNumElementsAlignedB: 8192 + LdsNumElementsAlignedMXSA: 1024 + LdsNumElementsAlignedMXSB: 512 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 32768 + LdsOffsetB: 17920 + LdsOffsetB_Blk: 50688 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMXSA: 16384 + LdsOffsetMXSA_Blk: 49152 + LdsOffsetMXSB: 17408 + LdsOffsetMXSB_Blk: 50176 + LdsOffsetMetadata: 17920 + LdsOffsetMetadata_Blk: 50688 + LdsPadA: 0 + LdsPadB: 0 + LdsPadMXSA: 0 + LdsPadMXSB: 0 + LdsPadMetadata: 0 + LocalReadVectorWidth: -1 + LocalReadVectorWidthA: 32 + LocalReadVectorWidthB: 32 + LocalReadVectorWidthMXSA: 1 + LocalReadVectorWidthMXSB: 1 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LocalWriteUseSgprMXSA: true + LocalWriteUseSgprMXSB: true + LoopIters: 2 + LoopUnroll: 256 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMXSA: 1 + MIInputPerThreadMXSB: 1 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 2] + MIWaveTileA: 4 + MIWaveTileB: 2 + MIWaveTileMXSA: 4 + MIWaveTileMXSB: 2 + MIWaveTileMetadata: 0 + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileMXSA: 128 + MacroTileMXSB: 64 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + MinGRIncPerMfma: 3 + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonDTLTailLoopMXSA: true + NonDTLTailLoopMXSB: true + NonTemporal: -1 + NonTemporalA: 4 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 4 + NonTemporalE: 0 + NonTemporalMXSA: 0 + 
NonTemporalMXSB: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 32 + NumLdsBlk: 2 + NumLoadsA: 4 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsCoalescedMXSA: 1 + NumLoadsCoalescedMXSB: 1 + NumLoadsMXSA: 1 + NumLoadsMXSB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularMXSA: 1 + NumLoadsPerpendicularMXSB: 1 + NumThreads: 256 + NumTotalPackedLoadsA: -1 + NumTotalPackedLoadsB: -1 + NumTotalPackedLoadsMXSA: -1 + NumTotalPackedLoadsMXSB: -1 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGROverBarrier: 0 + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 5 + SolutionNameMin: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT128x64x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT4_2_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM16_WGMXCC2_WGMXCCGn1 + SourceSwap: false + SpaceFillingAlgo: [] + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 4 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SubGroupMXSA: 8 + SubGroupMXSB: 32 + 
SuppressNoLoadLoop: false + SwapGlobalReadOrder: 0 + TailloopInNll: false + ThreadTile: [1, 1] + ThreadTile0: 16 + ThreadTile1: 2 + ThreadTileA: 16 + ThreadTileB: 2 + ThreadTileMXSA: 16 + ThreadTileMXSB: 2 + TransposeLDS: 1 + TransposeLDSMetadata: -1 + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMXSA: true + UnrollMajorLDSMXSB: true + Use64bShadowLimit: false + Use64bShadowLimitMX: false + UseCustomMainLoopSchedule: 0 + UseDirect32XEmulation: false + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: false + UseGeneralizedNLCOneA: false + UseGeneralizedNLCOneB: false + UseGeneralizedNLCOneMXSA: false + UseGeneralizedNLCOneMXSB: false + UseGeneralizedNLCOneMetadata: false + UseInstOffsetForGRO: 0 + UseMFMAF32XEmulation: false + UsePLRPack: 0 + UseSgprForGRO: -1 + UseSubtileImpl: true + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 1 + VectorWidthMXSA: 1 + VectorWidthMXSB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMXSA: 0 + WaveSeparateGlobalReadMXSB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 16 + WorkGroupMappingXCC: 2 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 256 + _DepthUA: 256 + _DepthUB: 256 + _DepthUMXSA: 8 + _DepthUMXSB: 8 + _DepthUMetadata: 256 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + enableLDSTrMXSA: false + enableLDSTrMXSB: false + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + tailLoopOptMXSA: false + tailLoopOptMXSB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: false 
+ ActivationFused: true + AdaptiveGemm: 0 + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1DivByMT1LowbitGT1: 0 + AssertFree1ElementMultiple: 1 + AssertKRingShiftTailWrapOnly: 0 + AssertSummationElementMultiple: 256 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BAddrInterleave: false + BaseName: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT64hJRTC6QzyytWkHaKB_tyuSMxvHfrHLHmDLGhNVVcILE= + BufferLoad: true + BufferStore: 1 + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 256 + DirectToLds: 1 + DirectToLdsA: true + DirectToLdsB: true + DirectToLdsMXSA: true + DirectToLdsMXSB: true + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprMXSA: false + DirectToVgprMXSB: false + DirectToVgprSparseMetadata: false + DtlPlusLdsBuf: 0 + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: false + ExpertSchedulingMode: 0 + ExtraLatencyForLR: 0 + ExtraMiLatencyLeft: -1 + ForceDisableShadowInit: false + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 32 + GlobalReadVectorWidthB: 32 + GlobalReadVectorWidthMXSA: 2 + GlobalReadVectorWidthMXSB: 6 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + KRingShift: false + Kernel: true + KernelLanguage: Assembly + KernelNameMin: 
Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT64x192x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT2_6_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 256 + LSCB: 256 + LSCMXSA: 8 + LSCMXSB: 8 + LSPA: 32 + LSPB: 32 + LSPMXSA: 64 + LSPMXSB: 192 + LVCA: 8 + LVCB: 8 + LVCMXSA: 4 + LVCMXSB: 2 + LVPA: 1 + LVPB: 1 + LVPMXSA: 32 + LVPMXSB: 32 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsBlockSizePerPadMXSA: 0 + LdsBlockSizePerPadMXSB: 0 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 100352 + LdsInitCVgprs: false + LdsNumBytes: 100352 + LdsNumElementsAlignedA: 8192 + LdsNumElementsAlignedB: 24576 + LdsNumElementsAlignedMXSA: 512 + LdsNumElementsAlignedMXSB: 1536 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 10240 + LdsOffsetB_Blk: 75776 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMXSA: 8192 + LdsOffsetMXSA_Blk: 73728 + LdsOffsetMXSB: 8704 + LdsOffsetMXSB_Blk: 74240 + LdsOffsetMetadata: 10240 + LdsOffsetMetadata_Blk: 75776 + LdsPadA: 0 + LdsPadB: 0 + LdsPadMXSA: 0 + LdsPadMXSB: 0 + LdsPadMetadata: 0 + LocalReadVectorWidth: -1 + LocalReadVectorWidthA: 32 + LocalReadVectorWidthB: 32 + LocalReadVectorWidthMXSA: 1 + LocalReadVectorWidthMXSB: 1 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LocalWriteUseSgprMXSA: true + LocalWriteUseSgprMXSB: true + LoopIters: 2 + LoopUnroll: 256 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + 
MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMXSA: 1 + MIInputPerThreadMXSB: 1 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [2, 6] + MIWaveTileA: 2 + MIWaveTileB: 6 + MIWaveTileMXSA: 2 + MIWaveTileMXSB: 6 + MIWaveTileMetadata: 0 + MacroTile0: 64 + MacroTile1: 192 + MacroTileA: 64 + MacroTileB: 192 + MacroTileMXSA: 64 + MacroTileMXSB: 192 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + MinGRIncPerMfma: 3 + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonDTLTailLoopMXSA: true + NonDTLTailLoopMXSB: true + NonTemporal: -1 + NonTemporalA: 4 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 4 + NonTemporalE: 0 + NonTemporalMXSA: 0 + NonTemporalMXSB: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 48 + NumGlobalWriteVectorsPerThread: 48 + NumLdsBlk: 2 + NumLoadsA: 2 + NumLoadsB: 6 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsCoalescedMXSA: 1 + NumLoadsCoalescedMXSB: 1 + NumLoadsMXSA: 1 + NumLoadsMXSB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 6 + NumLoadsPerpendicularMXSA: 1 + NumLoadsPerpendicularMXSB: 1 + NumThreads: 256 + NumTotalPackedLoadsA: -1 + NumTotalPackedLoadsB: -1 + NumTotalPackedLoadsMXSA: -1 + NumTotalPackedLoadsMXSB: -1 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGROverBarrier: 0 + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 6 + SolutionNameMin: 
Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT64x192x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT2_6_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM16_WGMXCC2_WGMXCCGn1 + SourceSwap: false + SpaceFillingAlgo: [] + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 4 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SubGroupMXSA: 8 + SubGroupMXSB: 32 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: 0 + TailloopInNll: false + ThreadTile: [1, 1] + ThreadTile0: 8 + ThreadTile1: 6 + ThreadTileA: 8 + ThreadTileB: 6 + ThreadTileMXSA: 8 + ThreadTileMXSB: 6 + TransposeLDS: 1 + TransposeLDSMetadata: -1 + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMXSA: true + UnrollMajorLDSMXSB: true + Use64bShadowLimit: false + Use64bShadowLimitMX: false + UseCustomMainLoopSchedule: 0 + UseDirect32XEmulation: false + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: false + UseGeneralizedNLCOneA: false + UseGeneralizedNLCOneB: false + UseGeneralizedNLCOneMXSA: false + UseGeneralizedNLCOneMXSB: false + UseGeneralizedNLCOneMetadata: false + UseInstOffsetForGRO: 0 + UseMFMAF32XEmulation: false + UsePLRPack: 0 + UseSgprForGRO: -1 + UseSubtileImpl: true + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 1 + 
VectorWidthMXSA: 1 + VectorWidthMXSB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMXSA: 0 + WaveSeparateGlobalReadMXSB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 16 + WorkGroupMappingXCC: 2 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 256 + _DepthUA: 256 + _DepthUB: 256 + _DepthUMXSA: 8 + _DepthUMXSB: 8 + _DepthUMetadata: 256 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + enableLDSTrMXSA: false + enableLDSTrMXSB: false + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + tailLoopOptMXSA: false + tailLoopOptMXSB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AdaptiveGemm: 0 + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1DivByMT1LowbitGT1: 0 + AssertFree1ElementMultiple: 1 + AssertKRingShiftTailWrapOnly: 0 + AssertSummationElementMultiple: 256 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BAddrInterleave: false + BaseName: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT19UCUD2vak2YLHvH_Ztg5zXPDJUlX3NzBJWRuGhvKVzu8= + BufferLoad: true + BufferStore: 1 + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 256 + DirectToLds: 1 + DirectToLdsA: true + DirectToLdsB: true + DirectToLdsMXSA: true + DirectToLdsMXSB: true + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprMXSA: false + DirectToVgprMXSB: false + DirectToVgprSparseMetadata: false + DtlPlusLdsBuf: 0 
+ EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: false + ExpertSchedulingMode: 0 + ExtraLatencyForLR: 0 + ExtraMiLatencyLeft: -1 + ForceDisableShadowInit: false + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 32 + GlobalReadVectorWidthB: 32 + GlobalReadVectorWidthMXSA: 6 + GlobalReadVectorWidthMXSB: 2 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + KRingShift: false + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT192x64x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT6_2_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 256 + LSCB: 256 + LSCMXSA: 8 + LSCMXSB: 8 + LSPA: 32 + LSPB: 32 + LSPMXSA: 192 + LSPMXSB: 64 + LVCA: 8 + LVCB: 8 + LVCMXSA: 2 + LVCMXSB: 4 + LVPA: 1 + LVPB: 1 + LVPMXSA: 32 + LVPMXSB: 32 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsBlockSizePerPadMXSA: 0 + LdsBlockSizePerPadMXSB: 0 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 100352 + LdsInitCVgprs: false + LdsNumBytes: 100352 + LdsNumElementsAlignedA: 24576 + 
LdsNumElementsAlignedB: 8192 + LdsNumElementsAlignedMXSA: 1536 + LdsNumElementsAlignedMXSB: 512 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 26624 + LdsOffsetB_Blk: 92160 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMXSA: 24576 + LdsOffsetMXSA_Blk: 90112 + LdsOffsetMXSB: 26112 + LdsOffsetMXSB_Blk: 91648 + LdsOffsetMetadata: 26624 + LdsOffsetMetadata_Blk: 92160 + LdsPadA: 0 + LdsPadB: 0 + LdsPadMXSA: 0 + LdsPadMXSB: 0 + LdsPadMetadata: 0 + LocalReadVectorWidth: -1 + LocalReadVectorWidthA: 32 + LocalReadVectorWidthB: 32 + LocalReadVectorWidthMXSA: 1 + LocalReadVectorWidthMXSB: 1 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LocalWriteUseSgprMXSA: true + LocalWriteUseSgprMXSB: true + LoopIters: 2 + LoopUnroll: 256 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMXSA: 1 + MIInputPerThreadMXSB: 1 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [6, 2] + MIWaveTileA: 6 + MIWaveTileB: 2 + MIWaveTileMXSA: 6 + MIWaveTileMXSB: 2 + MIWaveTileMetadata: 0 + MacroTile0: 192 + MacroTile1: 64 + MacroTileA: 192 + MacroTileB: 64 + MacroTileMXSA: 192 + MacroTileMXSB: 64 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + MinGRIncPerMfma: 3 + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonDTLTailLoopMXSA: true + NonDTLTailLoopMXSB: true + NonTemporal: -1 + NonTemporalA: 4 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 4 + NonTemporalE: 0 + NonTemporalMXSA: 0 + 
NonTemporalMXSB: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 48 + NumGlobalWriteVectorsPerThread: 48 + NumLdsBlk: 2 + NumLoadsA: 6 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsCoalescedMXSA: 1 + NumLoadsCoalescedMXSB: 1 + NumLoadsMXSA: 1 + NumLoadsMXSB: 1 + NumLoadsPerpendicularA: 6 + NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularMXSA: 1 + NumLoadsPerpendicularMXSB: 1 + NumThreads: 256 + NumTotalPackedLoadsA: -1 + NumTotalPackedLoadsB: -1 + NumTotalPackedLoadsMXSA: -1 + NumTotalPackedLoadsMXSB: -1 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGROverBarrier: 0 + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 7 + SolutionNameMin: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT192x64x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT6_2_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM16_WGMXCC2_WGMXCCGn1 + SourceSwap: false + SpaceFillingAlgo: [] + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 4 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SubGroupMXSA: 8 + SubGroupMXSB: 32 + 
SuppressNoLoadLoop: false + SwapGlobalReadOrder: 0 + TailloopInNll: false + ThreadTile: [1, 1] + ThreadTile0: 24 + ThreadTile1: 2 + ThreadTileA: 24 + ThreadTileB: 2 + ThreadTileMXSA: 24 + ThreadTileMXSB: 2 + TransposeLDS: 1 + TransposeLDSMetadata: -1 + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMXSA: true + UnrollMajorLDSMXSB: true + Use64bShadowLimit: false + Use64bShadowLimitMX: false + UseCustomMainLoopSchedule: 0 + UseDirect32XEmulation: false + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: false + UseGeneralizedNLCOneA: false + UseGeneralizedNLCOneB: false + UseGeneralizedNLCOneMXSA: false + UseGeneralizedNLCOneMXSB: false + UseGeneralizedNLCOneMetadata: false + UseInstOffsetForGRO: 0 + UseMFMAF32XEmulation: false + UsePLRPack: 0 + UseSgprForGRO: -1 + UseSubtileImpl: true + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 1 + VectorWidthMXSA: 1 + VectorWidthMXSB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMXSA: 0 + WaveSeparateGlobalReadMXSB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 16 + WorkGroupMappingXCC: 2 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 256 + _DepthUA: 256 + _DepthUB: 256 + _DepthUMXSA: 8 + _DepthUMXSB: 8 + _DepthUMetadata: 256 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + enableLDSTrMXSA: false + enableLDSTrMXSB: false + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + tailLoopOptMXSA: false + tailLoopOptMXSB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: false 
+ ActivationFused: true + AdaptiveGemm: 0 + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1DivByMT1LowbitGT1: 0 + AssertFree1ElementMultiple: 1 + AssertKRingShiftTailWrapOnly: 0 + AssertSummationElementMultiple: 256 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BAddrInterleave: false + BaseName: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT64sCK37FdA-O3yvoTtVtg3QFjEpjbAv0RNmVuI_Ry968A= + BufferLoad: true + BufferStore: 1 + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 256 + DirectToLds: 1 + DirectToLdsA: true + DirectToLdsB: true + DirectToLdsMXSA: true + DirectToLdsMXSB: true + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprMXSA: false + DirectToVgprMXSB: false + DirectToVgprSparseMetadata: false + DtlPlusLdsBuf: 0 + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: false + ExpertSchedulingMode: 0 + ExtraLatencyForLR: 0 + ExtraMiLatencyLeft: -1 + ForceDisableShadowInit: false + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 32 + GlobalReadVectorWidthB: 32 + GlobalReadVectorWidthMXSA: 2 + GlobalReadVectorWidthMXSB: 4 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + KRingShift: false + Kernel: true + KernelLanguage: Assembly + KernelNameMin: 
Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT64x256x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT2_8_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 256 + LSCB: 256 + LSCMXSA: 8 + LSCMXSB: 8 + LSPA: 32 + LSPB: 32 + LSPMXSA: 64 + LSPMXSB: 128 + LVCA: 8 + LVCB: 8 + LVCMXSA: 4 + LVCMXSB: 2 + LVPA: 1 + LVPB: 1 + LVPMXSA: 32 + LVPMXSB: 32 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsBlockSizePerPadMXSA: 0 + LdsBlockSizePerPadMXSB: 0 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 109056 + LdsInitCVgprs: false + LdsNumBytes: 109056 + LdsNumElementsAlignedA: 8192 + LdsNumElementsAlignedB: 32768 + LdsNumElementsAlignedMXSA: 512 + LdsNumElementsAlignedMXSB: 2048 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 10752 + LdsOffsetB_Blk: 76288 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMXSA: 8192 + LdsOffsetMXSA_Blk: 73728 + LdsOffsetMXSB: 8704 + LdsOffsetMXSB_Blk: 74240 + LdsOffsetMetadata: 10752 + LdsOffsetMetadata_Blk: 76288 + LdsPadA: 0 + LdsPadB: 0 + LdsPadMXSA: 0 + LdsPadMXSB: 0 + LdsPadMetadata: 0 + LocalReadVectorWidth: -1 + LocalReadVectorWidthA: 32 + LocalReadVectorWidthB: 32 + LocalReadVectorWidthMXSA: 1 + LocalReadVectorWidthMXSB: 1 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LocalWriteUseSgprMXSA: true + LocalWriteUseSgprMXSB: true + LoopIters: 2 + LoopUnroll: 256 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + 
MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMXSA: 1 + MIInputPerThreadMXSB: 1 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [2, 8] + MIWaveTileA: 2 + MIWaveTileB: 8 + MIWaveTileMXSA: 2 + MIWaveTileMXSB: 8 + MIWaveTileMetadata: 0 + MacroTile0: 64 + MacroTile1: 256 + MacroTileA: 64 + MacroTileB: 256 + MacroTileMXSA: 64 + MacroTileMXSB: 256 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + MinGRIncPerMfma: 3 + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonDTLTailLoopMXSA: true + NonDTLTailLoopMXSB: true + NonTemporal: -1 + NonTemporalA: 4 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 4 + NonTemporalE: 0 + NonTemporalMXSA: 0 + NonTemporalMXSB: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 64 + NumLdsBlk: 2 + NumLoadsA: 2 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsCoalescedMXSA: 1 + NumLoadsCoalescedMXSB: 1 + NumLoadsMXSA: 1 + NumLoadsMXSB: 2 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 8 + NumLoadsPerpendicularMXSA: 1 + NumLoadsPerpendicularMXSB: 2 + NumThreads: 256 + NumTotalPackedLoadsA: -1 + NumTotalPackedLoadsB: -1 + NumTotalPackedLoadsMXSA: -1 + NumTotalPackedLoadsMXSB: -1 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGROverBarrier: 0 + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 8 + SolutionNameMin: 
Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT64x256x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT2_8_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM16_WGMXCC2_WGMXCCGn1 + SourceSwap: false + SpaceFillingAlgo: [] + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 4 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SubGroupMXSA: 8 + SubGroupMXSB: 32 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: 0 + TailloopInNll: false + ThreadTile: [1, 1] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + ThreadTileMXSA: 8 + ThreadTileMXSB: 8 + TransposeLDS: 1 + TransposeLDSMetadata: -1 + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMXSA: true + UnrollMajorLDSMXSB: true + Use64bShadowLimit: false + Use64bShadowLimitMX: false + UseCustomMainLoopSchedule: 0 + UseDirect32XEmulation: false + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: false + UseGeneralizedNLCOneA: false + UseGeneralizedNLCOneB: false + UseGeneralizedNLCOneMXSA: false + UseGeneralizedNLCOneMXSB: false + UseGeneralizedNLCOneMetadata: false + UseInstOffsetForGRO: 0 + UseMFMAF32XEmulation: false + UsePLRPack: 0 + UseSgprForGRO: -1 + UseSubtileImpl: true + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 1 + 
VectorWidthMXSA: 1 + VectorWidthMXSB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMXSA: 0 + WaveSeparateGlobalReadMXSB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 16 + WorkGroupMappingXCC: 2 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 256 + _DepthUA: 256 + _DepthUB: 256 + _DepthUMXSA: 8 + _DepthUMXSB: 8 + _DepthUMetadata: 256 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + enableLDSTrMXSA: false + enableLDSTrMXSB: false + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + tailLoopOptMXSA: false + tailLoopOptMXSB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AdaptiveGemm: 0 + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1DivByMT1LowbitGT1: 0 + AssertFree1ElementMultiple: 1 + AssertKRingShiftTailWrapOnly: 0 + AssertSummationElementMultiple: 256 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BAddrInterleave: false + BaseName: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT25Tmym63Cp1OVwmuklAuBYLzHtSt0oUezVbK-tmV8hqYs= + BufferLoad: true + BufferStore: 1 + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 256 + DirectToLds: 1 + DirectToLdsA: true + DirectToLdsB: true + DirectToLdsMXSA: true + DirectToLdsMXSB: true + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprMXSA: false + DirectToVgprMXSB: false + DirectToVgprSparseMetadata: false + DtlPlusLdsBuf: 0 
+ EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: false + ExpertSchedulingMode: 0 + ExtraLatencyForLR: 0 + ExtraMiLatencyLeft: -1 + ForceDisableShadowInit: false + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 32 + GlobalReadVectorWidthB: 32 + GlobalReadVectorWidthMXSA: 4 + GlobalReadVectorWidthMXSB: 2 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + KRingShift: false + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT256x64x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT8_2_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 256 + LSCB: 256 + LSCMXSA: 8 + LSCMXSB: 8 + LSPA: 32 + LSPB: 32 + LSPMXSA: 128 + LSPMXSB: 64 + LVCA: 8 + LVCB: 8 + LVCMXSA: 2 + LVCMXSB: 4 + LVPA: 1 + LVPB: 1 + LVPMXSA: 32 + LVPMXSB: 32 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsBlockSizePerPadMXSA: 0 + LdsBlockSizePerPadMXSB: 0 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 109056 + LdsInitCVgprs: false + LdsNumBytes: 109056 + LdsNumElementsAlignedA: 32768 + 
LdsNumElementsAlignedB: 8192 + LdsNumElementsAlignedMXSA: 2048 + LdsNumElementsAlignedMXSB: 512 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 35328 + LdsOffsetB_Blk: 100864 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMXSA: 32768 + LdsOffsetMXSA_Blk: 98304 + LdsOffsetMXSB: 34816 + LdsOffsetMXSB_Blk: 100352 + LdsOffsetMetadata: 35328 + LdsOffsetMetadata_Blk: 100864 + LdsPadA: 0 + LdsPadB: 0 + LdsPadMXSA: 0 + LdsPadMXSB: 0 + LdsPadMetadata: 0 + LocalReadVectorWidth: -1 + LocalReadVectorWidthA: 32 + LocalReadVectorWidthB: 32 + LocalReadVectorWidthMXSA: 1 + LocalReadVectorWidthMXSB: 1 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LocalWriteUseSgprMXSA: true + LocalWriteUseSgprMXSB: true + LoopIters: 2 + LoopUnroll: 256 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMXSA: 1 + MIInputPerThreadMXSB: 1 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [8, 2] + MIWaveTileA: 8 + MIWaveTileB: 2 + MIWaveTileMXSA: 8 + MIWaveTileMXSB: 2 + MIWaveTileMetadata: 0 + MacroTile0: 256 + MacroTile1: 64 + MacroTileA: 256 + MacroTileB: 64 + MacroTileMXSA: 256 + MacroTileMXSB: 64 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + MinGRIncPerMfma: 3 + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonDTLTailLoopMXSA: true + NonDTLTailLoopMXSB: true + NonTemporal: -1 + NonTemporalA: 4 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 4 + NonTemporalE: 0 + NonTemporalMXSA: 0 + 
NonTemporalMXSB: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 64 + NumLdsBlk: 2 + NumLoadsA: 8 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsCoalescedMXSA: 1 + NumLoadsCoalescedMXSB: 1 + NumLoadsMXSA: 2 + NumLoadsMXSB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularMXSA: 2 + NumLoadsPerpendicularMXSB: 1 + NumThreads: 256 + NumTotalPackedLoadsA: -1 + NumTotalPackedLoadsB: -1 + NumTotalPackedLoadsMXSA: -1 + NumTotalPackedLoadsMXSB: -1 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGROverBarrier: 0 + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 9 + SolutionNameMin: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT256x64x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT8_2_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM16_WGMXCC2_WGMXCCGn1 + SourceSwap: false + SpaceFillingAlgo: [] + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 4 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SubGroupMXSA: 8 + SubGroupMXSB: 32 + 
SuppressNoLoadLoop: false + SwapGlobalReadOrder: 0 + TailloopInNll: false + ThreadTile: [1, 1] + ThreadTile0: 32 + ThreadTile1: 2 + ThreadTileA: 32 + ThreadTileB: 2 + ThreadTileMXSA: 32 + ThreadTileMXSB: 2 + TransposeLDS: 1 + TransposeLDSMetadata: -1 + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMXSA: true + UnrollMajorLDSMXSB: true + Use64bShadowLimit: false + Use64bShadowLimitMX: false + UseCustomMainLoopSchedule: 0 + UseDirect32XEmulation: false + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: false + UseGeneralizedNLCOneA: false + UseGeneralizedNLCOneB: false + UseGeneralizedNLCOneMXSA: false + UseGeneralizedNLCOneMXSB: false + UseGeneralizedNLCOneMetadata: false + UseInstOffsetForGRO: 0 + UseMFMAF32XEmulation: false + UsePLRPack: 0 + UseSgprForGRO: -1 + UseSubtileImpl: true + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 1 + VectorWidthMXSA: 1 + VectorWidthMXSB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMXSA: 0 + WaveSeparateGlobalReadMXSB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 16 + WorkGroupMappingXCC: 2 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 256 + _DepthUA: 256 + _DepthUB: 256 + _DepthUMXSA: 8 + _DepthUMXSB: 8 + _DepthUMetadata: 256 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + enableLDSTrMXSA: false + enableLDSTrMXSB: false + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + tailLoopOptMXSA: false + tailLoopOptMXSB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: false 
+ ActivationFused: true + AdaptiveGemm: 0 + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1DivByMT1LowbitGT1: 0 + AssertFree1ElementMultiple: 1 + AssertKRingShiftTailWrapOnly: 0 + AssertSummationElementMultiple: 256 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BAddrInterleave: false + BaseName: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT12Qs31lTSyFaUbDqXQHGpUViUgkCHLEWWfNifVlN3xPcA= + BufferLoad: true + BufferStore: 1 + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 256 + DirectToLds: 1 + DirectToLdsA: true + DirectToLdsB: true + DirectToLdsMXSA: true + DirectToLdsMXSB: true + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprMXSA: false + DirectToVgprMXSB: false + DirectToVgprSparseMetadata: false + DtlPlusLdsBuf: 0 + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: false + ExpertSchedulingMode: 0 + ExtraLatencyForLR: 0 + ExtraMiLatencyLeft: -1 + ForceDisableShadowInit: false + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 32 + GlobalReadVectorWidthB: 32 + GlobalReadVectorWidthMXSA: 4 + GlobalReadVectorWidthMXSB: 6 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + KRingShift: false + Kernel: true + KernelLanguage: Assembly + KernelNameMin: 
Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT128x192x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT4_6_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 256 + LSCB: 256 + LSCMXSA: 8 + LSCMXSB: 8 + LSPA: 32 + LSPB: 32 + LSPMXSA: 128 + LSPMXSB: 192 + LVCA: 8 + LVCB: 8 + LVCMXSA: 2 + LVCMXSB: 2 + LVPA: 1 + LVPB: 1 + LVPMXSA: 32 + LVPMXSB: 32 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsBlockSizePerPadMXSA: 0 + LdsBlockSizePerPadMXSB: 0 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 109056 + LdsInitCVgprs: false + LdsNumBytes: 109056 + LdsNumElementsAlignedA: 16384 + LdsNumElementsAlignedB: 24576 + LdsNumElementsAlignedMXSA: 1024 + LdsNumElementsAlignedMXSB: 1536 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 18944 + LdsOffsetB_Blk: 84480 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMXSA: 16384 + LdsOffsetMXSA_Blk: 81920 + LdsOffsetMXSB: 17408 + LdsOffsetMXSB_Blk: 82944 + LdsOffsetMetadata: 18944 + LdsOffsetMetadata_Blk: 84480 + LdsPadA: 0 + LdsPadB: 0 + LdsPadMXSA: 0 + LdsPadMXSB: 0 + LdsPadMetadata: 0 + LocalReadVectorWidth: -1 + LocalReadVectorWidthA: 32 + LocalReadVectorWidthB: 32 + LocalReadVectorWidthMXSA: 1 + LocalReadVectorWidthMXSB: 1 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LocalWriteUseSgprMXSA: true + LocalWriteUseSgprMXSB: true + LoopIters: 2 + LoopUnroll: 256 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + 
MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMXSA: 1 + MIInputPerThreadMXSB: 1 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 6] + MIWaveTileA: 4 + MIWaveTileB: 6 + MIWaveTileMXSA: 4 + MIWaveTileMXSB: 6 + MIWaveTileMetadata: 0 + MacroTile0: 128 + MacroTile1: 192 + MacroTileA: 128 + MacroTileB: 192 + MacroTileMXSA: 128 + MacroTileMXSB: 192 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + MinGRIncPerMfma: 3 + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonDTLTailLoopMXSA: true + NonDTLTailLoopMXSB: true + NonTemporal: -1 + NonTemporalA: 4 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 4 + NonTemporalE: 0 + NonTemporalMXSA: 0 + NonTemporalMXSB: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 96 + NumGlobalWriteVectorsPerThread: 96 + NumLdsBlk: 2 + NumLoadsA: 4 + NumLoadsB: 6 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsCoalescedMXSA: 1 + NumLoadsCoalescedMXSB: 1 + NumLoadsMXSA: 1 + NumLoadsMXSB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 6 + NumLoadsPerpendicularMXSA: 1 + NumLoadsPerpendicularMXSB: 1 + NumThreads: 256 + NumTotalPackedLoadsA: -1 + NumTotalPackedLoadsB: -1 + NumTotalPackedLoadsMXSA: -1 + NumTotalPackedLoadsMXSB: -1 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGROverBarrier: 0 + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 10 + 
SolutionNameMin: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT128x192x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT4_6_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM16_WGMXCC2_WGMXCCGn1 + SourceSwap: false + SpaceFillingAlgo: [] + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 4 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SubGroupMXSA: 8 + SubGroupMXSB: 32 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: 0 + TailloopInNll: false + ThreadTile: [1, 1] + ThreadTile0: 16 + ThreadTile1: 6 + ThreadTileA: 16 + ThreadTileB: 6 + ThreadTileMXSA: 16 + ThreadTileMXSB: 6 + TransposeLDS: 1 + TransposeLDSMetadata: -1 + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMXSA: true + UnrollMajorLDSMXSB: true + Use64bShadowLimit: false + Use64bShadowLimitMX: false + UseCustomMainLoopSchedule: 0 + UseDirect32XEmulation: false + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: false + UseGeneralizedNLCOneA: false + UseGeneralizedNLCOneB: false + UseGeneralizedNLCOneMXSA: false + UseGeneralizedNLCOneMXSB: false + UseGeneralizedNLCOneMetadata: false + UseInstOffsetForGRO: 0 + UseMFMAF32XEmulation: false + UsePLRPack: 0 + UseSgprForGRO: -1 + UseSubtileImpl: true + Valid: true + VectorStore: -1 + VectorWidthA: 1 + 
VectorWidthB: 1 + VectorWidthMXSA: 1 + VectorWidthMXSB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMXSA: 0 + WaveSeparateGlobalReadMXSB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 16 + WorkGroupMappingXCC: 2 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 256 + _DepthUA: 256 + _DepthUB: 256 + _DepthUMXSA: 8 + _DepthUMXSB: 8 + _DepthUMetadata: 256 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + enableLDSTrMXSA: false + enableLDSTrMXSB: false + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + tailLoopOptMXSA: false + tailLoopOptMXSB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AdaptiveGemm: 0 + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1DivByMT1LowbitGT1: 0 + AssertFree1ElementMultiple: 1 + AssertKRingShiftTailWrapOnly: 0 + AssertSummationElementMultiple: 256 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BAddrInterleave: false + BaseName: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT191Q3CWl85SU-4agRfSbxDDRYJaTIDmYF_adG8G4wdfNo= + BufferLoad: true + BufferStore: 1 + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 256 + DirectToLds: 1 + DirectToLdsA: true + DirectToLdsB: true + DirectToLdsMXSA: true + DirectToLdsMXSB: true + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprMXSA: false + DirectToVgprMXSB: false + DirectToVgprSparseMetadata: false + 
DtlPlusLdsBuf: 0 + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: false + ExpertSchedulingMode: 0 + ExtraLatencyForLR: 0 + ExtraMiLatencyLeft: -1 + ForceDisableShadowInit: false + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 32 + GlobalReadVectorWidthB: 32 + GlobalReadVectorWidthMXSA: 6 + GlobalReadVectorWidthMXSB: 4 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + KRingShift: false + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT192x128x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT6_4_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 256 + LSCB: 256 + LSCMXSA: 8 + LSCMXSB: 8 + LSPA: 32 + LSPB: 32 + LSPMXSA: 192 + LSPMXSB: 128 + LVCA: 8 + LVCB: 8 + LVCMXSA: 2 + LVCMXSB: 2 + LVPA: 1 + LVPB: 1 + LVPMXSA: 32 + LVPMXSB: 32 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsBlockSizePerPadMXSA: 0 + LdsBlockSizePerPadMXSB: 0 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 109056 + LdsInitCVgprs: false + LdsNumBytes: 109056 + 
LdsNumElementsAlignedA: 24576 + LdsNumElementsAlignedB: 16384 + LdsNumElementsAlignedMXSA: 1536 + LdsNumElementsAlignedMXSB: 1024 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 27136 + LdsOffsetB_Blk: 92672 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMXSA: 24576 + LdsOffsetMXSA_Blk: 90112 + LdsOffsetMXSB: 26112 + LdsOffsetMXSB_Blk: 91648 + LdsOffsetMetadata: 27136 + LdsOffsetMetadata_Blk: 92672 + LdsPadA: 0 + LdsPadB: 0 + LdsPadMXSA: 0 + LdsPadMXSB: 0 + LdsPadMetadata: 0 + LocalReadVectorWidth: -1 + LocalReadVectorWidthA: 32 + LocalReadVectorWidthB: 32 + LocalReadVectorWidthMXSA: 1 + LocalReadVectorWidthMXSB: 1 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LocalWriteUseSgprMXSA: true + LocalWriteUseSgprMXSB: true + LoopIters: 2 + LoopUnroll: 256 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMXSA: 1 + MIInputPerThreadMXSB: 1 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [6, 4] + MIWaveTileA: 6 + MIWaveTileB: 4 + MIWaveTileMXSA: 6 + MIWaveTileMXSB: 4 + MIWaveTileMetadata: 0 + MacroTile0: 192 + MacroTile1: 128 + MacroTileA: 192 + MacroTileB: 128 + MacroTileMXSA: 192 + MacroTileMXSB: 128 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + MinGRIncPerMfma: 3 + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonDTLTailLoopMXSA: true + NonDTLTailLoopMXSB: true + NonTemporal: -1 + NonTemporalA: 4 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 4 + NonTemporalE: 
0 + NonTemporalMXSA: 0 + NonTemporalMXSB: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 96 + NumGlobalWriteVectorsPerThread: 96 + NumLdsBlk: 2 + NumLoadsA: 6 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsCoalescedMXSA: 1 + NumLoadsCoalescedMXSB: 1 + NumLoadsMXSA: 1 + NumLoadsMXSB: 1 + NumLoadsPerpendicularA: 6 + NumLoadsPerpendicularB: 4 + NumLoadsPerpendicularMXSA: 1 + NumLoadsPerpendicularMXSB: 1 + NumThreads: 256 + NumTotalPackedLoadsA: -1 + NumTotalPackedLoadsB: -1 + NumTotalPackedLoadsMXSA: -1 + NumTotalPackedLoadsMXSB: -1 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGROverBarrier: 0 + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 11 + SolutionNameMin: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT192x128x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT6_4_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM16_WGMXCC2_WGMXCCGn1 + SourceSwap: false + SpaceFillingAlgo: [] + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 4 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + 
SubGroupMXSA: 8 + SubGroupMXSB: 32 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: 0 + TailloopInNll: false + ThreadTile: [1, 1] + ThreadTile0: 24 + ThreadTile1: 4 + ThreadTileA: 24 + ThreadTileB: 4 + ThreadTileMXSA: 24 + ThreadTileMXSB: 4 + TransposeLDS: 1 + TransposeLDSMetadata: -1 + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMXSA: true + UnrollMajorLDSMXSB: true + Use64bShadowLimit: false + Use64bShadowLimitMX: false + UseCustomMainLoopSchedule: 0 + UseDirect32XEmulation: false + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: false + UseGeneralizedNLCOneA: false + UseGeneralizedNLCOneB: false + UseGeneralizedNLCOneMXSA: false + UseGeneralizedNLCOneMXSB: false + UseGeneralizedNLCOneMetadata: false + UseInstOffsetForGRO: 0 + UseMFMAF32XEmulation: false + UsePLRPack: 0 + UseSgprForGRO: -1 + UseSubtileImpl: true + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 1 + VectorWidthMXSA: 1 + VectorWidthMXSB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMXSA: 0 + WaveSeparateGlobalReadMXSB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 16 + WorkGroupMappingXCC: 2 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 256 + _DepthUA: 256 + _DepthUB: 256 + _DepthUMXSA: 8 + _DepthUMXSB: 8 + _DepthUMetadata: 256 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + enableLDSTrMXSA: false + enableLDSTrMXSB: false + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + tailLoopOptMXSA: false + tailLoopOptMXSB: false + - 1LDSBuffer: 0 + 
ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AdaptiveGemm: 0 + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1DivByMT1LowbitGT1: 0 + AssertFree1ElementMultiple: 1 + AssertKRingShiftTailWrapOnly: 0 + AssertSummationElementMultiple: 256 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BAddrInterleave: false + BaseName: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT12kwdCHpnp9d41ufLMEwnWbNNXbmlKzUrCttuc0YSlgSY= + BufferLoad: true + BufferStore: 1 + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 256 + DirectToLds: 1 + DirectToLdsA: true + DirectToLdsB: true + DirectToLdsMXSA: true + DirectToLdsMXSB: true + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprMXSA: false + DirectToVgprMXSB: false + DirectToVgprSparseMetadata: false + DtlPlusLdsBuf: 0 + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: false + ExpertSchedulingMode: 0 + ExtraLatencyForLR: 0 + ExtraMiLatencyLeft: -1 + ForceDisableShadowInit: false + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 32 + GlobalReadVectorWidthB: 32 + GlobalReadVectorWidthMXSA: 4 + GlobalReadVectorWidthMXSB: 4 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + KRingShift: false + Kernel: true + KernelLanguage: Assembly + KernelNameMin: 
Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT128x256x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT4_8_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 256 + LSCB: 256 + LSCMXSA: 8 + LSCMXSB: 8 + LSPA: 32 + LSPB: 32 + LSPMXSA: 128 + LSPMXSB: 128 + LVCA: 8 + LVCB: 8 + LVCMXSA: 2 + LVCMXSB: 2 + LVPA: 1 + LVPB: 1 + LVPMXSA: 32 + LVPMXSB: 32 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsBlockSizePerPadMXSA: 0 + LdsBlockSizePerPadMXSB: 0 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 117760 + LdsInitCVgprs: false + LdsNumBytes: 117760 + LdsNumElementsAlignedA: 16384 + LdsNumElementsAlignedB: 32768 + LdsNumElementsAlignedMXSA: 1024 + LdsNumElementsAlignedMXSB: 2048 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 19456 + LdsOffsetB_Blk: 84992 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMXSA: 16384 + LdsOffsetMXSA_Blk: 81920 + LdsOffsetMXSB: 17408 + LdsOffsetMXSB_Blk: 82944 + LdsOffsetMetadata: 19456 + LdsOffsetMetadata_Blk: 84992 + LdsPadA: 0 + LdsPadB: 0 + LdsPadMXSA: 0 + LdsPadMXSB: 0 + LdsPadMetadata: 0 + LocalReadVectorWidth: -1 + LocalReadVectorWidthA: 32 + LocalReadVectorWidthB: 32 + LocalReadVectorWidthMXSA: 1 + LocalReadVectorWidthMXSB: 1 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LocalWriteUseSgprMXSA: true + LocalWriteUseSgprMXSB: true + LoopIters: 2 + LoopUnroll: 256 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + 
MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMXSA: 1 + MIInputPerThreadMXSB: 1 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 8] + MIWaveTileA: 4 + MIWaveTileB: 8 + MIWaveTileMXSA: 4 + MIWaveTileMXSB: 8 + MIWaveTileMetadata: 0 + MacroTile0: 128 + MacroTile1: 256 + MacroTileA: 128 + MacroTileB: 256 + MacroTileMXSA: 128 + MacroTileMXSB: 256 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + MinGRIncPerMfma: 3 + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonDTLTailLoopMXSA: true + NonDTLTailLoopMXSB: true + NonTemporal: -1 + NonTemporalA: 4 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 4 + NonTemporalE: 0 + NonTemporalMXSA: 0 + NonTemporalMXSB: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 128 + NumLdsBlk: 2 + NumLoadsA: 4 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsCoalescedMXSA: 1 + NumLoadsCoalescedMXSB: 1 + NumLoadsMXSA: 1 + NumLoadsMXSB: 2 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 8 + NumLoadsPerpendicularMXSA: 1 + NumLoadsPerpendicularMXSB: 2 + NumThreads: 256 + NumTotalPackedLoadsA: -1 + NumTotalPackedLoadsB: -1 + NumTotalPackedLoadsMXSA: -1 + NumTotalPackedLoadsMXSB: -1 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGROverBarrier: 0 + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 12 + 
SolutionNameMin: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT128x256x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT4_8_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM16_WGMXCC2_WGMXCCGn1 + SourceSwap: false + SpaceFillingAlgo: [] + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 4 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SubGroupMXSA: 8 + SubGroupMXSB: 32 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: 0 + TailloopInNll: false + ThreadTile: [1, 1] + ThreadTile0: 16 + ThreadTile1: 8 + ThreadTileA: 16 + ThreadTileB: 8 + ThreadTileMXSA: 16 + ThreadTileMXSB: 8 + TransposeLDS: 1 + TransposeLDSMetadata: -1 + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMXSA: true + UnrollMajorLDSMXSB: true + Use64bShadowLimit: false + Use64bShadowLimitMX: false + UseCustomMainLoopSchedule: 0 + UseDirect32XEmulation: false + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: false + UseGeneralizedNLCOneA: false + UseGeneralizedNLCOneB: false + UseGeneralizedNLCOneMXSA: false + UseGeneralizedNLCOneMXSB: false + UseGeneralizedNLCOneMetadata: false + UseInstOffsetForGRO: 0 + UseMFMAF32XEmulation: false + UsePLRPack: 0 + UseSgprForGRO: -1 + UseSubtileImpl: true + Valid: true + VectorStore: -1 + VectorWidthA: 1 + 
VectorWidthB: 1 + VectorWidthMXSA: 1 + VectorWidthMXSB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMXSA: 0 + WaveSeparateGlobalReadMXSB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 16 + WorkGroupMappingXCC: 2 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 256 + _DepthUA: 256 + _DepthUB: 256 + _DepthUMXSA: 8 + _DepthUMXSB: 8 + _DepthUMetadata: 256 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + enableLDSTrMXSA: false + enableLDSTrMXSB: false + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + tailLoopOptMXSA: false + tailLoopOptMXSB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AdaptiveGemm: 0 + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1DivByMT1LowbitGT1: 0 + AssertFree1ElementMultiple: 1 + AssertKRingShiftTailWrapOnly: 0 + AssertSummationElementMultiple: 256 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BAddrInterleave: false + BaseName: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT25s2SqPqjqQTrutnl4Fjsck3utFxYLnrt2GxRGoMabbq0= + BufferLoad: true + BufferStore: 1 + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 256 + DirectToLds: 1 + DirectToLdsA: true + DirectToLdsB: true + DirectToLdsMXSA: true + DirectToLdsMXSB: true + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprMXSA: false + DirectToVgprMXSB: false + DirectToVgprSparseMetadata: false + 
DtlPlusLdsBuf: 0 + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: false + ExpertSchedulingMode: 0 + ExtraLatencyForLR: 0 + ExtraMiLatencyLeft: -1 + ForceDisableShadowInit: false + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 32 + GlobalReadVectorWidthB: 32 + GlobalReadVectorWidthMXSA: 4 + GlobalReadVectorWidthMXSB: 4 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + KRingShift: false + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT256x128x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT8_4_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 256 + LSCB: 256 + LSCMXSA: 8 + LSCMXSB: 8 + LSPA: 32 + LSPB: 32 + LSPMXSA: 128 + LSPMXSB: 128 + LVCA: 8 + LVCB: 8 + LVCMXSA: 2 + LVCMXSB: 2 + LVPA: 1 + LVPB: 1 + LVPMXSA: 32 + LVPMXSB: 32 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsBlockSizePerPadMXSA: 0 + LdsBlockSizePerPadMXSB: 0 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 117760 + LdsInitCVgprs: false + LdsNumBytes: 117760 + 
LdsNumElementsAlignedA: 32768 + LdsNumElementsAlignedB: 16384 + LdsNumElementsAlignedMXSA: 2048 + LdsNumElementsAlignedMXSB: 1024 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 35840 + LdsOffsetB_Blk: 101376 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMXSA: 32768 + LdsOffsetMXSA_Blk: 98304 + LdsOffsetMXSB: 34816 + LdsOffsetMXSB_Blk: 100352 + LdsOffsetMetadata: 35840 + LdsOffsetMetadata_Blk: 101376 + LdsPadA: 0 + LdsPadB: 0 + LdsPadMXSA: 0 + LdsPadMXSB: 0 + LdsPadMetadata: 0 + LocalReadVectorWidth: -1 + LocalReadVectorWidthA: 32 + LocalReadVectorWidthB: 32 + LocalReadVectorWidthMXSA: 1 + LocalReadVectorWidthMXSB: 1 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LocalWriteUseSgprMXSA: true + LocalWriteUseSgprMXSB: true + LoopIters: 2 + LoopUnroll: 256 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMXSA: 1 + MIInputPerThreadMXSB: 1 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [8, 4] + MIWaveTileA: 8 + MIWaveTileB: 4 + MIWaveTileMXSA: 8 + MIWaveTileMXSB: 4 + MIWaveTileMetadata: 0 + MacroTile0: 256 + MacroTile1: 128 + MacroTileA: 256 + MacroTileB: 128 + MacroTileMXSA: 256 + MacroTileMXSB: 128 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + MinGRIncPerMfma: 3 + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonDTLTailLoopMXSA: true + NonDTLTailLoopMXSB: true + NonTemporal: -1 + NonTemporalA: 4 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 4 + 
NonTemporalE: 0 + NonTemporalMXSA: 0 + NonTemporalMXSB: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 128 + NumLdsBlk: 2 + NumLoadsA: 8 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsCoalescedMXSA: 1 + NumLoadsCoalescedMXSB: 1 + NumLoadsMXSA: 2 + NumLoadsMXSB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 4 + NumLoadsPerpendicularMXSA: 2 + NumLoadsPerpendicularMXSB: 1 + NumThreads: 256 + NumTotalPackedLoadsA: -1 + NumTotalPackedLoadsB: -1 + NumTotalPackedLoadsMXSA: -1 + NumTotalPackedLoadsMXSB: -1 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGROverBarrier: 0 + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 13 + SolutionNameMin: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT256x128x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT8_4_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM16_WGMXCC2_WGMXCCGn1 + SourceSwap: false + SpaceFillingAlgo: [] + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 4 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 
32 + SubGroupMXSA: 8 + SubGroupMXSB: 32 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: 0 + TailloopInNll: false + ThreadTile: [1, 1] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + ThreadTileMXSA: 32 + ThreadTileMXSB: 4 + TransposeLDS: 1 + TransposeLDSMetadata: -1 + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMXSA: true + UnrollMajorLDSMXSB: true + Use64bShadowLimit: false + Use64bShadowLimitMX: false + UseCustomMainLoopSchedule: 0 + UseDirect32XEmulation: false + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: false + UseGeneralizedNLCOneA: false + UseGeneralizedNLCOneB: false + UseGeneralizedNLCOneMXSA: false + UseGeneralizedNLCOneMXSB: false + UseGeneralizedNLCOneMetadata: false + UseInstOffsetForGRO: 0 + UseMFMAF32XEmulation: false + UsePLRPack: 0 + UseSgprForGRO: -1 + UseSubtileImpl: true + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 1 + VectorWidthMXSA: 1 + VectorWidthMXSB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMXSA: 0 + WaveSeparateGlobalReadMXSB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 16 + WorkGroupMappingXCC: 2 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 256 + _DepthUA: 256 + _DepthUB: 256 + _DepthUMXSA: 8 + _DepthUMXSB: 8 + _DepthUMetadata: 256 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + enableLDSTrMXSA: false + enableLDSTrMXSB: false + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + tailLoopOptMXSA: false + tailLoopOptMXSB: false + - 1LDSBuffer: 0 + 
ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AdaptiveGemm: 0 + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1DivByMT1LowbitGT1: 0 + AssertFree1ElementMultiple: 1 + AssertKRingShiftTailWrapOnly: 0 + AssertSummationElementMultiple: 256 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BAddrInterleave: false + BaseName: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT19--ULatdGkaEyMxugPoO0m7KB92W8g0H0S_IE47lq-mE= + BufferLoad: true + BufferStore: 1 + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 256 + DirectToLds: 1 + DirectToLdsA: true + DirectToLdsB: true + DirectToLdsMXSA: true + DirectToLdsMXSB: true + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprMXSA: false + DirectToVgprMXSB: false + DirectToVgprSparseMetadata: false + DtlPlusLdsBuf: 0 + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: false + ExpertSchedulingMode: 0 + ExtraLatencyForLR: 0 + ExtraMiLatencyLeft: -1 + ForceDisableShadowInit: false + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 32 + GlobalReadVectorWidthB: 32 + GlobalReadVectorWidthMXSA: 6 + GlobalReadVectorWidthMXSB: 4 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + KRingShift: false + Kernel: true + KernelLanguage: Assembly + KernelNameMin: 
Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT192x256x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT6_8_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 256 + LSCB: 256 + LSCMXSA: 8 + LSCMXSB: 8 + LSPA: 32 + LSPB: 32 + LSPMXSA: 192 + LSPMXSB: 128 + LVCA: 8 + LVCB: 8 + LVCMXSA: 2 + LVCMXSB: 2 + LVPA: 1 + LVPB: 1 + LVPMXSA: 32 + LVPMXSB: 32 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsBlockSizePerPadMXSA: 0 + LdsBlockSizePerPadMXSB: 0 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 126464 + LdsInitCVgprs: false + LdsNumBytes: 126464 + LdsNumElementsAlignedA: 24576 + LdsNumElementsAlignedB: 32768 + LdsNumElementsAlignedMXSA: 1536 + LdsNumElementsAlignedMXSB: 2048 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 28160 + LdsOffsetB_Blk: 93696 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMXSA: 24576 + LdsOffsetMXSA_Blk: 90112 + LdsOffsetMXSB: 26112 + LdsOffsetMXSB_Blk: 91648 + LdsOffsetMetadata: 28160 + LdsOffsetMetadata_Blk: 93696 + LdsPadA: 0 + LdsPadB: 0 + LdsPadMXSA: 0 + LdsPadMXSB: 0 + LdsPadMetadata: 0 + LocalReadVectorWidth: -1 + LocalReadVectorWidthA: 32 + LocalReadVectorWidthB: 32 + LocalReadVectorWidthMXSA: 1 + LocalReadVectorWidthMXSB: 1 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LocalWriteUseSgprMXSA: true + LocalWriteUseSgprMXSB: true + LoopIters: 2 + LoopUnroll: 256 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + 
MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMXSA: 1 + MIInputPerThreadMXSB: 1 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [6, 8] + MIWaveTileA: 6 + MIWaveTileB: 8 + MIWaveTileMXSA: 6 + MIWaveTileMXSB: 8 + MIWaveTileMetadata: 0 + MacroTile0: 192 + MacroTile1: 256 + MacroTileA: 192 + MacroTileB: 256 + MacroTileMXSA: 192 + MacroTileMXSB: 256 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + MinGRIncPerMfma: 3 + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonDTLTailLoopMXSA: true + NonDTLTailLoopMXSB: true + NonTemporal: -1 + NonTemporalA: 4 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 4 + NonTemporalE: 0 + NonTemporalMXSA: 0 + NonTemporalMXSB: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 192 + NumGlobalWriteVectorsPerThread: 192 + NumLdsBlk: 2 + NumLoadsA: 6 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsCoalescedMXSA: 1 + NumLoadsCoalescedMXSB: 1 + NumLoadsMXSA: 1 + NumLoadsMXSB: 2 + NumLoadsPerpendicularA: 6 + NumLoadsPerpendicularB: 8 + NumLoadsPerpendicularMXSA: 1 + NumLoadsPerpendicularMXSB: 2 + NumThreads: 256 + NumTotalPackedLoadsA: -1 + NumTotalPackedLoadsB: -1 + NumTotalPackedLoadsMXSA: -1 + NumTotalPackedLoadsMXSB: -1 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGROverBarrier: 0 + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 14 + 
SolutionNameMin: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT192x256x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT6_8_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM16_WGMXCC2_WGMXCCGn1 + SourceSwap: false + SpaceFillingAlgo: [] + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 4 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SubGroupMXSA: 8 + SubGroupMXSB: 32 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: 0 + TailloopInNll: false + ThreadTile: [1, 1] + ThreadTile0: 24 + ThreadTile1: 8 + ThreadTileA: 24 + ThreadTileB: 8 + ThreadTileMXSA: 24 + ThreadTileMXSB: 8 + TransposeLDS: 1 + TransposeLDSMetadata: -1 + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMXSA: true + UnrollMajorLDSMXSB: true + Use64bShadowLimit: false + Use64bShadowLimitMX: false + UseCustomMainLoopSchedule: 0 + UseDirect32XEmulation: false + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: false + UseGeneralizedNLCOneA: false + UseGeneralizedNLCOneB: false + UseGeneralizedNLCOneMXSA: false + UseGeneralizedNLCOneMXSB: false + UseGeneralizedNLCOneMetadata: false + UseInstOffsetForGRO: 0 + UseMFMAF32XEmulation: false + UsePLRPack: 0 + UseSgprForGRO: -1 + UseSubtileImpl: true + Valid: true + VectorStore: -1 + VectorWidthA: 1 + 
VectorWidthB: 1 + VectorWidthMXSA: 1 + VectorWidthMXSB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMXSA: 0 + WaveSeparateGlobalReadMXSB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 16 + WorkGroupMappingXCC: 2 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 256 + _DepthUA: 256 + _DepthUB: 256 + _DepthUMXSA: 8 + _DepthUMXSB: 8 + _DepthUMetadata: 256 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + enableLDSTrMXSA: false + enableLDSTrMXSB: false + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + tailLoopOptMXSA: false + tailLoopOptMXSB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AdaptiveGemm: 0 + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1DivByMT1LowbitGT1: 0 + AssertFree1ElementMultiple: 1 + AssertKRingShiftTailWrapOnly: 0 + AssertSummationElementMultiple: 256 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BAddrInterleave: false + BaseName: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT25H-6yQcCGxSKwn5gmGJAHA93oK-ekIRJm5JqDOya5ASk= + BufferLoad: true + BufferStore: 1 + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 256 + DirectToLds: 1 + DirectToLdsA: true + DirectToLdsB: true + DirectToLdsMXSA: true + DirectToLdsMXSB: true + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprMXSA: false + DirectToVgprMXSB: false + DirectToVgprSparseMetadata: false + 
DtlPlusLdsBuf: 0 + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: false + ExpertSchedulingMode: 0 + ExtraLatencyForLR: 0 + ExtraMiLatencyLeft: -1 + ForceDisableShadowInit: false + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 32 + GlobalReadVectorWidthB: 32 + GlobalReadVectorWidthMXSA: 4 + GlobalReadVectorWidthMXSB: 6 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + KRingShift: false + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT256x192x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT8_6_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 256 + LSCB: 256 + LSCMXSA: 8 + LSCMXSB: 8 + LSPA: 32 + LSPB: 32 + LSPMXSA: 128 + LSPMXSB: 192 + LVCA: 8 + LVCB: 8 + LVCMXSA: 2 + LVCMXSB: 2 + LVPA: 1 + LVPB: 1 + LVPMXSA: 32 + LVPMXSB: 32 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsBlockSizePerPadMXSA: 0 + LdsBlockSizePerPadMXSB: 0 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 126464 + LdsInitCVgprs: false + LdsNumBytes: 126464 + 
LdsNumElementsAlignedA: 32768 + LdsNumElementsAlignedB: 24576 + LdsNumElementsAlignedMXSA: 2048 + LdsNumElementsAlignedMXSB: 1536 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 36352 + LdsOffsetB_Blk: 101888 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMXSA: 32768 + LdsOffsetMXSA_Blk: 98304 + LdsOffsetMXSB: 34816 + LdsOffsetMXSB_Blk: 100352 + LdsOffsetMetadata: 36352 + LdsOffsetMetadata_Blk: 101888 + LdsPadA: 0 + LdsPadB: 0 + LdsPadMXSA: 0 + LdsPadMXSB: 0 + LdsPadMetadata: 0 + LocalReadVectorWidth: -1 + LocalReadVectorWidthA: 32 + LocalReadVectorWidthB: 32 + LocalReadVectorWidthMXSA: 1 + LocalReadVectorWidthMXSB: 1 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LocalWriteUseSgprMXSA: true + LocalWriteUseSgprMXSB: true + LoopIters: 2 + LoopUnroll: 256 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMXSA: 1 + MIInputPerThreadMXSB: 1 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [8, 6] + MIWaveTileA: 8 + MIWaveTileB: 6 + MIWaveTileMXSA: 8 + MIWaveTileMXSB: 6 + MIWaveTileMetadata: 0 + MacroTile0: 256 + MacroTile1: 192 + MacroTileA: 256 + MacroTileB: 192 + MacroTileMXSA: 256 + MacroTileMXSB: 192 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + MinGRIncPerMfma: 3 + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonDTLTailLoopMXSA: true + NonDTLTailLoopMXSB: true + NonTemporal: -1 + NonTemporalA: 4 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 4 + 
NonTemporalE: 0 + NonTemporalMXSA: 0 + NonTemporalMXSB: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 192 + NumGlobalWriteVectorsPerThread: 192 + NumLdsBlk: 2 + NumLoadsA: 8 + NumLoadsB: 6 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsCoalescedMXSA: 1 + NumLoadsCoalescedMXSB: 1 + NumLoadsMXSA: 2 + NumLoadsMXSB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 6 + NumLoadsPerpendicularMXSA: 2 + NumLoadsPerpendicularMXSB: 1 + NumThreads: 256 + NumTotalPackedLoadsA: -1 + NumTotalPackedLoadsB: -1 + NumTotalPackedLoadsMXSA: -1 + NumTotalPackedLoadsMXSB: -1 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGROverBarrier: 0 + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 15 + SolutionNameMin: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT256x192x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT8_6_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM16_WGMXCC2_WGMXCCGn1 + SourceSwap: false + SpaceFillingAlgo: [] + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 4 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 
32 + SubGroupMXSA: 8 + SubGroupMXSB: 32 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: 0 + TailloopInNll: false + ThreadTile: [1, 1] + ThreadTile0: 32 + ThreadTile1: 6 + ThreadTileA: 32 + ThreadTileB: 6 + ThreadTileMXSA: 32 + ThreadTileMXSB: 6 + TransposeLDS: 1 + TransposeLDSMetadata: -1 + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMXSA: true + UnrollMajorLDSMXSB: true + Use64bShadowLimit: false + Use64bShadowLimitMX: false + UseCustomMainLoopSchedule: 0 + UseDirect32XEmulation: false + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: false + UseGeneralizedNLCOneA: false + UseGeneralizedNLCOneB: false + UseGeneralizedNLCOneMXSA: false + UseGeneralizedNLCOneMXSB: false + UseGeneralizedNLCOneMetadata: false + UseInstOffsetForGRO: 0 + UseMFMAF32XEmulation: false + UsePLRPack: 0 + UseSgprForGRO: -1 + UseSubtileImpl: true + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 1 + VectorWidthMXSA: 1 + VectorWidthMXSB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMXSA: 0 + WaveSeparateGlobalReadMXSB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 16 + WorkGroupMappingXCC: 2 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 256 + _DepthUA: 256 + _DepthUB: 256 + _DepthUMXSA: 8 + _DepthUMXSB: 8 + _DepthUMetadata: 256 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + enableLDSTrMXSA: false + enableLDSTrMXSB: false + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + tailLoopOptMXSA: false + tailLoopOptMXSB: false + - 1LDSBuffer: 0 + 
ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AdaptiveGemm: 0 + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1DivByMT1LowbitGT1: 0 + AssertFree1ElementMultiple: 1 + AssertKRingShiftTailWrapOnly: 0 + AssertSummationElementMultiple: 256 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BAddrInterleave: false + BaseName: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT649Xx58paO0aNamHRyhpgRWxjDG0kBNJ5sAnNiMy2USyI= + BufferLoad: true + BufferStore: 1 + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 256 + DirectToLds: 1 + DirectToLdsA: true + DirectToLdsB: true + DirectToLdsMXSA: true + DirectToLdsMXSB: true + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprMXSA: false + DirectToVgprMXSB: false + DirectToVgprSparseMetadata: false + DtlPlusLdsBuf: 0 + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: false + ExpertSchedulingMode: 0 + ExtraLatencyForLR: 0 + ExtraMiLatencyLeft: -1 + ForceDisableShadowInit: false + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 32 + GlobalReadVectorWidthB: 32 + GlobalReadVectorWidthMXSA: 2 + GlobalReadVectorWidthMXSB: 10 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + KRingShift: false + Kernel: true + KernelLanguage: Assembly + KernelNameMin: 
Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT64x320x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT2_10_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 256 + LSCB: 256 + LSCMXSA: 8 + LSCMXSB: 8 + LSPA: 32 + LSPB: 32 + LSPMXSA: 64 + LSPMXSB: 320 + LVCA: 8 + LVCB: 8 + LVCMXSA: 4 + LVCMXSB: 1 + LVPA: 1 + LVPB: 1 + LVPMXSA: 32 + LVPMXSB: 32 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsBlockSizePerPadMXSA: 0 + LdsBlockSizePerPadMXSB: 0 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 117760 + LdsInitCVgprs: false + LdsNumBytes: 117760 + LdsNumElementsAlignedA: 8192 + LdsNumElementsAlignedB: 40960 + LdsNumElementsAlignedMXSA: 512 + LdsNumElementsAlignedMXSB: 2560 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 11264 + LdsOffsetB_Blk: 76800 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMXSA: 8192 + LdsOffsetMXSA_Blk: 73728 + LdsOffsetMXSB: 8704 + LdsOffsetMXSB_Blk: 74240 + LdsOffsetMetadata: 11264 + LdsOffsetMetadata_Blk: 76800 + LdsPadA: 0 + LdsPadB: 0 + LdsPadMXSA: 0 + LdsPadMXSB: 0 + LdsPadMetadata: 0 + LocalReadVectorWidth: -1 + LocalReadVectorWidthA: 32 + LocalReadVectorWidthB: 32 + LocalReadVectorWidthMXSA: 1 + LocalReadVectorWidthMXSB: 1 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LocalWriteUseSgprMXSA: true + LocalWriteUseSgprMXSB: true + LoopIters: 2 + LoopUnroll: 256 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + 
MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMXSA: 1 + MIInputPerThreadMXSB: 1 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [2, 10] + MIWaveTileA: 2 + MIWaveTileB: 10 + MIWaveTileMXSA: 2 + MIWaveTileMXSB: 10 + MIWaveTileMetadata: 0 + MacroTile0: 64 + MacroTile1: 320 + MacroTileA: 64 + MacroTileB: 320 + MacroTileMXSA: 64 + MacroTileMXSB: 320 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + MinGRIncPerMfma: 3 + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonDTLTailLoopMXSA: true + NonDTLTailLoopMXSB: true + NonTemporal: -1 + NonTemporalA: 4 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 4 + NonTemporalE: 0 + NonTemporalMXSA: 0 + NonTemporalMXSB: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 80 + NumGlobalWriteVectorsPerThread: 80 + NumLdsBlk: 2 + NumLoadsA: 2 + NumLoadsB: 10 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsCoalescedMXSA: 1 + NumLoadsCoalescedMXSB: 1 + NumLoadsMXSA: 1 + NumLoadsMXSB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 10 + NumLoadsPerpendicularMXSA: 1 + NumLoadsPerpendicularMXSB: 1 + NumThreads: 256 + NumTotalPackedLoadsA: -1 + NumTotalPackedLoadsB: -1 + NumTotalPackedLoadsMXSA: -1 + NumTotalPackedLoadsMXSB: -1 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGROverBarrier: 0 + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 16 + 
SolutionNameMin: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT64x320x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT2_10_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM16_WGMXCC2_WGMXCCGn1 + SourceSwap: false + SpaceFillingAlgo: [] + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 4 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SubGroupMXSA: 8 + SubGroupMXSB: 32 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: 0 + TailloopInNll: false + ThreadTile: [1, 1] + ThreadTile0: 8 + ThreadTile1: 10 + ThreadTileA: 8 + ThreadTileB: 10 + ThreadTileMXSA: 8 + ThreadTileMXSB: 10 + TransposeLDS: 1 + TransposeLDSMetadata: -1 + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMXSA: true + UnrollMajorLDSMXSB: true + Use64bShadowLimit: false + Use64bShadowLimitMX: false + UseCustomMainLoopSchedule: 0 + UseDirect32XEmulation: false + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: false + UseGeneralizedNLCOneA: false + UseGeneralizedNLCOneB: false + UseGeneralizedNLCOneMXSA: false + UseGeneralizedNLCOneMXSB: false + UseGeneralizedNLCOneMetadata: false + UseInstOffsetForGRO: 0 + UseMFMAF32XEmulation: false + UsePLRPack: 0 + UseSgprForGRO: -1 + UseSubtileImpl: true + Valid: true + VectorStore: -1 + VectorWidthA: 1 + 
VectorWidthB: 1 + VectorWidthMXSA: 1 + VectorWidthMXSB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMXSA: 0 + WaveSeparateGlobalReadMXSB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 16 + WorkGroupMappingXCC: 2 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 256 + _DepthUA: 256 + _DepthUB: 256 + _DepthUMXSA: 8 + _DepthUMXSB: 8 + _DepthUMetadata: 256 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + enableLDSTrMXSA: false + enableLDSTrMXSB: false + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + tailLoopOptMXSA: false + tailLoopOptMXSB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AdaptiveGemm: 0 + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1DivByMT1LowbitGT1: 0 + AssertFree1ElementMultiple: 1 + AssertKRingShiftTailWrapOnly: 0 + AssertSummationElementMultiple: 256 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BAddrInterleave: false + BaseName: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT32yn2lSZhgGG39-O0tN9XB-c6Nr5kcbdLKPNrKxqSEhVE= + BufferLoad: true + BufferStore: 1 + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 256 + DirectToLds: 1 + DirectToLdsA: true + DirectToLdsB: true + DirectToLdsMXSA: true + DirectToLdsMXSB: true + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprMXSA: false + DirectToVgprMXSB: false + DirectToVgprSparseMetadata: false + 
DtlPlusLdsBuf: 0 + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: false + ExpertSchedulingMode: 0 + ExtraLatencyForLR: 0 + ExtraMiLatencyLeft: -1 + ForceDisableShadowInit: false + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 32 + GlobalReadVectorWidthB: 32 + GlobalReadVectorWidthMXSA: 10 + GlobalReadVectorWidthMXSB: 2 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + KRingShift: false + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT320x64x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT10_2_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 256 + LSCB: 256 + LSCMXSA: 8 + LSCMXSB: 8 + LSPA: 32 + LSPB: 32 + LSPMXSA: 320 + LSPMXSB: 64 + LVCA: 8 + LVCB: 8 + LVCMXSA: 1 + LVCMXSB: 4 + LVPA: 1 + LVPB: 1 + LVPMXSA: 32 + LVPMXSB: 32 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsBlockSizePerPadMXSA: 0 + LdsBlockSizePerPadMXSB: 0 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 117760 + LdsInitCVgprs: false + LdsNumBytes: 117760 + 
LdsNumElementsAlignedA: 40960 + LdsNumElementsAlignedB: 8192 + LdsNumElementsAlignedMXSA: 2560 + LdsNumElementsAlignedMXSB: 512 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 44032 + LdsOffsetB_Blk: 109568 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMXSA: 40960 + LdsOffsetMXSA_Blk: 106496 + LdsOffsetMXSB: 43520 + LdsOffsetMXSB_Blk: 109056 + LdsOffsetMetadata: 44032 + LdsOffsetMetadata_Blk: 109568 + LdsPadA: 0 + LdsPadB: 0 + LdsPadMXSA: 0 + LdsPadMXSB: 0 + LdsPadMetadata: 0 + LocalReadVectorWidth: -1 + LocalReadVectorWidthA: 32 + LocalReadVectorWidthB: 32 + LocalReadVectorWidthMXSA: 1 + LocalReadVectorWidthMXSB: 1 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LocalWriteUseSgprMXSA: true + LocalWriteUseSgprMXSB: true + LoopIters: 2 + LoopUnroll: 256 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMXSA: 1 + MIInputPerThreadMXSB: 1 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [10, 2] + MIWaveTileA: 10 + MIWaveTileB: 2 + MIWaveTileMXSA: 10 + MIWaveTileMXSB: 2 + MIWaveTileMetadata: 0 + MacroTile0: 320 + MacroTile1: 64 + MacroTileA: 320 + MacroTileB: 64 + MacroTileMXSA: 320 + MacroTileMXSB: 64 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + MinGRIncPerMfma: 3 + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonDTLTailLoopMXSA: true + NonDTLTailLoopMXSB: true + NonTemporal: -1 + NonTemporalA: 4 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 4 + 
NonTemporalE: 0 + NonTemporalMXSA: 0 + NonTemporalMXSB: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 80 + NumGlobalWriteVectorsPerThread: 80 + NumLdsBlk: 2 + NumLoadsA: 10 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsCoalescedMXSA: 1 + NumLoadsCoalescedMXSB: 1 + NumLoadsMXSA: 1 + NumLoadsMXSB: 1 + NumLoadsPerpendicularA: 10 + NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularMXSA: 1 + NumLoadsPerpendicularMXSB: 1 + NumThreads: 256 + NumTotalPackedLoadsA: -1 + NumTotalPackedLoadsB: -1 + NumTotalPackedLoadsMXSA: -1 + NumTotalPackedLoadsMXSB: -1 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGROverBarrier: 0 + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 17 + SolutionNameMin: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT320x64x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT10_2_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM16_WGMXCC2_WGMXCCGn1 + SourceSwap: false + SpaceFillingAlgo: [] + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 4 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 
32 + SubGroupMXSA: 8 + SubGroupMXSB: 32 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: 0 + TailloopInNll: false + ThreadTile: [1, 1] + ThreadTile0: 40 + ThreadTile1: 2 + ThreadTileA: 40 + ThreadTileB: 2 + ThreadTileMXSA: 40 + ThreadTileMXSB: 2 + TransposeLDS: 1 + TransposeLDSMetadata: -1 + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMXSA: true + UnrollMajorLDSMXSB: true + Use64bShadowLimit: false + Use64bShadowLimitMX: false + UseCustomMainLoopSchedule: 0 + UseDirect32XEmulation: false + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: false + UseGeneralizedNLCOneA: false + UseGeneralizedNLCOneB: false + UseGeneralizedNLCOneMXSA: false + UseGeneralizedNLCOneMXSB: false + UseGeneralizedNLCOneMetadata: false + UseInstOffsetForGRO: 0 + UseMFMAF32XEmulation: false + UsePLRPack: 0 + UseSgprForGRO: -1 + UseSubtileImpl: true + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 1 + VectorWidthMXSA: 1 + VectorWidthMXSB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMXSA: 0 + WaveSeparateGlobalReadMXSB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 16 + WorkGroupMappingXCC: 2 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 256 + _DepthUA: 256 + _DepthUB: 256 + _DepthUMXSA: 8 + _DepthUMXSB: 8 + _DepthUMetadata: 256 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + enableLDSTrMXSA: false + enableLDSTrMXSB: false + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + tailLoopOptMXSA: false + tailLoopOptMXSB: false + - 1LDSBuffer: 0 + 
ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AdaptiveGemm: 0 + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1DivByMT1LowbitGT1: 0 + AssertFree1ElementMultiple: 1 + AssertKRingShiftTailWrapOnly: 0 + AssertSummationElementMultiple: 256 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BAddrInterleave: false + BaseName: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT64cx8jpYzIAVvV24Gqg4hoX8QdixxbIDmUlvaleHLuajs= + BufferLoad: true + BufferStore: 1 + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 256 + DirectToLds: 1 + DirectToLdsA: true + DirectToLdsB: true + DirectToLdsMXSA: true + DirectToLdsMXSB: true + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprMXSA: false + DirectToVgprMXSB: false + DirectToVgprSparseMetadata: false + DtlPlusLdsBuf: 0 + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: false + ExpertSchedulingMode: 0 + ExtraLatencyForLR: 0 + ExtraMiLatencyLeft: -1 + ForceDisableShadowInit: false + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 32 + GlobalReadVectorWidthB: 32 + GlobalReadVectorWidthMXSA: 2 + GlobalReadVectorWidthMXSB: 12 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + KRingShift: false + Kernel: true + KernelLanguage: Assembly + KernelNameMin: 
Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT64x384x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT2_12_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 256 + LSCB: 256 + LSCMXSA: 8 + LSCMXSB: 8 + LSPA: 32 + LSPB: 32 + LSPMXSA: 64 + LSPMXSB: 384 + LVCA: 8 + LVCB: 8 + LVCMXSA: 4 + LVCMXSB: 1 + LVPA: 1 + LVPB: 1 + LVPMXSA: 32 + LVPMXSB: 32 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsBlockSizePerPadMXSA: 0 + LdsBlockSizePerPadMXSB: 0 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 126464 + LdsInitCVgprs: false + LdsNumBytes: 126464 + LdsNumElementsAlignedA: 8192 + LdsNumElementsAlignedB: 49152 + LdsNumElementsAlignedMXSA: 512 + LdsNumElementsAlignedMXSB: 3072 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 11776 + LdsOffsetB_Blk: 77312 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMXSA: 8192 + LdsOffsetMXSA_Blk: 73728 + LdsOffsetMXSB: 8704 + LdsOffsetMXSB_Blk: 74240 + LdsOffsetMetadata: 11776 + LdsOffsetMetadata_Blk: 77312 + LdsPadA: 0 + LdsPadB: 0 + LdsPadMXSA: 0 + LdsPadMXSB: 0 + LdsPadMetadata: 0 + LocalReadVectorWidth: -1 + LocalReadVectorWidthA: 32 + LocalReadVectorWidthB: 32 + LocalReadVectorWidthMXSA: 1 + LocalReadVectorWidthMXSB: 1 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LocalWriteUseSgprMXSA: true + LocalWriteUseSgprMXSB: true + LoopIters: 2 + LoopUnroll: 256 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + 
MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMXSA: 1 + MIInputPerThreadMXSB: 1 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [2, 12] + MIWaveTileA: 2 + MIWaveTileB: 12 + MIWaveTileMXSA: 2 + MIWaveTileMXSB: 12 + MIWaveTileMetadata: 0 + MacroTile0: 64 + MacroTile1: 384 + MacroTileA: 64 + MacroTileB: 384 + MacroTileMXSA: 64 + MacroTileMXSB: 384 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + MinGRIncPerMfma: 3 + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonDTLTailLoopMXSA: true + NonDTLTailLoopMXSB: true + NonTemporal: -1 + NonTemporalA: 4 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 4 + NonTemporalE: 0 + NonTemporalMXSA: 0 + NonTemporalMXSB: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 96 + NumGlobalWriteVectorsPerThread: 96 + NumLdsBlk: 2 + NumLoadsA: 2 + NumLoadsB: 12 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsCoalescedMXSA: 1 + NumLoadsCoalescedMXSB: 1 + NumLoadsMXSA: 1 + NumLoadsMXSB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 12 + NumLoadsPerpendicularMXSA: 1 + NumLoadsPerpendicularMXSB: 1 + NumThreads: 256 + NumTotalPackedLoadsA: -1 + NumTotalPackedLoadsB: -1 + NumTotalPackedLoadsMXSA: -1 + NumTotalPackedLoadsMXSB: -1 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGROverBarrier: 0 + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 18 + 
SolutionNameMin: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT64x384x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT2_12_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM16_WGMXCC2_WGMXCCGn1 + SourceSwap: false + SpaceFillingAlgo: [] + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 4 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SubGroupMXSA: 8 + SubGroupMXSB: 32 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: 0 + TailloopInNll: false + ThreadTile: [1, 1] + ThreadTile0: 8 + ThreadTile1: 12 + ThreadTileA: 8 + ThreadTileB: 12 + ThreadTileMXSA: 8 + ThreadTileMXSB: 12 + TransposeLDS: 1 + TransposeLDSMetadata: -1 + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMXSA: true + UnrollMajorLDSMXSB: true + Use64bShadowLimit: false + Use64bShadowLimitMX: false + UseCustomMainLoopSchedule: 0 + UseDirect32XEmulation: false + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: false + UseGeneralizedNLCOneA: false + UseGeneralizedNLCOneB: false + UseGeneralizedNLCOneMXSA: false + UseGeneralizedNLCOneMXSB: false + UseGeneralizedNLCOneMetadata: false + UseInstOffsetForGRO: 0 + UseMFMAF32XEmulation: false + UsePLRPack: 0 + UseSgprForGRO: -1 + UseSubtileImpl: true + Valid: true + VectorStore: -1 + VectorWidthA: 1 + 
VectorWidthB: 1 + VectorWidthMXSA: 1 + VectorWidthMXSB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMXSA: 0 + WaveSeparateGlobalReadMXSB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 16 + WorkGroupMappingXCC: 2 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 256 + _DepthUA: 256 + _DepthUB: 256 + _DepthUMXSA: 8 + _DepthUMXSB: 8 + _DepthUMetadata: 256 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + enableLDSTrMXSA: false + enableLDSTrMXSB: false + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + tailLoopOptMXSA: false + tailLoopOptMXSB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AdaptiveGemm: 0 + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1DivByMT1LowbitGT1: 0 + AssertFree1ElementMultiple: 1 + AssertKRingShiftTailWrapOnly: 0 + AssertSummationElementMultiple: 256 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BAddrInterleave: false + BaseName: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT38m0Bm4hH2y_fb6-eo4XZRKb5dkkBhjv6q0Rd-CAzTq-Q= + BufferLoad: true + BufferStore: 1 + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 256 + DirectToLds: 1 + DirectToLdsA: true + DirectToLdsB: true + DirectToLdsMXSA: true + DirectToLdsMXSB: true + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprMXSA: false + DirectToVgprMXSB: false + DirectToVgprSparseMetadata: false + 
DtlPlusLdsBuf: 0 + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: false + ExpertSchedulingMode: 0 + ExtraLatencyForLR: 0 + ExtraMiLatencyLeft: -1 + ForceDisableShadowInit: false + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 32 + GlobalReadVectorWidthB: 32 + GlobalReadVectorWidthMXSA: 12 + GlobalReadVectorWidthMXSB: 2 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + KRingShift: false + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT384x64x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT12_2_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 256 + LSCB: 256 + LSCMXSA: 8 + LSCMXSB: 8 + LSPA: 32 + LSPB: 32 + LSPMXSA: 384 + LSPMXSB: 64 + LVCA: 8 + LVCB: 8 + LVCMXSA: 1 + LVCMXSB: 4 + LVPA: 1 + LVPB: 1 + LVPMXSA: 32 + LVPMXSB: 32 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsBlockSizePerPadMXSA: 0 + LdsBlockSizePerPadMXSB: 0 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 126464 + LdsInitCVgprs: false + LdsNumBytes: 126464 + 
LdsNumElementsAlignedA: 49152 + LdsNumElementsAlignedB: 8192 + LdsNumElementsAlignedMXSA: 3072 + LdsNumElementsAlignedMXSB: 512 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 52736 + LdsOffsetB_Blk: 118272 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMXSA: 49152 + LdsOffsetMXSA_Blk: 114688 + LdsOffsetMXSB: 52224 + LdsOffsetMXSB_Blk: 117760 + LdsOffsetMetadata: 52736 + LdsOffsetMetadata_Blk: 118272 + LdsPadA: 0 + LdsPadB: 0 + LdsPadMXSA: 0 + LdsPadMXSB: 0 + LdsPadMetadata: 0 + LocalReadVectorWidth: -1 + LocalReadVectorWidthA: 32 + LocalReadVectorWidthB: 32 + LocalReadVectorWidthMXSA: 1 + LocalReadVectorWidthMXSB: 1 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LocalWriteUseSgprMXSA: true + LocalWriteUseSgprMXSB: true + LoopIters: 2 + LoopUnroll: 256 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMXSA: 1 + MIInputPerThreadMXSB: 1 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [12, 2] + MIWaveTileA: 12 + MIWaveTileB: 2 + MIWaveTileMXSA: 12 + MIWaveTileMXSB: 2 + MIWaveTileMetadata: 0 + MacroTile0: 384 + MacroTile1: 64 + MacroTileA: 384 + MacroTileB: 64 + MacroTileMXSA: 384 + MacroTileMXSB: 64 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + MinGRIncPerMfma: 3 + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonDTLTailLoopMXSA: true + NonDTLTailLoopMXSB: true + NonTemporal: -1 + NonTemporalA: 4 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 4 + 
NonTemporalE: 0 + NonTemporalMXSA: 0 + NonTemporalMXSB: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 96 + NumGlobalWriteVectorsPerThread: 96 + NumLdsBlk: 2 + NumLoadsA: 12 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsCoalescedMXSA: 1 + NumLoadsCoalescedMXSB: 1 + NumLoadsMXSA: 1 + NumLoadsMXSB: 1 + NumLoadsPerpendicularA: 12 + NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularMXSA: 1 + NumLoadsPerpendicularMXSB: 1 + NumThreads: 256 + NumTotalPackedLoadsA: -1 + NumTotalPackedLoadsB: -1 + NumTotalPackedLoadsMXSA: -1 + NumTotalPackedLoadsMXSB: -1 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGROverBarrier: 0 + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 19 + SolutionNameMin: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT384x64x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT12_2_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM16_WGMXCC2_WGMXCCGn1 + SourceSwap: false + SpaceFillingAlgo: [] + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 4 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 
32 + SubGroupMXSA: 8 + SubGroupMXSB: 32 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: 0 + TailloopInNll: false + ThreadTile: [1, 1] + ThreadTile0: 48 + ThreadTile1: 2 + ThreadTileA: 48 + ThreadTileB: 2 + ThreadTileMXSA: 48 + ThreadTileMXSB: 2 + TransposeLDS: 1 + TransposeLDSMetadata: -1 + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMXSA: true + UnrollMajorLDSMXSB: true + Use64bShadowLimit: false + Use64bShadowLimitMX: false + UseCustomMainLoopSchedule: 0 + UseDirect32XEmulation: false + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: false + UseGeneralizedNLCOneA: false + UseGeneralizedNLCOneB: false + UseGeneralizedNLCOneMXSA: false + UseGeneralizedNLCOneMXSB: false + UseGeneralizedNLCOneMetadata: false + UseInstOffsetForGRO: 0 + UseMFMAF32XEmulation: false + UsePLRPack: 0 + UseSgprForGRO: -1 + UseSubtileImpl: true + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 1 + VectorWidthMXSA: 1 + VectorWidthMXSB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMXSA: 0 + WaveSeparateGlobalReadMXSB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 16 + WorkGroupMappingXCC: 2 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 256 + _DepthUA: 256 + _DepthUB: 256 + _DepthUMXSA: 8 + _DepthUMXSB: 8 + _DepthUMetadata: 256 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + enableLDSTrMXSA: false + enableLDSTrMXSB: false + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + tailLoopOptMXSA: false + tailLoopOptMXSB: false + - 1LDSBuffer: 0 + 
ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AdaptiveGemm: 0 + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1DivByMT1LowbitGT1: 0 + AssertFree1ElementMultiple: 1 + AssertKRingShiftTailWrapOnly: 0 + AssertSummationElementMultiple: 256 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BAddrInterleave: false + BaseName: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT64ghBzLJyXgx9oTwgzYUUty2EUDAM9Q1qjBKNRjSDH1Qg= + BufferLoad: true + BufferStore: 1 + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 256 + DirectToLds: 1 + DirectToLdsA: true + DirectToLdsB: true + DirectToLdsMXSA: true + DirectToLdsMXSB: true + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprMXSA: false + DirectToVgprMXSB: false + DirectToVgprSparseMetadata: false + DtlPlusLdsBuf: 0 + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: false + ExpertSchedulingMode: 0 + ExtraLatencyForLR: 0 + ExtraMiLatencyLeft: -1 + ForceDisableShadowInit: false + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 32 + GlobalReadVectorWidthB: 32 + GlobalReadVectorWidthMXSA: 2 + GlobalReadVectorWidthMXSB: 16 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + KRingShift: false + Kernel: true + KernelLanguage: Assembly + KernelNameMin: 
Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT64x512x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT2_16_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 256 + LSCB: 256 + LSCMXSA: 8 + LSCMXSB: 8 + LSPA: 32 + LSPB: 32 + LSPMXSA: 64 + LSPMXSB: 512 + LVCA: 8 + LVCB: 8 + LVCMXSA: 4 + LVCMXSB: 1 + LVPA: 1 + LVPB: 1 + LVPMXSA: 32 + LVPMXSB: 32 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsBlockSizePerPadMXSA: 0 + LdsBlockSizePerPadMXSB: 0 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 156672 + LdsInitCVgprs: false + LdsNumBytes: 156672 + LdsNumElementsAlignedA: 8192 + LdsNumElementsAlignedB: 65536 + LdsNumElementsAlignedMXSA: 512 + LdsNumElementsAlignedMXSB: 4096 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 78336 + LdsOffsetB: 12800 + LdsOffsetB_Blk: 91136 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMXSA: 8192 + LdsOffsetMXSA_Blk: 86528 + LdsOffsetMXSB: 8704 + LdsOffsetMXSB_Blk: 87040 + LdsOffsetMetadata: 12800 + LdsOffsetMetadata_Blk: 91136 + LdsPadA: 0 + LdsPadB: 0 + LdsPadMXSA: 0 + LdsPadMXSB: 0 + LdsPadMetadata: 0 + LocalReadVectorWidth: -1 + LocalReadVectorWidthA: 32 + LocalReadVectorWidthB: 32 + LocalReadVectorWidthMXSA: 1 + LocalReadVectorWidthMXSB: 1 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LocalWriteUseSgprMXSA: true + LocalWriteUseSgprMXSB: true + LoopIters: 2 + LoopUnroll: 256 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + 
MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMXSA: 1 + MIInputPerThreadMXSB: 1 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [2, 16] + MIWaveTileA: 2 + MIWaveTileB: 16 + MIWaveTileMXSA: 2 + MIWaveTileMXSB: 16 + MIWaveTileMetadata: 0 + MacroTile0: 64 + MacroTile1: 512 + MacroTileA: 64 + MacroTileB: 512 + MacroTileMXSA: 64 + MacroTileMXSB: 512 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + MinGRIncPerMfma: 3 + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonDTLTailLoopMXSA: true + NonDTLTailLoopMXSB: true + NonTemporal: -1 + NonTemporalA: 4 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 4 + NonTemporalE: 0 + NonTemporalMXSA: 0 + NonTemporalMXSB: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 128 + NumLdsBlk: 2 + NumLoadsA: 2 + NumLoadsB: 16 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsCoalescedMXSA: 1 + NumLoadsCoalescedMXSB: 1 + NumLoadsMXSA: 1 + NumLoadsMXSB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 16 + NumLoadsPerpendicularMXSA: 1 + NumLoadsPerpendicularMXSB: 1 + NumThreads: 256 + NumTotalPackedLoadsA: -1 + NumTotalPackedLoadsB: -1 + NumTotalPackedLoadsMXSA: -1 + NumTotalPackedLoadsMXSB: -1 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGROverBarrier: 0 + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 20 + 
SolutionNameMin: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT64x512x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT2_16_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM16_WGMXCC2_WGMXCCGn1 + SourceSwap: false + SpaceFillingAlgo: [] + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: true + StoreSyncOpt: 0 + StoreVectorWidth: 4 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SubGroupMXSA: 8 + SubGroupMXSB: 32 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: 0 + TailloopInNll: false + ThreadTile: [1, 1] + ThreadTile0: 8 + ThreadTile1: 16 + ThreadTileA: 8 + ThreadTileB: 16 + ThreadTileMXSA: 8 + ThreadTileMXSB: 16 + TransposeLDS: 1 + TransposeLDSMetadata: -1 + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMXSA: true + UnrollMajorLDSMXSB: true + Use64bShadowLimit: false + Use64bShadowLimitMX: false + UseCustomMainLoopSchedule: 0 + UseDirect32XEmulation: false + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: false + UseGeneralizedNLCOneA: false + UseGeneralizedNLCOneB: false + UseGeneralizedNLCOneMXSA: false + UseGeneralizedNLCOneMXSB: false + UseGeneralizedNLCOneMetadata: false + UseInstOffsetForGRO: 0 + UseMFMAF32XEmulation: false + UsePLRPack: 0 + UseSgprForGRO: -1 + UseSubtileImpl: true + Valid: true + VectorStore: -1 + VectorWidthA: 1 + 
VectorWidthB: 1 + VectorWidthMXSA: 1 + VectorWidthMXSB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMXSA: 0 + WaveSeparateGlobalReadMXSB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 16 + WorkGroupMappingXCC: 2 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 256 + _DepthUA: 256 + _DepthUB: 256 + _DepthUMXSA: 8 + _DepthUMXSB: 8 + _DepthUMetadata: 256 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + enableLDSTrMXSA: false + enableLDSTrMXSB: false + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + tailLoopOptMXSA: false + tailLoopOptMXSB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AdaptiveGemm: 0 + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1DivByMT1LowbitGT1: 0 + AssertFree1ElementMultiple: 1 + AssertKRingShiftTailWrapOnly: 0 + AssertSummationElementMultiple: 256 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BAddrInterleave: false + BaseName: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT51N7xP8nenZXqX1PoWvPtuxa4rTpP69DwZBgRtHN8axrQ= + BufferLoad: true + BufferStore: 1 + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 256 + DirectToLds: 1 + DirectToLdsA: true + DirectToLdsB: true + DirectToLdsMXSA: true + DirectToLdsMXSB: true + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprMXSA: false + DirectToVgprMXSB: false + DirectToVgprSparseMetadata: false + 
DtlPlusLdsBuf: 0 + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: false + ExpertSchedulingMode: 0 + ExtraLatencyForLR: 0 + ExtraMiLatencyLeft: -1 + ForceDisableShadowInit: false + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 32 + GlobalReadVectorWidthB: 32 + GlobalReadVectorWidthMXSA: 16 + GlobalReadVectorWidthMXSB: 2 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + KRingShift: false + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT512x64x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT16_2_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 256 + LSCB: 256 + LSCMXSA: 8 + LSCMXSB: 8 + LSPA: 32 + LSPB: 32 + LSPMXSA: 512 + LSPMXSB: 64 + LVCA: 8 + LVCB: 8 + LVCMXSA: 1 + LVCMXSB: 4 + LVPA: 1 + LVPB: 1 + LVPMXSA: 32 + LVPMXSB: 32 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsBlockSizePerPadMXSA: 0 + LdsBlockSizePerPadMXSB: 0 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 156672 + LdsInitCVgprs: false + LdsNumBytes: 156672 + 
LdsNumElementsAlignedA: 65536 + LdsNumElementsAlignedB: 8192 + LdsNumElementsAlignedMXSA: 4096 + LdsNumElementsAlignedMXSB: 512 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 78336 + LdsOffsetB: 70144 + LdsOffsetB_Blk: 148480 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMXSA: 65536 + LdsOffsetMXSA_Blk: 143872 + LdsOffsetMXSB: 69632 + LdsOffsetMXSB_Blk: 147968 + LdsOffsetMetadata: 70144 + LdsOffsetMetadata_Blk: 148480 + LdsPadA: 0 + LdsPadB: 0 + LdsPadMXSA: 0 + LdsPadMXSB: 0 + LdsPadMetadata: 0 + LocalReadVectorWidth: -1 + LocalReadVectorWidthA: 32 + LocalReadVectorWidthB: 32 + LocalReadVectorWidthMXSA: 1 + LocalReadVectorWidthMXSB: 1 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LocalWriteUseSgprMXSA: true + LocalWriteUseSgprMXSB: true + LoopIters: 2 + LoopUnroll: 256 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMXSA: 1 + MIInputPerThreadMXSB: 1 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [16, 2] + MIWaveTileA: 16 + MIWaveTileB: 2 + MIWaveTileMXSA: 16 + MIWaveTileMXSB: 2 + MIWaveTileMetadata: 0 + MacroTile0: 512 + MacroTile1: 64 + MacroTileA: 512 + MacroTileB: 64 + MacroTileMXSA: 512 + MacroTileMXSB: 64 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + MinGRIncPerMfma: 3 + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonDTLTailLoopMXSA: true + NonDTLTailLoopMXSB: true + NonTemporal: -1 + NonTemporalA: 4 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 4 + 
NonTemporalE: 0 + NonTemporalMXSA: 0 + NonTemporalMXSB: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 128 + NumLdsBlk: 2 + NumLoadsA: 16 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsCoalescedMXSA: 1 + NumLoadsCoalescedMXSB: 1 + NumLoadsMXSA: 1 + NumLoadsMXSB: 1 + NumLoadsPerpendicularA: 16 + NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularMXSA: 1 + NumLoadsPerpendicularMXSB: 1 + NumThreads: 256 + NumTotalPackedLoadsA: -1 + NumTotalPackedLoadsB: -1 + NumTotalPackedLoadsMXSA: -1 + NumTotalPackedLoadsMXSB: -1 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGROverBarrier: 0 + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 21 + SolutionNameMin: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT512x64x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT16_2_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM16_WGMXCC2_WGMXCCGn1 + SourceSwap: false + SpaceFillingAlgo: [] + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: true + StoreSyncOpt: 0 + StoreVectorWidth: 4 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 
32 + SubGroupMXSA: 8 + SubGroupMXSB: 32 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: 0 + TailloopInNll: false + ThreadTile: [1, 1] + ThreadTile0: 64 + ThreadTile1: 2 + ThreadTileA: 64 + ThreadTileB: 2 + ThreadTileMXSA: 64 + ThreadTileMXSB: 2 + TransposeLDS: 1 + TransposeLDSMetadata: -1 + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMXSA: true + UnrollMajorLDSMXSB: true + Use64bShadowLimit: false + Use64bShadowLimitMX: false + UseCustomMainLoopSchedule: 0 + UseDirect32XEmulation: false + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: false + UseGeneralizedNLCOneA: false + UseGeneralizedNLCOneB: false + UseGeneralizedNLCOneMXSA: false + UseGeneralizedNLCOneMXSB: false + UseGeneralizedNLCOneMetadata: false + UseInstOffsetForGRO: 0 + UseMFMAF32XEmulation: false + UsePLRPack: 0 + UseSgprForGRO: -1 + UseSubtileImpl: true + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 1 + VectorWidthMXSA: 1 + VectorWidthMXSB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMXSA: 0 + WaveSeparateGlobalReadMXSB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 16 + WorkGroupMappingXCC: 2 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 256 + _DepthUA: 256 + _DepthUB: 256 + _DepthUMXSA: 8 + _DepthUMXSB: 8 + _DepthUMetadata: 256 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + enableLDSTrMXSA: false + enableLDSTrMXSB: false + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + tailLoopOptMXSA: false + tailLoopOptMXSB: false + - 1LDSBuffer: 0 + 
ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AdaptiveGemm: 0 + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1DivByMT1LowbitGT1: 0 + AssertFree1ElementMultiple: 1 + AssertKRingShiftTailWrapOnly: 0 + AssertSummationElementMultiple: 256 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BAddrInterleave: false + BaseName: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT12Tzx41lFmgkHO5_GDlXlBjYPnuHfWnU6iLujoAD27WHo= + BufferLoad: true + BufferStore: 1 + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 256 + DirectToLds: 1 + DirectToLdsA: true + DirectToLdsB: true + DirectToLdsMXSA: true + DirectToLdsMXSB: true + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprMXSA: false + DirectToVgprMXSB: false + DirectToVgprSparseMetadata: false + DtlPlusLdsBuf: 0 + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: false + ExpertSchedulingMode: 0 + ExtraLatencyForLR: 0 + ExtraMiLatencyLeft: -1 + ForceDisableShadowInit: false + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 32 + GlobalReadVectorWidthB: 32 + GlobalReadVectorWidthMXSA: 4 + GlobalReadVectorWidthMXSB: 10 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + KRingShift: false + Kernel: true + KernelLanguage: Assembly + KernelNameMin: 
Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT128x320x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT4_10_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 256 + LSCB: 256 + LSCMXSA: 8 + LSCMXSB: 8 + LSPA: 32 + LSPB: 32 + LSPMXSA: 128 + LSPMXSB: 320 + LVCA: 8 + LVCB: 8 + LVCMXSA: 2 + LVCMXSB: 1 + LVPA: 1 + LVPB: 1 + LVPMXSA: 32 + LVPMXSB: 32 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsBlockSizePerPadMXSA: 0 + LdsBlockSizePerPadMXSB: 0 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 126464 + LdsInitCVgprs: false + LdsNumBytes: 126464 + LdsNumElementsAlignedA: 16384 + LdsNumElementsAlignedB: 40960 + LdsNumElementsAlignedMXSA: 1024 + LdsNumElementsAlignedMXSB: 2560 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 19968 + LdsOffsetB_Blk: 85504 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMXSA: 16384 + LdsOffsetMXSA_Blk: 81920 + LdsOffsetMXSB: 17408 + LdsOffsetMXSB_Blk: 82944 + LdsOffsetMetadata: 19968 + LdsOffsetMetadata_Blk: 85504 + LdsPadA: 0 + LdsPadB: 0 + LdsPadMXSA: 0 + LdsPadMXSB: 0 + LdsPadMetadata: 0 + LocalReadVectorWidth: -1 + LocalReadVectorWidthA: 32 + LocalReadVectorWidthB: 32 + LocalReadVectorWidthMXSA: 1 + LocalReadVectorWidthMXSB: 1 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LocalWriteUseSgprMXSA: true + LocalWriteUseSgprMXSB: true + LoopIters: 2 + LoopUnroll: 256 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + 
MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMXSA: 1 + MIInputPerThreadMXSB: 1 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 10] + MIWaveTileA: 4 + MIWaveTileB: 10 + MIWaveTileMXSA: 4 + MIWaveTileMXSB: 10 + MIWaveTileMetadata: 0 + MacroTile0: 128 + MacroTile1: 320 + MacroTileA: 128 + MacroTileB: 320 + MacroTileMXSA: 128 + MacroTileMXSB: 320 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + MinGRIncPerMfma: 3 + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonDTLTailLoopMXSA: true + NonDTLTailLoopMXSB: true + NonTemporal: -1 + NonTemporalA: 4 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 4 + NonTemporalE: 0 + NonTemporalMXSA: 0 + NonTemporalMXSB: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 160 + NumGlobalWriteVectorsPerThread: 160 + NumLdsBlk: 2 + NumLoadsA: 4 + NumLoadsB: 10 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsCoalescedMXSA: 1 + NumLoadsCoalescedMXSB: 1 + NumLoadsMXSA: 1 + NumLoadsMXSB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 10 + NumLoadsPerpendicularMXSA: 1 + NumLoadsPerpendicularMXSB: 1 + NumThreads: 256 + NumTotalPackedLoadsA: -1 + NumTotalPackedLoadsB: -1 + NumTotalPackedLoadsMXSA: -1 + NumTotalPackedLoadsMXSB: -1 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGROverBarrier: 0 + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 22 + 
SolutionNameMin: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT128x320x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT4_10_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM16_WGMXCC2_WGMXCCGn1 + SourceSwap: false + SpaceFillingAlgo: [] + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 4 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SubGroupMXSA: 8 + SubGroupMXSB: 32 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: 0 + TailloopInNll: false + ThreadTile: [1, 1] + ThreadTile0: 16 + ThreadTile1: 10 + ThreadTileA: 16 + ThreadTileB: 10 + ThreadTileMXSA: 16 + ThreadTileMXSB: 10 + TransposeLDS: 1 + TransposeLDSMetadata: -1 + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMXSA: true + UnrollMajorLDSMXSB: true + Use64bShadowLimit: false + Use64bShadowLimitMX: false + UseCustomMainLoopSchedule: 0 + UseDirect32XEmulation: false + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: false + UseGeneralizedNLCOneA: false + UseGeneralizedNLCOneB: false + UseGeneralizedNLCOneMXSA: false + UseGeneralizedNLCOneMXSB: false + UseGeneralizedNLCOneMetadata: false + UseInstOffsetForGRO: 0 + UseMFMAF32XEmulation: false + UsePLRPack: 0 + UseSgprForGRO: -1 + UseSubtileImpl: true + Valid: true + VectorStore: -1 + VectorWidthA: 
1 + VectorWidthB: 1 + VectorWidthMXSA: 1 + VectorWidthMXSB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMXSA: 0 + WaveSeparateGlobalReadMXSB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 16 + WorkGroupMappingXCC: 2 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 256 + _DepthUA: 256 + _DepthUB: 256 + _DepthUMXSA: 8 + _DepthUMXSB: 8 + _DepthUMetadata: 256 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + enableLDSTrMXSA: false + enableLDSTrMXSB: false + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + tailLoopOptMXSA: false + tailLoopOptMXSB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AdaptiveGemm: 0 + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1DivByMT1LowbitGT1: 0 + AssertFree1ElementMultiple: 1 + AssertKRingShiftTailWrapOnly: 0 + AssertSummationElementMultiple: 256 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BAddrInterleave: false + BaseName: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT329TK1O9xqAG2cUdkJ_6Jkg-kbMKNtFWKGnhC0gVHbf3c= + BufferLoad: true + BufferStore: 1 + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 256 + DirectToLds: 1 + DirectToLdsA: true + DirectToLdsB: true + DirectToLdsMXSA: true + DirectToLdsMXSB: true + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprMXSA: false + DirectToVgprMXSB: false + DirectToVgprSparseMetadata: 
false + DtlPlusLdsBuf: 0 + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: false + ExpertSchedulingMode: 0 + ExtraLatencyForLR: 0 + ExtraMiLatencyLeft: -1 + ForceDisableShadowInit: false + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 32 + GlobalReadVectorWidthB: 32 + GlobalReadVectorWidthMXSA: 10 + GlobalReadVectorWidthMXSB: 4 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + KRingShift: false + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT320x128x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT10_4_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 256 + LSCB: 256 + LSCMXSA: 8 + LSCMXSB: 8 + LSPA: 32 + LSPB: 32 + LSPMXSA: 320 + LSPMXSB: 128 + LVCA: 8 + LVCB: 8 + LVCMXSA: 1 + LVCMXSB: 2 + LVPA: 1 + LVPB: 1 + LVPMXSA: 32 + LVPMXSB: 32 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsBlockSizePerPadMXSA: 0 + LdsBlockSizePerPadMXSB: 0 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 126464 + LdsInitCVgprs: false + LdsNumBytes: 126464 + 
LdsNumElementsAlignedA: 40960 + LdsNumElementsAlignedB: 16384 + LdsNumElementsAlignedMXSA: 2560 + LdsNumElementsAlignedMXSB: 1024 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 44544 + LdsOffsetB_Blk: 110080 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMXSA: 40960 + LdsOffsetMXSA_Blk: 106496 + LdsOffsetMXSB: 43520 + LdsOffsetMXSB_Blk: 109056 + LdsOffsetMetadata: 44544 + LdsOffsetMetadata_Blk: 110080 + LdsPadA: 0 + LdsPadB: 0 + LdsPadMXSA: 0 + LdsPadMXSB: 0 + LdsPadMetadata: 0 + LocalReadVectorWidth: -1 + LocalReadVectorWidthA: 32 + LocalReadVectorWidthB: 32 + LocalReadVectorWidthMXSA: 1 + LocalReadVectorWidthMXSB: 1 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LocalWriteUseSgprMXSA: true + LocalWriteUseSgprMXSB: true + LoopIters: 2 + LoopUnroll: 256 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMXSA: 1 + MIInputPerThreadMXSB: 1 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [10, 4] + MIWaveTileA: 10 + MIWaveTileB: 4 + MIWaveTileMXSA: 10 + MIWaveTileMXSB: 4 + MIWaveTileMetadata: 0 + MacroTile0: 320 + MacroTile1: 128 + MacroTileA: 320 + MacroTileB: 128 + MacroTileMXSA: 320 + MacroTileMXSB: 128 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + MinGRIncPerMfma: 3 + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonDTLTailLoopMXSA: true + NonDTLTailLoopMXSB: true + NonTemporal: -1 + NonTemporalA: 4 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 4 + 
NonTemporalE: 0 + NonTemporalMXSA: 0 + NonTemporalMXSB: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 160 + NumGlobalWriteVectorsPerThread: 160 + NumLdsBlk: 2 + NumLoadsA: 10 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsCoalescedMXSA: 1 + NumLoadsCoalescedMXSB: 1 + NumLoadsMXSA: 1 + NumLoadsMXSB: 1 + NumLoadsPerpendicularA: 10 + NumLoadsPerpendicularB: 4 + NumLoadsPerpendicularMXSA: 1 + NumLoadsPerpendicularMXSB: 1 + NumThreads: 256 + NumTotalPackedLoadsA: -1 + NumTotalPackedLoadsB: -1 + NumTotalPackedLoadsMXSA: -1 + NumTotalPackedLoadsMXSB: -1 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGROverBarrier: 0 + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 23 + SolutionNameMin: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT320x128x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT10_4_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM16_WGMXCC2_WGMXCCGn1 + SourceSwap: false + SpaceFillingAlgo: [] + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 4 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + 
SubGroupB: 32 + SubGroupMXSA: 8 + SubGroupMXSB: 32 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: 0 + TailloopInNll: false + ThreadTile: [1, 1] + ThreadTile0: 40 + ThreadTile1: 4 + ThreadTileA: 40 + ThreadTileB: 4 + ThreadTileMXSA: 40 + ThreadTileMXSB: 4 + TransposeLDS: 1 + TransposeLDSMetadata: -1 + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMXSA: true + UnrollMajorLDSMXSB: true + Use64bShadowLimit: false + Use64bShadowLimitMX: false + UseCustomMainLoopSchedule: 0 + UseDirect32XEmulation: false + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: false + UseGeneralizedNLCOneA: false + UseGeneralizedNLCOneB: false + UseGeneralizedNLCOneMXSA: false + UseGeneralizedNLCOneMXSB: false + UseGeneralizedNLCOneMetadata: false + UseInstOffsetForGRO: 0 + UseMFMAF32XEmulation: false + UsePLRPack: 0 + UseSgprForGRO: -1 + UseSubtileImpl: true + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 1 + VectorWidthMXSA: 1 + VectorWidthMXSB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMXSA: 0 + WaveSeparateGlobalReadMXSB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 16 + WorkGroupMappingXCC: 2 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 256 + _DepthUA: 256 + _DepthUB: 256 + _DepthUMXSA: 8 + _DepthUMXSB: 8 + _DepthUMetadata: 256 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + enableLDSTrMXSA: false + enableLDSTrMXSB: false + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + tailLoopOptMXSA: false + tailLoopOptMXSB: false + - 1LDSBuffer: 
0 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AdaptiveGemm: 0 + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1DivByMT1LowbitGT1: 0 + AssertFree1ElementMultiple: 1 + AssertKRingShiftTailWrapOnly: 0 + AssertSummationElementMultiple: 256 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BAddrInterleave: false + BaseName: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT12Ko14x9Bm17skxIK2J-9eSMhxZf1WQTws5W6lZ4ij_RU= + BufferLoad: true + BufferStore: 1 + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 256 + DirectToLds: 1 + DirectToLdsA: true + DirectToLdsB: true + DirectToLdsMXSA: true + DirectToLdsMXSB: true + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprMXSA: false + DirectToVgprMXSB: false + DirectToVgprSparseMetadata: false + DtlPlusLdsBuf: 0 + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: false + ExpertSchedulingMode: 0 + ExtraLatencyForLR: 0 + ExtraMiLatencyLeft: -1 + ForceDisableShadowInit: false + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 32 + GlobalReadVectorWidthB: 32 + GlobalReadVectorWidthMXSA: 4 + GlobalReadVectorWidthMXSB: 12 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + KRingShift: false + Kernel: true + KernelLanguage: Assembly + KernelNameMin: 
Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT128x384x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT4_12_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 256 + LSCB: 256 + LSCMXSA: 8 + LSCMXSB: 8 + LSPA: 32 + LSPB: 32 + LSPMXSA: 128 + LSPMXSB: 384 + LVCA: 8 + LVCB: 8 + LVCMXSA: 2 + LVCMXSB: 1 + LVPA: 1 + LVPB: 1 + LVPMXSA: 32 + LVPMXSB: 32 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsBlockSizePerPadMXSA: 0 + LdsBlockSizePerPadMXSB: 0 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 139264 + LdsInitCVgprs: false + LdsNumBytes: 139264 + LdsNumElementsAlignedA: 16384 + LdsNumElementsAlignedB: 49152 + LdsNumElementsAlignedMXSA: 1024 + LdsNumElementsAlignedMXSB: 3072 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 69632 + LdsOffsetB: 20480 + LdsOffsetB_Blk: 90112 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMXSA: 16384 + LdsOffsetMXSA_Blk: 86016 + LdsOffsetMXSB: 17408 + LdsOffsetMXSB_Blk: 87040 + LdsOffsetMetadata: 20480 + LdsOffsetMetadata_Blk: 90112 + LdsPadA: 0 + LdsPadB: 0 + LdsPadMXSA: 0 + LdsPadMXSB: 0 + LdsPadMetadata: 0 + LocalReadVectorWidth: -1 + LocalReadVectorWidthA: 32 + LocalReadVectorWidthB: 32 + LocalReadVectorWidthMXSA: 1 + LocalReadVectorWidthMXSB: 1 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LocalWriteUseSgprMXSA: true + LocalWriteUseSgprMXSB: true + LoopIters: 2 + LoopUnroll: 256 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + 
MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMXSA: 1 + MIInputPerThreadMXSB: 1 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 12] + MIWaveTileA: 4 + MIWaveTileB: 12 + MIWaveTileMXSA: 4 + MIWaveTileMXSB: 12 + MIWaveTileMetadata: 0 + MacroTile0: 128 + MacroTile1: 384 + MacroTileA: 128 + MacroTileB: 384 + MacroTileMXSA: 128 + MacroTileMXSB: 384 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + MinGRIncPerMfma: 3 + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonDTLTailLoopMXSA: true + NonDTLTailLoopMXSB: true + NonTemporal: -1 + NonTemporalA: 4 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 4 + NonTemporalE: 0 + NonTemporalMXSA: 0 + NonTemporalMXSB: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 192 + NumGlobalWriteVectorsPerThread: 192 + NumLdsBlk: 2 + NumLoadsA: 4 + NumLoadsB: 12 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsCoalescedMXSA: 1 + NumLoadsCoalescedMXSB: 1 + NumLoadsMXSA: 1 + NumLoadsMXSB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 12 + NumLoadsPerpendicularMXSA: 1 + NumLoadsPerpendicularMXSB: 1 + NumThreads: 256 + NumTotalPackedLoadsA: -1 + NumTotalPackedLoadsB: -1 + NumTotalPackedLoadsMXSA: -1 + NumTotalPackedLoadsMXSB: -1 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGROverBarrier: 0 + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 24 + 
SolutionNameMin: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT128x384x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT4_12_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM16_WGMXCC2_WGMXCCGn1 + SourceSwap: false + SpaceFillingAlgo: [] + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: true + StoreSyncOpt: 0 + StoreVectorWidth: 4 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SubGroupMXSA: 8 + SubGroupMXSB: 32 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: 0 + TailloopInNll: false + ThreadTile: [1, 1] + ThreadTile0: 16 + ThreadTile1: 12 + ThreadTileA: 16 + ThreadTileB: 12 + ThreadTileMXSA: 16 + ThreadTileMXSB: 12 + TransposeLDS: 1 + TransposeLDSMetadata: -1 + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMXSA: true + UnrollMajorLDSMXSB: true + Use64bShadowLimit: false + Use64bShadowLimitMX: false + UseCustomMainLoopSchedule: 0 + UseDirect32XEmulation: false + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: false + UseGeneralizedNLCOneA: false + UseGeneralizedNLCOneB: false + UseGeneralizedNLCOneMXSA: false + UseGeneralizedNLCOneMXSB: false + UseGeneralizedNLCOneMetadata: false + UseInstOffsetForGRO: 0 + UseMFMAF32XEmulation: false + UsePLRPack: 0 + UseSgprForGRO: -1 + UseSubtileImpl: true + Valid: true + VectorStore: -1 + VectorWidthA: 1 
+ VectorWidthB: 1 + VectorWidthMXSA: 1 + VectorWidthMXSB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMXSA: 0 + WaveSeparateGlobalReadMXSB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 16 + WorkGroupMappingXCC: 2 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 256 + _DepthUA: 256 + _DepthUB: 256 + _DepthUMXSA: 8 + _DepthUMXSB: 8 + _DepthUMetadata: 256 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + enableLDSTrMXSA: false + enableLDSTrMXSB: false + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + tailLoopOptMXSA: false + tailLoopOptMXSB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AdaptiveGemm: 0 + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1DivByMT1LowbitGT1: 0 + AssertFree1ElementMultiple: 1 + AssertKRingShiftTailWrapOnly: 0 + AssertSummationElementMultiple: 256 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BAddrInterleave: false + BaseName: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT38PfCBP4cNoa67MDHKbRKQKP0baWWAor2QL1_1-ioSnxM= + BufferLoad: true + BufferStore: 1 + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 256 + DirectToLds: 1 + DirectToLdsA: true + DirectToLdsB: true + DirectToLdsMXSA: true + DirectToLdsMXSB: true + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprMXSA: false + DirectToVgprMXSB: false + DirectToVgprSparseMetadata: false 
+ DtlPlusLdsBuf: 0 + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: false + ExpertSchedulingMode: 0 + ExtraLatencyForLR: 0 + ExtraMiLatencyLeft: -1 + ForceDisableShadowInit: false + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 32 + GlobalReadVectorWidthB: 32 + GlobalReadVectorWidthMXSA: 12 + GlobalReadVectorWidthMXSB: 4 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + KRingShift: false + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT384x128x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT12_4_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 256 + LSCB: 256 + LSCMXSA: 8 + LSCMXSB: 8 + LSPA: 32 + LSPB: 32 + LSPMXSA: 384 + LSPMXSB: 128 + LVCA: 8 + LVCB: 8 + LVCMXSA: 1 + LVCMXSB: 2 + LVPA: 1 + LVPB: 1 + LVPMXSA: 32 + LVPMXSB: 32 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsBlockSizePerPadMXSA: 0 + LdsBlockSizePerPadMXSB: 0 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 139264 + LdsInitCVgprs: false + LdsNumBytes: 139264 + 
LdsNumElementsAlignedA: 49152 + LdsNumElementsAlignedB: 16384 + LdsNumElementsAlignedMXSA: 3072 + LdsNumElementsAlignedMXSB: 1024 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 69632 + LdsOffsetB: 53248 + LdsOffsetB_Blk: 122880 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMXSA: 49152 + LdsOffsetMXSA_Blk: 118784 + LdsOffsetMXSB: 52224 + LdsOffsetMXSB_Blk: 121856 + LdsOffsetMetadata: 53248 + LdsOffsetMetadata_Blk: 122880 + LdsPadA: 0 + LdsPadB: 0 + LdsPadMXSA: 0 + LdsPadMXSB: 0 + LdsPadMetadata: 0 + LocalReadVectorWidth: -1 + LocalReadVectorWidthA: 32 + LocalReadVectorWidthB: 32 + LocalReadVectorWidthMXSA: 1 + LocalReadVectorWidthMXSB: 1 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LocalWriteUseSgprMXSA: true + LocalWriteUseSgprMXSB: true + LoopIters: 2 + LoopUnroll: 256 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMXSA: 1 + MIInputPerThreadMXSB: 1 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [12, 4] + MIWaveTileA: 12 + MIWaveTileB: 4 + MIWaveTileMXSA: 12 + MIWaveTileMXSB: 4 + MIWaveTileMetadata: 0 + MacroTile0: 384 + MacroTile1: 128 + MacroTileA: 384 + MacroTileB: 128 + MacroTileMXSA: 384 + MacroTileMXSB: 128 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + MinGRIncPerMfma: 3 + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonDTLTailLoopMXSA: true + NonDTLTailLoopMXSB: true + NonTemporal: -1 + NonTemporalA: 4 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 4 + 
NonTemporalE: 0 + NonTemporalMXSA: 0 + NonTemporalMXSB: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 192 + NumGlobalWriteVectorsPerThread: 192 + NumLdsBlk: 2 + NumLoadsA: 12 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsCoalescedMXSA: 1 + NumLoadsCoalescedMXSB: 1 + NumLoadsMXSA: 1 + NumLoadsMXSB: 1 + NumLoadsPerpendicularA: 12 + NumLoadsPerpendicularB: 4 + NumLoadsPerpendicularMXSA: 1 + NumLoadsPerpendicularMXSB: 1 + NumThreads: 256 + NumTotalPackedLoadsA: -1 + NumTotalPackedLoadsB: -1 + NumTotalPackedLoadsMXSA: -1 + NumTotalPackedLoadsMXSB: -1 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGROverBarrier: 0 + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 25 + SolutionNameMin: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT384x128x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT12_4_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM16_WGMXCC2_WGMXCCGn1 + SourceSwap: false + SpaceFillingAlgo: [] + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: true + StoreSyncOpt: 0 + StoreVectorWidth: 4 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + 
SubGroupB: 32 + SubGroupMXSA: 8 + SubGroupMXSB: 32 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: 0 + TailloopInNll: false + ThreadTile: [1, 1] + ThreadTile0: 48 + ThreadTile1: 4 + ThreadTileA: 48 + ThreadTileB: 4 + ThreadTileMXSA: 48 + ThreadTileMXSB: 4 + TransposeLDS: 1 + TransposeLDSMetadata: -1 + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMXSA: true + UnrollMajorLDSMXSB: true + Use64bShadowLimit: false + Use64bShadowLimitMX: false + UseCustomMainLoopSchedule: 0 + UseDirect32XEmulation: false + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: false + UseGeneralizedNLCOneA: false + UseGeneralizedNLCOneB: false + UseGeneralizedNLCOneMXSA: false + UseGeneralizedNLCOneMXSB: false + UseGeneralizedNLCOneMetadata: false + UseInstOffsetForGRO: 0 + UseMFMAF32XEmulation: false + UsePLRPack: 0 + UseSgprForGRO: -1 + UseSubtileImpl: true + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 1 + VectorWidthMXSA: 1 + VectorWidthMXSB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMXSA: 0 + WaveSeparateGlobalReadMXSB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 16 + WorkGroupMappingXCC: 2 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 256 + _DepthUA: 256 + _DepthUB: 256 + _DepthUMXSA: 8 + _DepthUMXSB: 8 + _DepthUMetadata: 256 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + enableLDSTrMXSA: false + enableLDSTrMXSB: false + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + tailLoopOptMXSA: false + tailLoopOptMXSB: false + - 1LDSBuffer: 
0 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AdaptiveGemm: 0 + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1DivByMT1LowbitGT1: 0 + AssertFree1ElementMultiple: 1 + AssertKRingShiftTailWrapOnly: 0 + AssertSummationElementMultiple: 256 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BAddrInterleave: false + BaseName: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT32GHMtx7cclUEZq8HXp3EV83TsDigTCOy4HpQzcwTdNNQ= + BufferLoad: true + BufferStore: 1 + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 256 + DirectToLds: 1 + DirectToLdsA: true + DirectToLdsB: true + DirectToLdsMXSA: true + DirectToLdsMXSB: true + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprMXSA: false + DirectToVgprMXSB: false + DirectToVgprSparseMetadata: false + DtlPlusLdsBuf: 0 + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: false + ExpertSchedulingMode: 0 + ExtraLatencyForLR: 0 + ExtraMiLatencyLeft: -1 + ForceDisableShadowInit: false + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 32 + GlobalReadVectorWidthB: 32 + GlobalReadVectorWidthMXSA: 10 + GlobalReadVectorWidthMXSB: 4 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + KRingShift: false + Kernel: true + KernelLanguage: Assembly + KernelNameMin: 
Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT320x256x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT10_8_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 256 + LSCB: 256 + LSCMXSA: 8 + LSCMXSB: 8 + LSPA: 32 + LSPB: 32 + LSPMXSA: 320 + LSPMXSB: 128 + LVCA: 8 + LVCB: 8 + LVCMXSA: 1 + LVCMXSB: 2 + LVPA: 1 + LVPB: 1 + LVPMXSA: 32 + LVPMXSB: 32 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsBlockSizePerPadMXSA: 0 + LdsBlockSizePerPadMXSB: 0 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 156672 + LdsInitCVgprs: false + LdsNumBytes: 156672 + LdsNumElementsAlignedA: 40960 + LdsNumElementsAlignedB: 32768 + LdsNumElementsAlignedMXSA: 2560 + LdsNumElementsAlignedMXSB: 2048 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 78336 + LdsOffsetB: 45568 + LdsOffsetB_Blk: 123904 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMXSA: 40960 + LdsOffsetMXSA_Blk: 119296 + LdsOffsetMXSB: 43520 + LdsOffsetMXSB_Blk: 121856 + LdsOffsetMetadata: 45568 + LdsOffsetMetadata_Blk: 123904 + LdsPadA: 0 + LdsPadB: 0 + LdsPadMXSA: 0 + LdsPadMXSB: 0 + LdsPadMetadata: 0 + LocalReadVectorWidth: -1 + LocalReadVectorWidthA: 32 + LocalReadVectorWidthB: 32 + LocalReadVectorWidthMXSA: 1 + LocalReadVectorWidthMXSB: 1 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LocalWriteUseSgprMXSA: true + LocalWriteUseSgprMXSB: true + LoopIters: 2 + LoopUnroll: 256 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + 
MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMXSA: 1 + MIInputPerThreadMXSB: 1 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [10, 8] + MIWaveTileA: 10 + MIWaveTileB: 8 + MIWaveTileMXSA: 10 + MIWaveTileMXSB: 8 + MIWaveTileMetadata: 0 + MacroTile0: 320 + MacroTile1: 256 + MacroTileA: 320 + MacroTileB: 256 + MacroTileMXSA: 320 + MacroTileMXSB: 256 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + MinGRIncPerMfma: 3 + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonDTLTailLoopMXSA: true + NonDTLTailLoopMXSB: true + NonTemporal: -1 + NonTemporalA: 4 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 4 + NonTemporalE: 0 + NonTemporalMXSA: 0 + NonTemporalMXSB: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 320 + NumGlobalWriteVectorsPerThread: 320 + NumLdsBlk: 2 + NumLoadsA: 10 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsCoalescedMXSA: 1 + NumLoadsCoalescedMXSB: 1 + NumLoadsMXSA: 1 + NumLoadsMXSB: 2 + NumLoadsPerpendicularA: 10 + NumLoadsPerpendicularB: 8 + NumLoadsPerpendicularMXSA: 1 + NumLoadsPerpendicularMXSB: 2 + NumThreads: 256 + NumTotalPackedLoadsA: -1 + NumTotalPackedLoadsB: -1 + NumTotalPackedLoadsMXSA: -1 + NumTotalPackedLoadsMXSB: -1 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGROverBarrier: 0 + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 26 + 
SolutionNameMin: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT320x256x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT10_8_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM16_WGMXCC2_WGMXCCGn1 + SourceSwap: false + SpaceFillingAlgo: [] + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: true + StoreSyncOpt: 0 + StoreVectorWidth: 4 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SubGroupMXSA: 8 + SubGroupMXSB: 32 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: 0 + TailloopInNll: false + ThreadTile: [1, 1] + ThreadTile0: 40 + ThreadTile1: 8 + ThreadTileA: 40 + ThreadTileB: 8 + ThreadTileMXSA: 40 + ThreadTileMXSB: 8 + TransposeLDS: 1 + TransposeLDSMetadata: -1 + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMXSA: true + UnrollMajorLDSMXSB: true + Use64bShadowLimit: false + Use64bShadowLimitMX: false + UseCustomMainLoopSchedule: 0 + UseDirect32XEmulation: false + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: false + UseGeneralizedNLCOneA: false + UseGeneralizedNLCOneB: false + UseGeneralizedNLCOneMXSA: false + UseGeneralizedNLCOneMXSB: false + UseGeneralizedNLCOneMetadata: false + UseInstOffsetForGRO: 0 + UseMFMAF32XEmulation: false + UsePLRPack: 0 + UseSgprForGRO: -1 + UseSubtileImpl: true + Valid: true + VectorStore: -1 + VectorWidthA: 1 + 
VectorWidthB: 1 + VectorWidthMXSA: 1 + VectorWidthMXSB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMXSA: 0 + WaveSeparateGlobalReadMXSB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 16 + WorkGroupMappingXCC: 2 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 256 + _DepthUA: 256 + _DepthUB: 256 + _DepthUMXSA: 8 + _DepthUMXSB: 8 + _DepthUMetadata: 256 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + enableLDSTrMXSA: false + enableLDSTrMXSB: false + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + tailLoopOptMXSA: false + tailLoopOptMXSB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AdaptiveGemm: 0 + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1DivByMT1LowbitGT1: 0 + AssertFree1ElementMultiple: 1 + AssertKRingShiftTailWrapOnly: 0 + AssertSummationElementMultiple: 256 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BAddrInterleave: false + BaseName: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT25yENsNJL4B9Vb5GlLD7ofBX9eYUJtt3JL_VzKdaPOs5g= + BufferLoad: true + BufferStore: 1 + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 256 + DirectToLds: 1 + DirectToLdsA: true + DirectToLdsB: true + DirectToLdsMXSA: true + DirectToLdsMXSB: true + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprMXSA: false + DirectToVgprMXSB: false + DirectToVgprSparseMetadata: false + 
DtlPlusLdsBuf: 0 + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: false + ExpertSchedulingMode: 0 + ExtraLatencyForLR: 0 + ExtraMiLatencyLeft: -1 + ForceDisableShadowInit: false + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 32 + GlobalReadVectorWidthB: 32 + GlobalReadVectorWidthMXSA: 4 + GlobalReadVectorWidthMXSB: 10 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + KRingShift: false + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT256x320x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT8_10_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 256 + LSCB: 256 + LSCMXSA: 8 + LSCMXSB: 8 + LSPA: 32 + LSPB: 32 + LSPMXSA: 128 + LSPMXSB: 320 + LVCA: 8 + LVCB: 8 + LVCMXSA: 2 + LVCMXSB: 1 + LVPA: 1 + LVPB: 1 + LVPMXSA: 32 + LVPMXSB: 32 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsBlockSizePerPadMXSA: 0 + LdsBlockSizePerPadMXSB: 0 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 156672 + LdsInitCVgprs: false + LdsNumBytes: 156672 + 
LdsNumElementsAlignedA: 32768 + LdsNumElementsAlignedB: 40960 + LdsNumElementsAlignedMXSA: 2048 + LdsNumElementsAlignedMXSB: 2560 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 78336 + LdsOffsetB: 37376 + LdsOffsetB_Blk: 115712 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMXSA: 32768 + LdsOffsetMXSA_Blk: 111104 + LdsOffsetMXSB: 34816 + LdsOffsetMXSB_Blk: 113152 + LdsOffsetMetadata: 37376 + LdsOffsetMetadata_Blk: 115712 + LdsPadA: 0 + LdsPadB: 0 + LdsPadMXSA: 0 + LdsPadMXSB: 0 + LdsPadMetadata: 0 + LocalReadVectorWidth: -1 + LocalReadVectorWidthA: 32 + LocalReadVectorWidthB: 32 + LocalReadVectorWidthMXSA: 1 + LocalReadVectorWidthMXSB: 1 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LocalWriteUseSgprMXSA: true + LocalWriteUseSgprMXSB: true + LoopIters: 2 + LoopUnroll: 256 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMXSA: 1 + MIInputPerThreadMXSB: 1 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [8, 10] + MIWaveTileA: 8 + MIWaveTileB: 10 + MIWaveTileMXSA: 8 + MIWaveTileMXSB: 10 + MIWaveTileMetadata: 0 + MacroTile0: 256 + MacroTile1: 320 + MacroTileA: 256 + MacroTileB: 320 + MacroTileMXSA: 256 + MacroTileMXSB: 320 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + MinGRIncPerMfma: 3 + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonDTLTailLoopMXSA: true + NonDTLTailLoopMXSB: true + NonTemporal: -1 + NonTemporalA: 4 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 4 + 
NonTemporalE: 0 + NonTemporalMXSA: 0 + NonTemporalMXSB: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 320 + NumGlobalWriteVectorsPerThread: 320 + NumLdsBlk: 2 + NumLoadsA: 8 + NumLoadsB: 10 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsCoalescedMXSA: 1 + NumLoadsCoalescedMXSB: 1 + NumLoadsMXSA: 2 + NumLoadsMXSB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 10 + NumLoadsPerpendicularMXSA: 2 + NumLoadsPerpendicularMXSB: 1 + NumThreads: 256 + NumTotalPackedLoadsA: -1 + NumTotalPackedLoadsB: -1 + NumTotalPackedLoadsMXSA: -1 + NumTotalPackedLoadsMXSB: -1 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGROverBarrier: 0 + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 27 + SolutionNameMin: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT256x320x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT8_10_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM16_WGMXCC2_WGMXCCGn1 + SourceSwap: false + SpaceFillingAlgo: [] + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: true + StoreSyncOpt: 0 + StoreVectorWidth: 4 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + 
SubGroupB: 32 + SubGroupMXSA: 8 + SubGroupMXSB: 32 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: 0 + TailloopInNll: false + ThreadTile: [1, 1] + ThreadTile0: 32 + ThreadTile1: 10 + ThreadTileA: 32 + ThreadTileB: 10 + ThreadTileMXSA: 32 + ThreadTileMXSB: 10 + TransposeLDS: 1 + TransposeLDSMetadata: -1 + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMXSA: true + UnrollMajorLDSMXSB: true + Use64bShadowLimit: false + Use64bShadowLimitMX: false + UseCustomMainLoopSchedule: 0 + UseDirect32XEmulation: false + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: false + UseGeneralizedNLCOneA: false + UseGeneralizedNLCOneB: false + UseGeneralizedNLCOneMXSA: false + UseGeneralizedNLCOneMXSB: false + UseGeneralizedNLCOneMetadata: false + UseInstOffsetForGRO: 0 + UseMFMAF32XEmulation: false + UsePLRPack: 0 + UseSgprForGRO: -1 + UseSubtileImpl: true + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 1 + VectorWidthMXSA: 1 + VectorWidthMXSB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMXSA: 0 + WaveSeparateGlobalReadMXSB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 16 + WorkGroupMappingXCC: 2 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 256 + _DepthUA: 256 + _DepthUB: 256 + _DepthUMXSA: 8 + _DepthUMXSB: 8 + _DepthUMetadata: 256 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + enableLDSTrMXSA: false + enableLDSTrMXSB: false + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + tailLoopOptMXSA: false + tailLoopOptMXSB: false + - 
1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AdaptiveGemm: 0 + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1DivByMT1LowbitGT1: 0 + AssertFree1ElementMultiple: 1 + AssertKRingShiftTailWrapOnly: 0 + AssertSummationElementMultiple: 256 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BAddrInterleave: false + BaseName: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT64IqbWxZkPcZWy3vA8g9HsDHfQ7n72vJptvmQ79eUQ7kk= + BufferLoad: true + BufferStore: 1 + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 256 + DirectToLds: 1 + DirectToLdsA: true + DirectToLdsB: true + DirectToLdsMXSA: true + DirectToLdsMXSB: true + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprMXSA: false + DirectToVgprMXSB: false + DirectToVgprSparseMetadata: false + DtlPlusLdsBuf: 0 + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: false + ExpertSchedulingMode: 0 + ExtraLatencyForLR: 0 + ExtraMiLatencyLeft: -1 + ForceDisableShadowInit: false + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 32 + GlobalReadVectorWidthB: 32 + GlobalReadVectorWidthMXSA: 2 + GlobalReadVectorWidthMXSB: 14 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + KRingShift: false + Kernel: true + KernelLanguage: Assembly + KernelNameMin: 
Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT64x448x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT2_14_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 256 + LSCB: 256 + LSCMXSA: 8 + LSCMXSB: 8 + LSPA: 32 + LSPB: 32 + LSPMXSA: 64 + LSPMXSB: 448 + LVCA: 8 + LVCB: 8 + LVCMXSA: 4 + LVCMXSB: 1 + LVPA: 1 + LVPB: 1 + LVPMXSA: 32 + LVPMXSB: 32 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsBlockSizePerPadMXSA: 0 + LdsBlockSizePerPadMXSB: 0 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 139264 + LdsInitCVgprs: false + LdsNumBytes: 139264 + LdsNumElementsAlignedA: 8192 + LdsNumElementsAlignedB: 57344 + LdsNumElementsAlignedMXSA: 512 + LdsNumElementsAlignedMXSB: 3584 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 69632 + LdsOffsetB: 12288 + LdsOffsetB_Blk: 81920 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMXSA: 8192 + LdsOffsetMXSA_Blk: 77824 + LdsOffsetMXSB: 8704 + LdsOffsetMXSB_Blk: 78336 + LdsOffsetMetadata: 12288 + LdsOffsetMetadata_Blk: 81920 + LdsPadA: 0 + LdsPadB: 0 + LdsPadMXSA: 0 + LdsPadMXSB: 0 + LdsPadMetadata: 0 + LocalReadVectorWidth: -1 + LocalReadVectorWidthA: 32 + LocalReadVectorWidthB: 32 + LocalReadVectorWidthMXSA: 1 + LocalReadVectorWidthMXSB: 1 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LocalWriteUseSgprMXSA: true + LocalWriteUseSgprMXSB: true + LoopIters: 2 + LoopUnroll: 256 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + 
MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMXSA: 1 + MIInputPerThreadMXSB: 1 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [2, 14] + MIWaveTileA: 2 + MIWaveTileB: 14 + MIWaveTileMXSA: 2 + MIWaveTileMXSB: 14 + MIWaveTileMetadata: 0 + MacroTile0: 64 + MacroTile1: 448 + MacroTileA: 64 + MacroTileB: 448 + MacroTileMXSA: 64 + MacroTileMXSB: 448 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + MinGRIncPerMfma: 3 + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonDTLTailLoopMXSA: true + NonDTLTailLoopMXSB: true + NonTemporal: -1 + NonTemporalA: 4 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 4 + NonTemporalE: 0 + NonTemporalMXSA: 0 + NonTemporalMXSB: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 112 + NumGlobalWriteVectorsPerThread: 112 + NumLdsBlk: 2 + NumLoadsA: 2 + NumLoadsB: 14 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsCoalescedMXSA: 1 + NumLoadsCoalescedMXSB: 1 + NumLoadsMXSA: 1 + NumLoadsMXSB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 14 + NumLoadsPerpendicularMXSA: 1 + NumLoadsPerpendicularMXSB: 1 + NumThreads: 256 + NumTotalPackedLoadsA: -1 + NumTotalPackedLoadsB: -1 + NumTotalPackedLoadsMXSA: -1 + NumTotalPackedLoadsMXSB: -1 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGROverBarrier: 0 + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 28 + 
SolutionNameMin: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT64x448x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT2_14_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM16_WGMXCC2_WGMXCCGn1 + SourceSwap: false + SpaceFillingAlgo: [] + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: true + StoreSyncOpt: 0 + StoreVectorWidth: 4 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SubGroupMXSA: 8 + SubGroupMXSB: 32 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: 0 + TailloopInNll: false + ThreadTile: [1, 1] + ThreadTile0: 8 + ThreadTile1: 14 + ThreadTileA: 8 + ThreadTileB: 14 + ThreadTileMXSA: 8 + ThreadTileMXSB: 14 + TransposeLDS: 1 + TransposeLDSMetadata: -1 + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMXSA: true + UnrollMajorLDSMXSB: true + Use64bShadowLimit: false + Use64bShadowLimitMX: false + UseCustomMainLoopSchedule: 0 + UseDirect32XEmulation: false + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: false + UseGeneralizedNLCOneA: false + UseGeneralizedNLCOneB: false + UseGeneralizedNLCOneMXSA: false + UseGeneralizedNLCOneMXSB: false + UseGeneralizedNLCOneMetadata: false + UseInstOffsetForGRO: 0 + UseMFMAF32XEmulation: false + UsePLRPack: 0 + UseSgprForGRO: -1 + UseSubtileImpl: true + Valid: true + VectorStore: -1 + VectorWidthA: 1 + 
VectorWidthB: 1 + VectorWidthMXSA: 1 + VectorWidthMXSB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMXSA: 0 + WaveSeparateGlobalReadMXSB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 16 + WorkGroupMappingXCC: 2 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 256 + _DepthUA: 256 + _DepthUB: 256 + _DepthUMXSA: 8 + _DepthUMXSB: 8 + _DepthUMetadata: 256 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + enableLDSTrMXSA: false + enableLDSTrMXSB: false + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + tailLoopOptMXSA: false + tailLoopOptMXSB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AdaptiveGemm: 0 + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1DivByMT1LowbitGT1: 0 + AssertFree1ElementMultiple: 1 + AssertKRingShiftTailWrapOnly: 0 + AssertSummationElementMultiple: 256 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BAddrInterleave: false + BaseName: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT44JIOkXLbpjhS22SFIEupmgY9bhtzcXu69kfhwBJBO5wo= + BufferLoad: true + BufferStore: 1 + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 256 + DirectToLds: 1 + DirectToLdsA: true + DirectToLdsB: true + DirectToLdsMXSA: true + DirectToLdsMXSB: true + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprMXSA: false + DirectToVgprMXSB: false + DirectToVgprSparseMetadata: false + 
DtlPlusLdsBuf: 0 + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: false + ExpertSchedulingMode: 0 + ExtraLatencyForLR: 0 + ExtraMiLatencyLeft: -1 + ForceDisableShadowInit: false + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 32 + GlobalReadVectorWidthB: 32 + GlobalReadVectorWidthMXSA: 14 + GlobalReadVectorWidthMXSB: 2 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + KRingShift: false + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT448x64x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT14_2_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 256 + LSCB: 256 + LSCMXSA: 8 + LSCMXSB: 8 + LSPA: 32 + LSPB: 32 + LSPMXSA: 448 + LSPMXSB: 64 + LVCA: 8 + LVCB: 8 + LVCMXSA: 1 + LVCMXSB: 4 + LVPA: 1 + LVPB: 1 + LVPMXSA: 32 + LVPMXSB: 32 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsBlockSizePerPadMXSA: 0 + LdsBlockSizePerPadMXSB: 0 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 139264 + LdsInitCVgprs: false + LdsNumBytes: 139264 + 
LdsNumElementsAlignedA: 57344 + LdsNumElementsAlignedB: 8192 + LdsNumElementsAlignedMXSA: 3584 + LdsNumElementsAlignedMXSB: 512 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 69632 + LdsOffsetB: 61440 + LdsOffsetB_Blk: 131072 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMXSA: 57344 + LdsOffsetMXSA_Blk: 126976 + LdsOffsetMXSB: 60928 + LdsOffsetMXSB_Blk: 130560 + LdsOffsetMetadata: 61440 + LdsOffsetMetadata_Blk: 131072 + LdsPadA: 0 + LdsPadB: 0 + LdsPadMXSA: 0 + LdsPadMXSB: 0 + LdsPadMetadata: 0 + LocalReadVectorWidth: -1 + LocalReadVectorWidthA: 32 + LocalReadVectorWidthB: 32 + LocalReadVectorWidthMXSA: 1 + LocalReadVectorWidthMXSB: 1 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LocalWriteUseSgprMXSA: true + LocalWriteUseSgprMXSB: true + LoopIters: 2 + LoopUnroll: 256 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMXSA: 1 + MIInputPerThreadMXSB: 1 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [14, 2] + MIWaveTileA: 14 + MIWaveTileB: 2 + MIWaveTileMXSA: 14 + MIWaveTileMXSB: 2 + MIWaveTileMetadata: 0 + MacroTile0: 448 + MacroTile1: 64 + MacroTileA: 448 + MacroTileB: 64 + MacroTileMXSA: 448 + MacroTileMXSB: 64 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + MinGRIncPerMfma: 3 + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonDTLTailLoopMXSA: true + NonDTLTailLoopMXSB: true + NonTemporal: -1 + NonTemporalA: 4 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 4 + 
NonTemporalE: 0 + NonTemporalMXSA: 0 + NonTemporalMXSB: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 112 + NumGlobalWriteVectorsPerThread: 112 + NumLdsBlk: 2 + NumLoadsA: 14 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsCoalescedMXSA: 1 + NumLoadsCoalescedMXSB: 1 + NumLoadsMXSA: 1 + NumLoadsMXSB: 1 + NumLoadsPerpendicularA: 14 + NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularMXSA: 1 + NumLoadsPerpendicularMXSB: 1 + NumThreads: 256 + NumTotalPackedLoadsA: -1 + NumTotalPackedLoadsB: -1 + NumTotalPackedLoadsMXSA: -1 + NumTotalPackedLoadsMXSB: -1 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGROverBarrier: 0 + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 29 + SolutionNameMin: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT448x64x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT14_2_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM16_WGMXCC2_WGMXCCGn1 + SourceSwap: false + SpaceFillingAlgo: [] + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: true + StoreSyncOpt: 0 + StoreVectorWidth: 4 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 
32 + SubGroupMXSA: 8 + SubGroupMXSB: 32 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: 0 + TailloopInNll: false + ThreadTile: [1, 1] + ThreadTile0: 56 + ThreadTile1: 2 + ThreadTileA: 56 + ThreadTileB: 2 + ThreadTileMXSA: 56 + ThreadTileMXSB: 2 + TransposeLDS: 1 + TransposeLDSMetadata: -1 + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMXSA: true + UnrollMajorLDSMXSB: true + Use64bShadowLimit: false + Use64bShadowLimitMX: false + UseCustomMainLoopSchedule: 0 + UseDirect32XEmulation: false + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: false + UseGeneralizedNLCOneA: false + UseGeneralizedNLCOneB: false + UseGeneralizedNLCOneMXSA: false + UseGeneralizedNLCOneMXSB: false + UseGeneralizedNLCOneMetadata: false + UseInstOffsetForGRO: 0 + UseMFMAF32XEmulation: false + UsePLRPack: 0 + UseSgprForGRO: -1 + UseSubtileImpl: true + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 1 + VectorWidthMXSA: 1 + VectorWidthMXSB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMXSA: 0 + WaveSeparateGlobalReadMXSB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 16 + WorkGroupMappingXCC: 2 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 256 + _DepthUA: 256 + _DepthUB: 256 + _DepthUMXSA: 8 + _DepthUMXSB: 8 + _DepthUMetadata: 256 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + enableLDSTrMXSA: false + enableLDSTrMXSB: false + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + tailLoopOptMXSA: false + tailLoopOptMXSB: false + - 1LDSBuffer: 0 + 
ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AdaptiveGemm: 0 + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1DivByMT1LowbitGT1: 0 + AssertFree1ElementMultiple: 1 + AssertKRingShiftTailWrapOnly: 0 + AssertSummationElementMultiple: 256 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BAddrInterleave: false + BaseName: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT12NCUb78G7QzN6ng2b3-8rphTRtJXo20S1KvmwH6k1FLQ= + BufferLoad: true + BufferStore: 1 + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 256 + DirectToLds: 1 + DirectToLdsA: true + DirectToLdsB: true + DirectToLdsMXSA: true + DirectToLdsMXSB: true + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprMXSA: false + DirectToVgprMXSB: false + DirectToVgprSparseMetadata: false + DtlPlusLdsBuf: 0 + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: false + ExpertSchedulingMode: 0 + ExtraLatencyForLR: 0 + ExtraMiLatencyLeft: -1 + ForceDisableShadowInit: false + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 32 + GlobalReadVectorWidthB: 32 + GlobalReadVectorWidthMXSA: 4 + GlobalReadVectorWidthMXSB: 14 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + KRingShift: false + Kernel: true + KernelLanguage: Assembly + KernelNameMin: 
Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT128x448x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT4_14_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 256 + LSCB: 256 + LSCMXSA: 8 + LSCMXSB: 8 + LSPA: 32 + LSPB: 32 + LSPMXSA: 128 + LSPMXSB: 448 + LVCA: 8 + LVCB: 8 + LVCMXSA: 2 + LVCMXSB: 1 + LVPA: 1 + LVPB: 1 + LVPMXSA: 32 + LVPMXSB: 32 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsBlockSizePerPadMXSA: 0 + LdsBlockSizePerPadMXSB: 0 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 156672 + LdsInitCVgprs: false + LdsNumBytes: 156672 + LdsNumElementsAlignedA: 16384 + LdsNumElementsAlignedB: 57344 + LdsNumElementsAlignedMXSA: 1024 + LdsNumElementsAlignedMXSB: 3584 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 78336 + LdsOffsetB: 20992 + LdsOffsetB_Blk: 99328 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMXSA: 16384 + LdsOffsetMXSA_Blk: 94720 + LdsOffsetMXSB: 17408 + LdsOffsetMXSB_Blk: 95744 + LdsOffsetMetadata: 20992 + LdsOffsetMetadata_Blk: 99328 + LdsPadA: 0 + LdsPadB: 0 + LdsPadMXSA: 0 + LdsPadMXSB: 0 + LdsPadMetadata: 0 + LocalReadVectorWidth: -1 + LocalReadVectorWidthA: 32 + LocalReadVectorWidthB: 32 + LocalReadVectorWidthMXSA: 1 + LocalReadVectorWidthMXSB: 1 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LocalWriteUseSgprMXSA: true + LocalWriteUseSgprMXSB: true + LoopIters: 2 + LoopUnroll: 256 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + 
MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMXSA: 1 + MIInputPerThreadMXSB: 1 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 14] + MIWaveTileA: 4 + MIWaveTileB: 14 + MIWaveTileMXSA: 4 + MIWaveTileMXSB: 14 + MIWaveTileMetadata: 0 + MacroTile0: 128 + MacroTile1: 448 + MacroTileA: 128 + MacroTileB: 448 + MacroTileMXSA: 128 + MacroTileMXSB: 448 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + MinGRIncPerMfma: 3 + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonDTLTailLoopMXSA: true + NonDTLTailLoopMXSB: true + NonTemporal: -1 + NonTemporalA: 4 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 4 + NonTemporalE: 0 + NonTemporalMXSA: 0 + NonTemporalMXSB: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 224 + NumGlobalWriteVectorsPerThread: 224 + NumLdsBlk: 2 + NumLoadsA: 4 + NumLoadsB: 14 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsCoalescedMXSA: 1 + NumLoadsCoalescedMXSB: 1 + NumLoadsMXSA: 1 + NumLoadsMXSB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 14 + NumLoadsPerpendicularMXSA: 1 + NumLoadsPerpendicularMXSB: 1 + NumThreads: 256 + NumTotalPackedLoadsA: -1 + NumTotalPackedLoadsB: -1 + NumTotalPackedLoadsMXSA: -1 + NumTotalPackedLoadsMXSB: -1 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGROverBarrier: 0 + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 30 + 
SolutionNameMin: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT128x448x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT4_14_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM16_WGMXCC2_WGMXCCGn1 + SourceSwap: false + SpaceFillingAlgo: [] + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: true + StoreSyncOpt: 0 + StoreVectorWidth: 4 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SubGroupMXSA: 8 + SubGroupMXSB: 32 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: 0 + TailloopInNll: false + ThreadTile: [1, 1] + ThreadTile0: 16 + ThreadTile1: 14 + ThreadTileA: 16 + ThreadTileB: 14 + ThreadTileMXSA: 16 + ThreadTileMXSB: 14 + TransposeLDS: 1 + TransposeLDSMetadata: -1 + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMXSA: true + UnrollMajorLDSMXSB: true + Use64bShadowLimit: false + Use64bShadowLimitMX: false + UseCustomMainLoopSchedule: 0 + UseDirect32XEmulation: false + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: false + UseGeneralizedNLCOneA: false + UseGeneralizedNLCOneB: false + UseGeneralizedNLCOneMXSA: false + UseGeneralizedNLCOneMXSB: false + UseGeneralizedNLCOneMetadata: false + UseInstOffsetForGRO: 0 + UseMFMAF32XEmulation: false + UsePLRPack: 0 + UseSgprForGRO: -1 + UseSubtileImpl: true + Valid: true + VectorStore: -1 + VectorWidthA: 1 
+ VectorWidthB: 1 + VectorWidthMXSA: 1 + VectorWidthMXSB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMXSA: 0 + WaveSeparateGlobalReadMXSB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 16 + WorkGroupMappingXCC: 2 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 256 + _DepthUA: 256 + _DepthUB: 256 + _DepthUMXSA: 8 + _DepthUMXSB: 8 + _DepthUMetadata: 256 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + enableLDSTrMXSA: false + enableLDSTrMXSB: false + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + tailLoopOptMXSA: false + tailLoopOptMXSB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AdaptiveGemm: 0 + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1DivByMT1LowbitGT1: 0 + AssertFree1ElementMultiple: 1 + AssertKRingShiftTailWrapOnly: 0 + AssertSummationElementMultiple: 256 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BAddrInterleave: false + BaseName: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT44K2rcrzW2gkNghqcoHIxk3lU0xSXGNO6q0t-lzqdm54M= + BufferLoad: true + BufferStore: 1 + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 256 + DirectToLds: 1 + DirectToLdsA: true + DirectToLdsB: true + DirectToLdsMXSA: true + DirectToLdsMXSB: true + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprMXSA: false + DirectToVgprMXSB: false + DirectToVgprSparseMetadata: false 
+ DtlPlusLdsBuf: 0 + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: false + ExpertSchedulingMode: 0 + ExtraLatencyForLR: 0 + ExtraMiLatencyLeft: -1 + ForceDisableShadowInit: false + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 32 + GlobalReadVectorWidthB: 32 + GlobalReadVectorWidthMXSA: 14 + GlobalReadVectorWidthMXSB: 4 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + KRingShift: false + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT448x128x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT14_4_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 256 + LSCB: 256 + LSCMXSA: 8 + LSCMXSB: 8 + LSPA: 32 + LSPB: 32 + LSPMXSA: 448 + LSPMXSB: 128 + LVCA: 8 + LVCB: 8 + LVCMXSA: 1 + LVCMXSB: 2 + LVPA: 1 + LVPB: 1 + LVPMXSA: 32 + LVPMXSB: 32 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsBlockSizePerPadMXSA: 0 + LdsBlockSizePerPadMXSB: 0 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 156672 + LdsInitCVgprs: false + LdsNumBytes: 156672 + 
LdsNumElementsAlignedA: 57344 + LdsNumElementsAlignedB: 16384 + LdsNumElementsAlignedMXSA: 3584 + LdsNumElementsAlignedMXSB: 1024 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 78336 + LdsOffsetB: 61952 + LdsOffsetB_Blk: 140288 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMXSA: 57344 + LdsOffsetMXSA_Blk: 135680 + LdsOffsetMXSB: 60928 + LdsOffsetMXSB_Blk: 139264 + LdsOffsetMetadata: 61952 + LdsOffsetMetadata_Blk: 140288 + LdsPadA: 0 + LdsPadB: 0 + LdsPadMXSA: 0 + LdsPadMXSB: 0 + LdsPadMetadata: 0 + LocalReadVectorWidth: -1 + LocalReadVectorWidthA: 32 + LocalReadVectorWidthB: 32 + LocalReadVectorWidthMXSA: 1 + LocalReadVectorWidthMXSB: 1 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LocalWriteUseSgprMXSA: true + LocalWriteUseSgprMXSB: true + LoopIters: 2 + LoopUnroll: 256 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMXSA: 1 + MIInputPerThreadMXSB: 1 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [14, 4] + MIWaveTileA: 14 + MIWaveTileB: 4 + MIWaveTileMXSA: 14 + MIWaveTileMXSB: 4 + MIWaveTileMetadata: 0 + MacroTile0: 448 + MacroTile1: 128 + MacroTileA: 448 + MacroTileB: 128 + MacroTileMXSA: 448 + MacroTileMXSB: 128 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + MinGRIncPerMfma: 3 + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonDTLTailLoopMXSA: true + NonDTLTailLoopMXSB: true + NonTemporal: -1 + NonTemporalA: 4 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 4 + 
NonTemporalE: 0 + NonTemporalMXSA: 0 + NonTemporalMXSB: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 224 + NumGlobalWriteVectorsPerThread: 224 + NumLdsBlk: 2 + NumLoadsA: 14 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsCoalescedMXSA: 1 + NumLoadsCoalescedMXSB: 1 + NumLoadsMXSA: 1 + NumLoadsMXSB: 1 + NumLoadsPerpendicularA: 14 + NumLoadsPerpendicularB: 4 + NumLoadsPerpendicularMXSA: 1 + NumLoadsPerpendicularMXSB: 1 + NumThreads: 256 + NumTotalPackedLoadsA: -1 + NumTotalPackedLoadsB: -1 + NumTotalPackedLoadsMXSA: -1 + NumTotalPackedLoadsMXSB: -1 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGROverBarrier: 0 + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 31 + SolutionNameMin: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT448x128x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT14_4_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM16_WGMXCC2_WGMXCCGn1 + SourceSwap: false + SpaceFillingAlgo: [] + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: true + StoreSyncOpt: 0 + StoreVectorWidth: 4 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + 
SubGroupB: 32 + SubGroupMXSA: 8 + SubGroupMXSB: 32 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: 0 + TailloopInNll: false + ThreadTile: [1, 1] + ThreadTile0: 56 + ThreadTile1: 4 + ThreadTileA: 56 + ThreadTileB: 4 + ThreadTileMXSA: 56 + ThreadTileMXSB: 4 + TransposeLDS: 1 + TransposeLDSMetadata: -1 + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMXSA: true + UnrollMajorLDSMXSB: true + Use64bShadowLimit: false + Use64bShadowLimitMX: false + UseCustomMainLoopSchedule: 0 + UseDirect32XEmulation: false + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: false + UseGeneralizedNLCOneA: false + UseGeneralizedNLCOneB: false + UseGeneralizedNLCOneMXSA: false + UseGeneralizedNLCOneMXSB: false + UseGeneralizedNLCOneMetadata: false + UseInstOffsetForGRO: 0 + UseMFMAF32XEmulation: false + UsePLRPack: 0 + UseSgprForGRO: -1 + UseSubtileImpl: true + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 1 + VectorWidthMXSA: 1 + VectorWidthMXSB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMXSA: 0 + WaveSeparateGlobalReadMXSB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 16 + WorkGroupMappingXCC: 2 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 256 + _DepthUA: 256 + _DepthUB: 256 + _DepthUMXSA: 8 + _DepthUMXSB: 8 + _DepthUMetadata: 256 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + enableLDSTrMXSA: false + enableLDSTrMXSB: false + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + tailLoopOptMXSA: false + tailLoopOptMXSB: false + - 1LDSBuffer: 
0 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AdaptiveGemm: 0 + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1DivByMT1LowbitGT1: 0 + AssertFree1ElementMultiple: 1 + AssertKRingShiftTailWrapOnly: 0 + AssertSummationElementMultiple: 256 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BAddrInterleave: false + BaseName: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT197rqXtfRTrgckooWIHV9QRZkAUvBYDajyR7fyXPSaOfI= + BufferLoad: true + BufferStore: 1 + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 256 + DirectToLds: 1 + DirectToLdsA: true + DirectToLdsB: true + DirectToLdsMXSA: true + DirectToLdsMXSB: true + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprMXSA: false + DirectToVgprMXSB: false + DirectToVgprSparseMetadata: false + DtlPlusLdsBuf: 0 + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: false + ExpertSchedulingMode: 0 + ExtraLatencyForLR: 0 + ExtraMiLatencyLeft: -1 + ForceDisableShadowInit: false + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 32 + GlobalReadVectorWidthB: 32 + GlobalReadVectorWidthMXSA: 6 + GlobalReadVectorWidthMXSB: 12 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + KRingShift: false + Kernel: true + KernelLanguage: Assembly + KernelNameMin: 
Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT192x384x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT6_12_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 256 + LSCB: 256 + LSCMXSA: 8 + LSCMXSB: 8 + LSPA: 32 + LSPB: 32 + LSPMXSA: 192 + LSPMXSB: 384 + LVCA: 8 + LVCB: 8 + LVCMXSA: 2 + LVCMXSB: 1 + LVPA: 1 + LVPB: 1 + LVPMXSA: 32 + LVPMXSB: 32 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsBlockSizePerPadMXSA: 0 + LdsBlockSizePerPadMXSB: 0 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 156672 + LdsInitCVgprs: false + LdsNumBytes: 156672 + LdsNumElementsAlignedA: 24576 + LdsNumElementsAlignedB: 49152 + LdsNumElementsAlignedMXSA: 1536 + LdsNumElementsAlignedMXSB: 3072 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 78336 + LdsOffsetB: 29184 + LdsOffsetB_Blk: 107520 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMXSA: 24576 + LdsOffsetMXSA_Blk: 102912 + LdsOffsetMXSB: 26112 + LdsOffsetMXSB_Blk: 104448 + LdsOffsetMetadata: 29184 + LdsOffsetMetadata_Blk: 107520 + LdsPadA: 0 + LdsPadB: 0 + LdsPadMXSA: 0 + LdsPadMXSB: 0 + LdsPadMetadata: 0 + LocalReadVectorWidth: -1 + LocalReadVectorWidthA: 32 + LocalReadVectorWidthB: 32 + LocalReadVectorWidthMXSA: 1 + LocalReadVectorWidthMXSB: 1 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LocalWriteUseSgprMXSA: true + LocalWriteUseSgprMXSB: true + LoopIters: 2 + LoopUnroll: 256 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + 
MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMXSA: 1 + MIInputPerThreadMXSB: 1 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [6, 12] + MIWaveTileA: 6 + MIWaveTileB: 12 + MIWaveTileMXSA: 6 + MIWaveTileMXSB: 12 + MIWaveTileMetadata: 0 + MacroTile0: 192 + MacroTile1: 384 + MacroTileA: 192 + MacroTileB: 384 + MacroTileMXSA: 192 + MacroTileMXSB: 384 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + MinGRIncPerMfma: 3 + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonDTLTailLoopMXSA: true + NonDTLTailLoopMXSB: true + NonTemporal: -1 + NonTemporalA: 4 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 4 + NonTemporalE: 0 + NonTemporalMXSA: 0 + NonTemporalMXSB: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 288 + NumGlobalWriteVectorsPerThread: 288 + NumLdsBlk: 2 + NumLoadsA: 6 + NumLoadsB: 12 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsCoalescedMXSA: 1 + NumLoadsCoalescedMXSB: 1 + NumLoadsMXSA: 1 + NumLoadsMXSB: 1 + NumLoadsPerpendicularA: 6 + NumLoadsPerpendicularB: 12 + NumLoadsPerpendicularMXSA: 1 + NumLoadsPerpendicularMXSB: 1 + NumThreads: 256 + NumTotalPackedLoadsA: -1 + NumTotalPackedLoadsB: -1 + NumTotalPackedLoadsMXSA: -1 + NumTotalPackedLoadsMXSB: -1 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGROverBarrier: 0 + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 32 + 
SolutionNameMin: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT192x384x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT6_12_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM16_WGMXCC2_WGMXCCGn1 + SourceSwap: false + SpaceFillingAlgo: [] + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: true + StoreSyncOpt: 0 + StoreVectorWidth: 4 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SubGroupMXSA: 8 + SubGroupMXSB: 32 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: 0 + TailloopInNll: false + ThreadTile: [1, 1] + ThreadTile0: 24 + ThreadTile1: 12 + ThreadTileA: 24 + ThreadTileB: 12 + ThreadTileMXSA: 24 + ThreadTileMXSB: 12 + TransposeLDS: 1 + TransposeLDSMetadata: -1 + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMXSA: true + UnrollMajorLDSMXSB: true + Use64bShadowLimit: false + Use64bShadowLimitMX: false + UseCustomMainLoopSchedule: 0 + UseDirect32XEmulation: false + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: false + UseGeneralizedNLCOneA: false + UseGeneralizedNLCOneB: false + UseGeneralizedNLCOneMXSA: false + UseGeneralizedNLCOneMXSB: false + UseGeneralizedNLCOneMetadata: false + UseInstOffsetForGRO: 0 + UseMFMAF32XEmulation: false + UsePLRPack: 0 + UseSgprForGRO: -1 + UseSubtileImpl: true + Valid: true + VectorStore: -1 + VectorWidthA: 1 
+ VectorWidthB: 1 + VectorWidthMXSA: 1 + VectorWidthMXSB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMXSA: 0 + WaveSeparateGlobalReadMXSB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 16 + WorkGroupMappingXCC: 2 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 256 + _DepthUA: 256 + _DepthUB: 256 + _DepthUMXSA: 8 + _DepthUMXSB: 8 + _DepthUMetadata: 256 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + enableLDSTrMXSA: false + enableLDSTrMXSB: false + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + tailLoopOptMXSA: false + tailLoopOptMXSB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AdaptiveGemm: 0 + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1DivByMT1LowbitGT1: 0 + AssertFree1ElementMultiple: 1 + AssertKRingShiftTailWrapOnly: 0 + AssertSummationElementMultiple: 256 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BAddrInterleave: false + BaseName: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT38M4Hg58ewW76BBGjsIKCCfd9HK3CXsd22WqnddQrMdIo= + BufferLoad: true + BufferStore: 1 + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 256 + DirectToLds: 1 + DirectToLdsA: true + DirectToLdsB: true + DirectToLdsMXSA: true + DirectToLdsMXSB: true + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprMXSA: false + DirectToVgprMXSB: false + DirectToVgprSparseMetadata: false 
+ DtlPlusLdsBuf: 0 + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: false + ExpertSchedulingMode: 0 + ExtraLatencyForLR: 0 + ExtraMiLatencyLeft: -1 + ForceDisableShadowInit: false + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 32 + GlobalReadVectorWidthB: 32 + GlobalReadVectorWidthMXSA: 12 + GlobalReadVectorWidthMXSB: 6 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + KRingShift: false + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT384x192x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT12_6_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 256 + LSCB: 256 + LSCMXSA: 8 + LSCMXSB: 8 + LSPA: 32 + LSPB: 32 + LSPMXSA: 384 + LSPMXSB: 192 + LVCA: 8 + LVCB: 8 + LVCMXSA: 1 + LVCMXSB: 2 + LVPA: 1 + LVPB: 1 + LVPMXSA: 32 + LVPMXSB: 32 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsBlockSizePerPadMXSA: 0 + LdsBlockSizePerPadMXSB: 0 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 156672 + LdsInitCVgprs: false + LdsNumBytes: 156672 + 
LdsNumElementsAlignedA: 49152 + LdsNumElementsAlignedB: 24576 + LdsNumElementsAlignedMXSA: 3072 + LdsNumElementsAlignedMXSB: 1536 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 78336 + LdsOffsetB: 53760 + LdsOffsetB_Blk: 132096 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMXSA: 49152 + LdsOffsetMXSA_Blk: 127488 + LdsOffsetMXSB: 52224 + LdsOffsetMXSB_Blk: 130560 + LdsOffsetMetadata: 53760 + LdsOffsetMetadata_Blk: 132096 + LdsPadA: 0 + LdsPadB: 0 + LdsPadMXSA: 0 + LdsPadMXSB: 0 + LdsPadMetadata: 0 + LocalReadVectorWidth: -1 + LocalReadVectorWidthA: 32 + LocalReadVectorWidthB: 32 + LocalReadVectorWidthMXSA: 1 + LocalReadVectorWidthMXSB: 1 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LocalWriteUseSgprMXSA: true + LocalWriteUseSgprMXSB: true + LoopIters: 2 + LoopUnroll: 256 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMXSA: 1 + MIInputPerThreadMXSB: 1 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [12, 6] + MIWaveTileA: 12 + MIWaveTileB: 6 + MIWaveTileMXSA: 12 + MIWaveTileMXSB: 6 + MIWaveTileMetadata: 0 + MacroTile0: 384 + MacroTile1: 192 + MacroTileA: 384 + MacroTileB: 192 + MacroTileMXSA: 384 + MacroTileMXSB: 192 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + MinGRIncPerMfma: 3 + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonDTLTailLoopMXSA: true + NonDTLTailLoopMXSB: true + NonTemporal: -1 + NonTemporalA: 4 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 4 + 
NonTemporalE: 0 + NonTemporalMXSA: 0 + NonTemporalMXSB: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 288 + NumGlobalWriteVectorsPerThread: 288 + NumLdsBlk: 2 + NumLoadsA: 12 + NumLoadsB: 6 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsCoalescedMXSA: 1 + NumLoadsCoalescedMXSB: 1 + NumLoadsMXSA: 1 + NumLoadsMXSB: 1 + NumLoadsPerpendicularA: 12 + NumLoadsPerpendicularB: 6 + NumLoadsPerpendicularMXSA: 1 + NumLoadsPerpendicularMXSB: 1 + NumThreads: 256 + NumTotalPackedLoadsA: -1 + NumTotalPackedLoadsB: -1 + NumTotalPackedLoadsMXSA: -1 + NumTotalPackedLoadsMXSB: -1 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGROverBarrier: 0 + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 33 + SolutionNameMin: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT384x192x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT12_6_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM16_WGMXCC2_WGMXCCGn1 + SourceSwap: false + SpaceFillingAlgo: [] + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: true + StoreSyncOpt: 0 + StoreVectorWidth: 4 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + 
SubGroupB: 32 + SubGroupMXSA: 8 + SubGroupMXSB: 32 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: 0 + TailloopInNll: false + ThreadTile: [1, 1] + ThreadTile0: 48 + ThreadTile1: 6 + ThreadTileA: 48 + ThreadTileB: 6 + ThreadTileMXSA: 48 + ThreadTileMXSB: 6 + TransposeLDS: 1 + TransposeLDSMetadata: -1 + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMXSA: true + UnrollMajorLDSMXSB: true + Use64bShadowLimit: false + Use64bShadowLimitMX: false + UseCustomMainLoopSchedule: 0 + UseDirect32XEmulation: false + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: false + UseGeneralizedNLCOneA: false + UseGeneralizedNLCOneB: false + UseGeneralizedNLCOneMXSA: false + UseGeneralizedNLCOneMXSB: false + UseGeneralizedNLCOneMetadata: false + UseInstOffsetForGRO: 0 + UseMFMAF32XEmulation: false + UsePLRPack: 0 + UseSgprForGRO: -1 + UseSubtileImpl: true + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 1 + VectorWidthMXSA: 1 + VectorWidthMXSB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMXSA: 0 + WaveSeparateGlobalReadMXSB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 16 + WorkGroupMappingXCC: 2 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 256 + _DepthUA: 256 + _DepthUB: 256 + _DepthUMXSA: 8 + _DepthUMXSB: 8 + _DepthUMetadata: 256 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + enableLDSTrMXSA: false + enableLDSTrMXSB: false + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + tailLoopOptMXSA: false + tailLoopOptMXSB: false + - 1LDSBuffer: 
0 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AdaptiveGemm: 0 + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1DivByMT1LowbitGT1: 0 + AssertFree1ElementMultiple: 1 + AssertKRingShiftTailWrapOnly: 0 + AssertSummationElementMultiple: 256 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BAddrInterleave: false + BaseName: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT12pSBqfd5dzj78eX1t5EQJzYZ5aHFVtQ9uJHS361sw3qY= + BufferLoad: true + BufferStore: 1 + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 256 + DirectToLds: 1 + DirectToLdsA: true + DirectToLdsB: true + DirectToLdsMXSA: true + DirectToLdsMXSB: true + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprMXSA: false + DirectToVgprMXSB: false + DirectToVgprSparseMetadata: false + DtlPlusLdsBuf: 0 + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: false + ExpertSchedulingMode: 0 + ExtraLatencyForLR: 0 + ExtraMiLatencyLeft: -1 + ForceDisableShadowInit: false + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 32 + GlobalReadVectorWidthB: 32 + GlobalReadVectorWidthMXSA: 4 + GlobalReadVectorWidthMXSB: 1 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + KRingShift: false + Kernel: true + KernelLanguage: Assembly + KernelNameMin: 
Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT128x32x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT2_2_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1 + LDSTrInst: false + LSCA: 256 + LSCB: 256 + LSCMXSA: 8 + LSCMXSB: 8 + LSPA: 32 + LSPB: 32 + LSPMXSA: 128 + LSPMXSB: 32 + LVCA: 8 + LVCB: 8 + LVCMXSA: 2 + LVCMXSB: 8 + LVPA: 1 + LVPB: 1 + LVPMXSA: 32 + LVPMXSB: 32 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsBlockSizePerPadMXSA: 0 + LdsBlockSizePerPadMXSB: 0 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 54528 + LdsInitCVgprs: false + LdsNumBytes: 54528 + LdsNumElementsAlignedA: 16384 + LdsNumElementsAlignedB: 4096 + LdsNumElementsAlignedMXSA: 1024 + LdsNumElementsAlignedMXSB: 256 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 32768 + LdsOffsetB: 17664 + LdsOffsetB_Blk: 50432 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMXSA: 16384 + LdsOffsetMXSA_Blk: 49152 + LdsOffsetMXSB: 17408 + LdsOffsetMXSB_Blk: 50176 + LdsOffsetMetadata: 17664 + LdsOffsetMetadata_Blk: 50432 + LdsPadA: 0 + LdsPadB: 0 + LdsPadMXSA: 0 + LdsPadMXSB: 0 + LdsPadMetadata: 0 + LocalReadVectorWidth: -1 + LocalReadVectorWidthA: 32 + LocalReadVectorWidthB: 32 + LocalReadVectorWidthMXSA: 1 + LocalReadVectorWidthMXSB: 1 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LocalWriteUseSgprMXSA: true + LocalWriteUseSgprMXSB: true + LoopIters: 2 + LoopUnroll: 256 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + 
MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMXSA: 1 + MIInputPerThreadMXSB: 1 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [4, 1] + MIWaveTile: [2, 2] + MIWaveTileA: 2 + MIWaveTileB: 2 + MIWaveTileMXSA: 2 + MIWaveTileMXSB: 2 + MIWaveTileMetadata: 0 + MacroTile0: 128 + MacroTile1: 32 + MacroTileA: 128 + MacroTileB: 32 + MacroTileMXSA: 128 + MacroTileMXSB: 32 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + MinGRIncPerMfma: 3 + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonDTLTailLoopMXSA: true + NonDTLTailLoopMXSB: true + NonTemporal: -1 + NonTemporalA: 4 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 4 + NonTemporalE: 0 + NonTemporalMXSA: 0 + NonTemporalMXSB: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 16 + NumLdsBlk: 2 + NumLoadsA: 4 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsCoalescedMXSA: 1 + NumLoadsCoalescedMXSB: 1 + NumLoadsMXSA: 1 + NumLoadsMXSB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularMXSA: 1 + NumLoadsPerpendicularMXSB: 1 + NumThreads: 256 + NumTotalPackedLoadsA: -1 + NumTotalPackedLoadsB: -1 + NumTotalPackedLoadsMXSA: -1 + NumTotalPackedLoadsMXSB: -1 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGROverBarrier: 0 + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 34 + SolutionNameMin: 
Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT128x32x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT2_2_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM16_WGMXCC2_WGMXCCGn1 + SourceSwap: false + SpaceFillingAlgo: [] + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 4 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SubGroupMXSA: 16 + SubGroupMXSB: 16 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: 0 + TailloopInNll: false + ThreadTile: [1, 1] + ThreadTile0: 8 + ThreadTile1: 2 + ThreadTileA: 8 + ThreadTileB: 2 + ThreadTileMXSA: 8 + ThreadTileMXSB: 2 + TransposeLDS: 1 + TransposeLDSMetadata: -1 + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMXSA: true + UnrollMajorLDSMXSB: true + Use64bShadowLimit: false + Use64bShadowLimitMX: false + UseCustomMainLoopSchedule: 0 + UseDirect32XEmulation: false + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: false + UseGeneralizedNLCOneA: false + UseGeneralizedNLCOneB: false + UseGeneralizedNLCOneMXSA: false + UseGeneralizedNLCOneMXSB: false + UseGeneralizedNLCOneMetadata: false + UseInstOffsetForGRO: 0 + UseMFMAF32XEmulation: false + UsePLRPack: 0 + UseSgprForGRO: -1 + UseSubtileImpl: true + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 1 + 
VectorWidthMXSA: 1 + VectorWidthMXSB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMXSA: 0 + WaveSeparateGlobalReadMXSB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [64, 4, 1] + WorkGroupMapping: 16 + WorkGroupMappingXCC: 2 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 256 + _DepthUA: 256 + _DepthUB: 256 + _DepthUMXSA: 8 + _DepthUMXSB: 8 + _DepthUMetadata: 256 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + enableLDSTrMXSA: false + enableLDSTrMXSB: false + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + tailLoopOptMXSA: false + tailLoopOptMXSB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AdaptiveGemm: 0 + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1DivByMT1LowbitGT1: 0 + AssertFree1ElementMultiple: 1 + AssertKRingShiftTailWrapOnly: 0 + AssertSummationElementMultiple: 256 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BAddrInterleave: false + BaseName: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT12lYJO4P9-MC57Obi2kvcZAOwe_1ysdtWgX7-VIBNgQpw= + BufferLoad: true + BufferStore: 1 + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 256 + DirectToLds: 1 + DirectToLdsA: true + DirectToLdsB: true + DirectToLdsMXSA: true + DirectToLdsMXSB: true + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprMXSA: false + DirectToVgprMXSB: false + DirectToVgprSparseMetadata: false + DtlPlusLdsBuf: 0 
+ EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: false + ExpertSchedulingMode: 0 + ExtraLatencyForLR: 0 + ExtraMiLatencyLeft: -1 + ForceDisableShadowInit: false + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 32 + GlobalReadVectorWidthB: 32 + GlobalReadVectorWidthMXSA: 4 + GlobalReadVectorWidthMXSB: 3 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + KRingShift: false + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT128x96x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT2_6_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1 + LDSTrInst: false + LSCA: 256 + LSCB: 256 + LSCMXSA: 8 + LSCMXSB: 8 + LSPA: 32 + LSPB: 32 + LSPMXSA: 128 + LSPMXSB: 96 + LVCA: 8 + LVCB: 8 + LVCMXSA: 2 + LVCMXSB: 3 + LVPA: 1 + LVPB: 1 + LVPMXSA: 32 + LVPMXSB: 32 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsBlockSizePerPadMXSA: 0 + LdsBlockSizePerPadMXSB: 0 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 63232 + LdsInitCVgprs: false + LdsNumBytes: 63232 + LdsNumElementsAlignedA: 16384 + 
LdsNumElementsAlignedB: 12288 + LdsNumElementsAlignedMXSA: 1024 + LdsNumElementsAlignedMXSB: 768 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 32768 + LdsOffsetB: 18176 + LdsOffsetB_Blk: 50944 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMXSA: 16384 + LdsOffsetMXSA_Blk: 49152 + LdsOffsetMXSB: 17408 + LdsOffsetMXSB_Blk: 50176 + LdsOffsetMetadata: 18176 + LdsOffsetMetadata_Blk: 50944 + LdsPadA: 0 + LdsPadB: 0 + LdsPadMXSA: 0 + LdsPadMXSB: 0 + LdsPadMetadata: 0 + LocalReadVectorWidth: -1 + LocalReadVectorWidthA: 32 + LocalReadVectorWidthB: 32 + LocalReadVectorWidthMXSA: 1 + LocalReadVectorWidthMXSB: 1 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LocalWriteUseSgprMXSA: true + LocalWriteUseSgprMXSB: true + LoopIters: 2 + LoopUnroll: 256 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMXSA: 1 + MIInputPerThreadMXSB: 1 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [4, 1] + MIWaveTile: [2, 6] + MIWaveTileA: 2 + MIWaveTileB: 6 + MIWaveTileMXSA: 2 + MIWaveTileMXSB: 6 + MIWaveTileMetadata: 0 + MacroTile0: 128 + MacroTile1: 96 + MacroTileA: 128 + MacroTileB: 96 + MacroTileMXSA: 128 + MacroTileMXSB: 96 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + MinGRIncPerMfma: 3 + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonDTLTailLoopMXSA: true + NonDTLTailLoopMXSB: true + NonTemporal: -1 + NonTemporalA: 4 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 4 + NonTemporalE: 0 + NonTemporalMXSA: 0 + 
NonTemporalMXSB: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 48 + NumGlobalWriteVectorsPerThread: 48 + NumLdsBlk: 2 + NumLoadsA: 4 + NumLoadsB: 3 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsCoalescedMXSA: 1 + NumLoadsCoalescedMXSB: 1 + NumLoadsMXSA: 1 + NumLoadsMXSB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 3 + NumLoadsPerpendicularMXSA: 1 + NumLoadsPerpendicularMXSB: 1 + NumThreads: 256 + NumTotalPackedLoadsA: -1 + NumTotalPackedLoadsB: -1 + NumTotalPackedLoadsMXSA: -1 + NumTotalPackedLoadsMXSB: -1 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGROverBarrier: 0 + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 35 + SolutionNameMin: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT128x96x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT2_6_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM16_WGMXCC2_WGMXCCGn1 + SourceSwap: false + SpaceFillingAlgo: [] + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 4 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SubGroupMXSA: 16 + SubGroupMXSB: 16 
+ SuppressNoLoadLoop: false + SwapGlobalReadOrder: 0 + TailloopInNll: false + ThreadTile: [1, 1] + ThreadTile0: 8 + ThreadTile1: 6 + ThreadTileA: 8 + ThreadTileB: 6 + ThreadTileMXSA: 8 + ThreadTileMXSB: 6 + TransposeLDS: 1 + TransposeLDSMetadata: -1 + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMXSA: true + UnrollMajorLDSMXSB: true + Use64bShadowLimit: false + Use64bShadowLimitMX: false + UseCustomMainLoopSchedule: 0 + UseDirect32XEmulation: false + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: false + UseGeneralizedNLCOneA: false + UseGeneralizedNLCOneB: false + UseGeneralizedNLCOneMXSA: false + UseGeneralizedNLCOneMXSB: false + UseGeneralizedNLCOneMetadata: false + UseInstOffsetForGRO: 0 + UseMFMAF32XEmulation: false + UsePLRPack: 0 + UseSgprForGRO: -1 + UseSubtileImpl: true + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 1 + VectorWidthMXSA: 1 + VectorWidthMXSB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMXSA: 0 + WaveSeparateGlobalReadMXSB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [64, 4, 1] + WorkGroupMapping: 16 + WorkGroupMappingXCC: 2 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 256 + _DepthUA: 256 + _DepthUB: 256 + _DepthUMXSA: 8 + _DepthUMXSB: 8 + _DepthUMetadata: 256 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + enableLDSTrMXSA: false + enableLDSTrMXSB: false + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + tailLoopOptMXSA: false + tailLoopOptMXSB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: false + 
ActivationFused: true + AdaptiveGemm: 0 + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1DivByMT1LowbitGT1: 0 + AssertFree1ElementMultiple: 1 + AssertKRingShiftTailWrapOnly: 0 + AssertSummationElementMultiple: 256 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BAddrInterleave: false + BaseName: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT25ccxCqQ2dvGclrJYn0QqExpGJSq56GiHUJPL0jzB0qVw= + BufferLoad: true + BufferStore: 1 + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 256 + DirectToLds: 1 + DirectToLdsA: true + DirectToLdsB: true + DirectToLdsMXSA: true + DirectToLdsMXSB: true + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprMXSA: false + DirectToVgprMXSB: false + DirectToVgprSparseMetadata: false + DtlPlusLdsBuf: 0 + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: false + ExpertSchedulingMode: 0 + ExtraLatencyForLR: 0 + ExtraMiLatencyLeft: -1 + ForceDisableShadowInit: false + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 32 + GlobalReadVectorWidthB: 32 + GlobalReadVectorWidthMXSA: 4 + GlobalReadVectorWidthMXSB: 1 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + KRingShift: false + Kernel: true + KernelLanguage: Assembly + KernelNameMin: 
Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT256x32x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT4_2_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1 + LDSTrInst: false + LSCA: 256 + LSCB: 256 + LSCMXSA: 8 + LSCMXSB: 8 + LSPA: 32 + LSPB: 32 + LSPMXSA: 128 + LSPMXSB: 32 + LVCA: 8 + LVCB: 8 + LVCMXSA: 2 + LVCMXSB: 8 + LVPA: 1 + LVPB: 1 + LVPMXSA: 32 + LVPMXSB: 32 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsBlockSizePerPadMXSA: 0 + LdsBlockSizePerPadMXSB: 0 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 104704 + LdsInitCVgprs: false + LdsNumBytes: 104704 + LdsNumElementsAlignedA: 32768 + LdsNumElementsAlignedB: 4096 + LdsNumElementsAlignedMXSA: 2048 + LdsNumElementsAlignedMXSB: 256 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 35072 + LdsOffsetB_Blk: 100608 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMXSA: 32768 + LdsOffsetMXSA_Blk: 98304 + LdsOffsetMXSB: 34816 + LdsOffsetMXSB_Blk: 100352 + LdsOffsetMetadata: 35072 + LdsOffsetMetadata_Blk: 100608 + LdsPadA: 0 + LdsPadB: 0 + LdsPadMXSA: 0 + LdsPadMXSB: 0 + LdsPadMetadata: 0 + LocalReadVectorWidth: -1 + LocalReadVectorWidthA: 32 + LocalReadVectorWidthB: 32 + LocalReadVectorWidthMXSA: 1 + LocalReadVectorWidthMXSB: 1 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LocalWriteUseSgprMXSA: true + LocalWriteUseSgprMXSB: true + LoopIters: 2 + LoopUnroll: 256 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + 
MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMXSA: 1 + MIInputPerThreadMXSB: 1 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [4, 1] + MIWaveTile: [4, 2] + MIWaveTileA: 4 + MIWaveTileB: 2 + MIWaveTileMXSA: 4 + MIWaveTileMXSB: 2 + MIWaveTileMetadata: 0 + MacroTile0: 256 + MacroTile1: 32 + MacroTileA: 256 + MacroTileB: 32 + MacroTileMXSA: 256 + MacroTileMXSB: 32 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + MinGRIncPerMfma: 3 + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonDTLTailLoopMXSA: true + NonDTLTailLoopMXSB: true + NonTemporal: -1 + NonTemporalA: 4 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 4 + NonTemporalE: 0 + NonTemporalMXSA: 0 + NonTemporalMXSB: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 32 + NumLdsBlk: 2 + NumLoadsA: 8 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsCoalescedMXSA: 1 + NumLoadsCoalescedMXSB: 1 + NumLoadsMXSA: 2 + NumLoadsMXSB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularMXSA: 2 + NumLoadsPerpendicularMXSB: 1 + NumThreads: 256 + NumTotalPackedLoadsA: -1 + NumTotalPackedLoadsB: -1 + NumTotalPackedLoadsMXSA: -1 + NumTotalPackedLoadsMXSB: -1 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGROverBarrier: 0 + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 36 + SolutionNameMin: 
Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT256x32x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT4_2_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM16_WGMXCC2_WGMXCCGn1 + SourceSwap: false + SpaceFillingAlgo: [] + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 4 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SubGroupMXSA: 16 + SubGroupMXSB: 16 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: 0 + TailloopInNll: false + ThreadTile: [1, 1] + ThreadTile0: 16 + ThreadTile1: 2 + ThreadTileA: 16 + ThreadTileB: 2 + ThreadTileMXSA: 16 + ThreadTileMXSB: 2 + TransposeLDS: 1 + TransposeLDSMetadata: -1 + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMXSA: true + UnrollMajorLDSMXSB: true + Use64bShadowLimit: false + Use64bShadowLimitMX: false + UseCustomMainLoopSchedule: 0 + UseDirect32XEmulation: false + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: false + UseGeneralizedNLCOneA: false + UseGeneralizedNLCOneB: false + UseGeneralizedNLCOneMXSA: false + UseGeneralizedNLCOneMXSB: false + UseGeneralizedNLCOneMetadata: false + UseInstOffsetForGRO: 0 + UseMFMAF32XEmulation: false + UsePLRPack: 0 + UseSgprForGRO: -1 + UseSubtileImpl: true + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 1 
+ VectorWidthMXSA: 1 + VectorWidthMXSB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMXSA: 0 + WaveSeparateGlobalReadMXSB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [64, 4, 1] + WorkGroupMapping: 16 + WorkGroupMappingXCC: 2 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 256 + _DepthUA: 256 + _DepthUB: 256 + _DepthUMXSA: 8 + _DepthUMXSB: 8 + _DepthUMetadata: 256 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + enableLDSTrMXSA: false + enableLDSTrMXSB: false + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + tailLoopOptMXSA: false + tailLoopOptMXSB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AdaptiveGemm: 0 + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1DivByMT1LowbitGT1: 0 + AssertFree1ElementMultiple: 1 + AssertKRingShiftTailWrapOnly: 0 + AssertSummationElementMultiple: 256 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BAddrInterleave: false + BaseName: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT25LD47Rd1FCRkyTzzAeZkhMOPqncN77I3VtqjftUOo3W0= + BufferLoad: true + BufferStore: 1 + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 256 + DirectToLds: 1 + DirectToLdsA: true + DirectToLdsB: true + DirectToLdsMXSA: true + DirectToLdsMXSB: true + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprMXSA: false + DirectToVgprMXSB: false + DirectToVgprSparseMetadata: false + DtlPlusLdsBuf: 
0 + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: false + ExpertSchedulingMode: 0 + ExtraLatencyForLR: 0 + ExtraMiLatencyLeft: -1 + ForceDisableShadowInit: false + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 32 + GlobalReadVectorWidthB: 32 + GlobalReadVectorWidthMXSA: 4 + GlobalReadVectorWidthMXSB: 3 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + KRingShift: false + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT256x96x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT4_6_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1 + LDSTrInst: false + LSCA: 256 + LSCB: 256 + LSCMXSA: 8 + LSCMXSB: 8 + LSPA: 32 + LSPB: 32 + LSPMXSA: 128 + LSPMXSB: 96 + LVCA: 8 + LVCB: 8 + LVCMXSA: 2 + LVCMXSB: 3 + LVPA: 1 + LVPB: 1 + LVPMXSA: 32 + LVPMXSB: 32 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsBlockSizePerPadMXSA: 0 + LdsBlockSizePerPadMXSB: 0 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 113408 + LdsInitCVgprs: false + LdsNumBytes: 113408 + LdsNumElementsAlignedA: 32768 + 
LdsNumElementsAlignedB: 12288 + LdsNumElementsAlignedMXSA: 2048 + LdsNumElementsAlignedMXSB: 768 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 35584 + LdsOffsetB_Blk: 101120 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMXSA: 32768 + LdsOffsetMXSA_Blk: 98304 + LdsOffsetMXSB: 34816 + LdsOffsetMXSB_Blk: 100352 + LdsOffsetMetadata: 35584 + LdsOffsetMetadata_Blk: 101120 + LdsPadA: 0 + LdsPadB: 0 + LdsPadMXSA: 0 + LdsPadMXSB: 0 + LdsPadMetadata: 0 + LocalReadVectorWidth: -1 + LocalReadVectorWidthA: 32 + LocalReadVectorWidthB: 32 + LocalReadVectorWidthMXSA: 1 + LocalReadVectorWidthMXSB: 1 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LocalWriteUseSgprMXSA: true + LocalWriteUseSgprMXSB: true + LoopIters: 2 + LoopUnroll: 256 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMXSA: 1 + MIInputPerThreadMXSB: 1 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [4, 1] + MIWaveTile: [4, 6] + MIWaveTileA: 4 + MIWaveTileB: 6 + MIWaveTileMXSA: 4 + MIWaveTileMXSB: 6 + MIWaveTileMetadata: 0 + MacroTile0: 256 + MacroTile1: 96 + MacroTileA: 256 + MacroTileB: 96 + MacroTileMXSA: 256 + MacroTileMXSB: 96 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + MinGRIncPerMfma: 3 + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonDTLTailLoopMXSA: true + NonDTLTailLoopMXSB: true + NonTemporal: -1 + NonTemporalA: 4 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 4 + NonTemporalE: 0 + NonTemporalMXSA: 0 + 
NonTemporalMXSB: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 96 + NumGlobalWriteVectorsPerThread: 96 + NumLdsBlk: 2 + NumLoadsA: 8 + NumLoadsB: 3 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsCoalescedMXSA: 1 + NumLoadsCoalescedMXSB: 1 + NumLoadsMXSA: 2 + NumLoadsMXSB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 3 + NumLoadsPerpendicularMXSA: 2 + NumLoadsPerpendicularMXSB: 1 + NumThreads: 256 + NumTotalPackedLoadsA: -1 + NumTotalPackedLoadsB: -1 + NumTotalPackedLoadsMXSA: -1 + NumTotalPackedLoadsMXSB: -1 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGROverBarrier: 0 + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 37 + SolutionNameMin: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT256x96x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT4_6_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM16_WGMXCC2_WGMXCCGn1 + SourceSwap: false + SpaceFillingAlgo: [] + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 4 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SubGroupMXSA: 16 + SubGroupMXSB: 16 
+ SuppressNoLoadLoop: false + SwapGlobalReadOrder: 0 + TailloopInNll: false + ThreadTile: [1, 1] + ThreadTile0: 16 + ThreadTile1: 6 + ThreadTileA: 16 + ThreadTileB: 6 + ThreadTileMXSA: 16 + ThreadTileMXSB: 6 + TransposeLDS: 1 + TransposeLDSMetadata: -1 + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMXSA: true + UnrollMajorLDSMXSB: true + Use64bShadowLimit: false + Use64bShadowLimitMX: false + UseCustomMainLoopSchedule: 0 + UseDirect32XEmulation: false + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: false + UseGeneralizedNLCOneA: false + UseGeneralizedNLCOneB: false + UseGeneralizedNLCOneMXSA: false + UseGeneralizedNLCOneMXSB: false + UseGeneralizedNLCOneMetadata: false + UseInstOffsetForGRO: 0 + UseMFMAF32XEmulation: false + UsePLRPack: 0 + UseSgprForGRO: -1 + UseSubtileImpl: true + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 1 + VectorWidthMXSA: 1 + VectorWidthMXSB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMXSA: 0 + WaveSeparateGlobalReadMXSB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [64, 4, 1] + WorkGroupMapping: 16 + WorkGroupMappingXCC: 2 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 256 + _DepthUA: 256 + _DepthUB: 256 + _DepthUMXSA: 8 + _DepthUMXSB: 8 + _DepthUMetadata: 256 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + enableLDSTrMXSA: false + enableLDSTrMXSB: false + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + tailLoopOptMXSA: false + tailLoopOptMXSB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: 
false + ActivationFused: true + AdaptiveGemm: 0 + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1DivByMT1LowbitGT1: 0 + AssertFree1ElementMultiple: 1 + AssertKRingShiftTailWrapOnly: 0 + AssertSummationElementMultiple: 256 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BAddrInterleave: false + BaseName: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT252kruXsNK48kysY3byfZ8nkfbVQgYyySEfltsnGOOr7I= + BufferLoad: true + BufferStore: 1 + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 256 + DirectToLds: 1 + DirectToLdsA: true + DirectToLdsB: true + DirectToLdsMXSA: true + DirectToLdsMXSB: true + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprMXSA: false + DirectToVgprMXSB: false + DirectToVgprSparseMetadata: false + DtlPlusLdsBuf: 0 + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: false + ExpertSchedulingMode: 0 + ExtraLatencyForLR: 0 + ExtraMiLatencyLeft: -1 + ForceDisableShadowInit: false + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 32 + GlobalReadVectorWidthB: 32 + GlobalReadVectorWidthMXSA: 4 + GlobalReadVectorWidthMXSB: 5 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + KRingShift: false + Kernel: true + KernelLanguage: Assembly + KernelNameMin: 
Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT256x160x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT4_10_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1 + LDSTrInst: false + LSCA: 256 + LSCB: 256 + LSCMXSA: 8 + LSCMXSB: 8 + LSPA: 32 + LSPB: 32 + LSPMXSA: 128 + LSPMXSB: 160 + LVCA: 8 + LVCB: 8 + LVCMXSA: 2 + LVCMXSB: 2 + LVPA: 1 + LVPB: 1 + LVPMXSA: 32 + LVPMXSB: 32 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsBlockSizePerPadMXSA: 0 + LdsBlockSizePerPadMXSB: 0 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 122112 + LdsInitCVgprs: false + LdsNumBytes: 122112 + LdsNumElementsAlignedA: 32768 + LdsNumElementsAlignedB: 20480 + LdsNumElementsAlignedMXSA: 2048 + LdsNumElementsAlignedMXSB: 1280 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 36096 + LdsOffsetB_Blk: 101632 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMXSA: 32768 + LdsOffsetMXSA_Blk: 98304 + LdsOffsetMXSB: 34816 + LdsOffsetMXSB_Blk: 100352 + LdsOffsetMetadata: 36096 + LdsOffsetMetadata_Blk: 101632 + LdsPadA: 0 + LdsPadB: 0 + LdsPadMXSA: 0 + LdsPadMXSB: 0 + LdsPadMetadata: 0 + LocalReadVectorWidth: -1 + LocalReadVectorWidthA: 32 + LocalReadVectorWidthB: 32 + LocalReadVectorWidthMXSA: 1 + LocalReadVectorWidthMXSB: 1 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LocalWriteUseSgprMXSA: true + LocalWriteUseSgprMXSB: true + LoopIters: 2 + LoopUnroll: 256 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + 
MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMXSA: 1 + MIInputPerThreadMXSB: 1 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [4, 1] + MIWaveTile: [4, 10] + MIWaveTileA: 4 + MIWaveTileB: 10 + MIWaveTileMXSA: 4 + MIWaveTileMXSB: 10 + MIWaveTileMetadata: 0 + MacroTile0: 256 + MacroTile1: 160 + MacroTileA: 256 + MacroTileB: 160 + MacroTileMXSA: 256 + MacroTileMXSB: 160 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + MinGRIncPerMfma: 3 + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonDTLTailLoopMXSA: true + NonDTLTailLoopMXSB: true + NonTemporal: -1 + NonTemporalA: 4 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 4 + NonTemporalE: 0 + NonTemporalMXSA: 0 + NonTemporalMXSB: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 160 + NumGlobalWriteVectorsPerThread: 160 + NumLdsBlk: 2 + NumLoadsA: 8 + NumLoadsB: 5 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsCoalescedMXSA: 1 + NumLoadsCoalescedMXSB: 1 + NumLoadsMXSA: 2 + NumLoadsMXSB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 5 + NumLoadsPerpendicularMXSA: 2 + NumLoadsPerpendicularMXSB: 1 + NumThreads: 256 + NumTotalPackedLoadsA: -1 + NumTotalPackedLoadsB: -1 + NumTotalPackedLoadsMXSA: -1 + NumTotalPackedLoadsMXSB: -1 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGROverBarrier: 0 + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 38 + 
SolutionNameMin: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT256x160x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT4_10_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM16_WGMXCC2_WGMXCCGn1 + SourceSwap: false + SpaceFillingAlgo: [] + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 4 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SubGroupMXSA: 16 + SubGroupMXSB: 16 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: 0 + TailloopInNll: false + ThreadTile: [1, 1] + ThreadTile0: 16 + ThreadTile1: 10 + ThreadTileA: 16 + ThreadTileB: 10 + ThreadTileMXSA: 16 + ThreadTileMXSB: 10 + TransposeLDS: 1 + TransposeLDSMetadata: -1 + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMXSA: true + UnrollMajorLDSMXSB: true + Use64bShadowLimit: false + Use64bShadowLimitMX: false + UseCustomMainLoopSchedule: 0 + UseDirect32XEmulation: false + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: false + UseGeneralizedNLCOneA: false + UseGeneralizedNLCOneB: false + UseGeneralizedNLCOneMXSA: false + UseGeneralizedNLCOneMXSB: false + UseGeneralizedNLCOneMetadata: false + UseInstOffsetForGRO: 0 + UseMFMAF32XEmulation: false + UsePLRPack: 0 + UseSgprForGRO: -1 + UseSubtileImpl: true + Valid: true + VectorStore: -1 + 
VectorWidthA: 1 + VectorWidthB: 1 + VectorWidthMXSA: 1 + VectorWidthMXSB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMXSA: 0 + WaveSeparateGlobalReadMXSB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [64, 4, 1] + WorkGroupMapping: 16 + WorkGroupMappingXCC: 2 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 256 + _DepthUA: 256 + _DepthUB: 256 + _DepthUMXSA: 8 + _DepthUMXSB: 8 + _DepthUMetadata: 256 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + enableLDSTrMXSA: false + enableLDSTrMXSB: false + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + tailLoopOptMXSA: false + tailLoopOptMXSB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AdaptiveGemm: 0 + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1DivByMT1LowbitGT1: 0 + AssertFree1ElementMultiple: 1 + AssertKRingShiftTailWrapOnly: 0 + AssertSummationElementMultiple: 256 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BAddrInterleave: false + BaseName: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT25K1mXD-Z64cKGl1n64cEhRIvEdarIIYY6qzM6iXFNutU= + BufferLoad: true + BufferStore: 1 + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 256 + DirectToLds: 1 + DirectToLdsA: true + DirectToLdsB: true + DirectToLdsMXSA: true + DirectToLdsMXSB: true + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprMXSA: false + DirectToVgprMXSB: false + 
DirectToVgprSparseMetadata: false + DtlPlusLdsBuf: 0 + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: false + ExpertSchedulingMode: 0 + ExtraLatencyForLR: 0 + ExtraMiLatencyLeft: -1 + ForceDisableShadowInit: false + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 32 + GlobalReadVectorWidthB: 32 + GlobalReadVectorWidthMXSA: 4 + GlobalReadVectorWidthMXSB: 9 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + KRingShift: false + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT256x288x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT4_18_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1 + LDSTrInst: false + LSCA: 256 + LSCB: 256 + LSCMXSA: 8 + LSCMXSB: 8 + LSPA: 32 + LSPB: 32 + LSPMXSA: 128 + LSPMXSB: 288 + LVCA: 8 + LVCB: 8 + LVCMXSA: 2 + LVCMXSB: 1 + LVPA: 1 + LVPB: 1 + LVPMXSA: 32 + LVPMXSB: 32 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsBlockSizePerPadMXSA: 0 + LdsBlockSizePerPadMXSB: 0 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 147968 + LdsInitCVgprs: false + 
LdsNumBytes: 147968 + LdsNumElementsAlignedA: 32768 + LdsNumElementsAlignedB: 36864 + LdsNumElementsAlignedMXSA: 2048 + LdsNumElementsAlignedMXSB: 2304 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 73984 + LdsOffsetB: 37120 + LdsOffsetB_Blk: 111104 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMXSA: 32768 + LdsOffsetMXSA_Blk: 106752 + LdsOffsetMXSB: 34816 + LdsOffsetMXSB_Blk: 108800 + LdsOffsetMetadata: 37120 + LdsOffsetMetadata_Blk: 111104 + LdsPadA: 0 + LdsPadB: 0 + LdsPadMXSA: 0 + LdsPadMXSB: 0 + LdsPadMetadata: 0 + LocalReadVectorWidth: -1 + LocalReadVectorWidthA: 32 + LocalReadVectorWidthB: 32 + LocalReadVectorWidthMXSA: 1 + LocalReadVectorWidthMXSB: 1 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LocalWriteUseSgprMXSA: true + LocalWriteUseSgprMXSB: true + LoopIters: 2 + LoopUnroll: 256 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMXSA: 1 + MIInputPerThreadMXSB: 1 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [4, 1] + MIWaveTile: [4, 18] + MIWaveTileA: 4 + MIWaveTileB: 18 + MIWaveTileMXSA: 4 + MIWaveTileMXSB: 18 + MIWaveTileMetadata: 0 + MacroTile0: 256 + MacroTile1: 288 + MacroTileA: 256 + MacroTileB: 288 + MacroTileMXSA: 256 + MacroTileMXSB: 288 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + MinGRIncPerMfma: 3 + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonDTLTailLoopMXSA: true + NonDTLTailLoopMXSB: true + NonTemporal: -1 + NonTemporalA: 4 + NonTemporalB: 0 + NonTemporalC: 0 + 
NonTemporalD: 4 + NonTemporalE: 0 + NonTemporalMXSA: 0 + NonTemporalMXSB: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 288 + NumGlobalWriteVectorsPerThread: 288 + NumLdsBlk: 2 + NumLoadsA: 8 + NumLoadsB: 9 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsCoalescedMXSA: 1 + NumLoadsCoalescedMXSB: 1 + NumLoadsMXSA: 2 + NumLoadsMXSB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 9 + NumLoadsPerpendicularMXSA: 2 + NumLoadsPerpendicularMXSB: 1 + NumThreads: 256 + NumTotalPackedLoadsA: -1 + NumTotalPackedLoadsB: -1 + NumTotalPackedLoadsMXSA: -1 + NumTotalPackedLoadsMXSB: -1 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGROverBarrier: 0 + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 39 + SolutionNameMin: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT256x288x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT4_18_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM16_WGMXCC2_WGMXCCGn1 + SourceSwap: false + SpaceFillingAlgo: [] + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: true + StoreSyncOpt: 0 + StoreVectorWidth: 4 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 16 + SubGroup1: 16 + 
SubGroupA: 16 + SubGroupB: 16 + SubGroupMXSA: 16 + SubGroupMXSB: 16 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: 0 + TailloopInNll: false + ThreadTile: [1, 1] + ThreadTile0: 16 + ThreadTile1: 18 + ThreadTileA: 16 + ThreadTileB: 18 + ThreadTileMXSA: 16 + ThreadTileMXSB: 18 + TransposeLDS: 1 + TransposeLDSMetadata: -1 + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMXSA: true + UnrollMajorLDSMXSB: true + Use64bShadowLimit: false + Use64bShadowLimitMX: false + UseCustomMainLoopSchedule: 0 + UseDirect32XEmulation: false + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: false + UseGeneralizedNLCOneA: false + UseGeneralizedNLCOneB: false + UseGeneralizedNLCOneMXSA: false + UseGeneralizedNLCOneMXSB: false + UseGeneralizedNLCOneMetadata: false + UseInstOffsetForGRO: 0 + UseMFMAF32XEmulation: false + UsePLRPack: 0 + UseSgprForGRO: -1 + UseSubtileImpl: true + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 1 + VectorWidthMXSA: 1 + VectorWidthMXSB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMXSA: 0 + WaveSeparateGlobalReadMXSB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [64, 4, 1] + WorkGroupMapping: 16 + WorkGroupMappingXCC: 2 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 256 + _DepthUA: 256 + _DepthUB: 256 + _DepthUMXSA: 8 + _DepthUMXSB: 8 + _DepthUMetadata: 256 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + enableLDSTrMXSA: false + enableLDSTrMXSB: false + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + tailLoopOptMXSA: false + tailLoopOptMXSB: 
false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AdaptiveGemm: 0 + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1DivByMT1LowbitGT1: 0 + AssertFree1ElementMultiple: 1 + AssertKRingShiftTailWrapOnly: 0 + AssertSummationElementMultiple: 256 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BAddrInterleave: false + BaseName: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT382wXw8HnHr0540PBKyCMkiG1GnHC8Qq6QTA2UY90u0Us= + BufferLoad: true + BufferStore: 1 + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 256 + DirectToLds: 1 + DirectToLdsA: true + DirectToLdsB: true + DirectToLdsMXSA: true + DirectToLdsMXSB: true + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprMXSA: false + DirectToVgprMXSB: false + DirectToVgprSparseMetadata: false + DtlPlusLdsBuf: 0 + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: false + ExpertSchedulingMode: 0 + ExtraLatencyForLR: 0 + ExtraMiLatencyLeft: -1 + ForceDisableShadowInit: false + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 32 + GlobalReadVectorWidthB: 32 + GlobalReadVectorWidthMXSA: 12 + GlobalReadVectorWidthMXSB: 1 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + KRingShift: false + Kernel: true + KernelLanguage: Assembly + KernelNameMin: 
Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT384x32x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT6_2_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1 + LDSTrInst: false + LSCA: 256 + LSCB: 256 + LSCMXSA: 8 + LSCMXSB: 8 + LSPA: 32 + LSPB: 32 + LSPMXSA: 384 + LSPMXSB: 32 + LVCA: 8 + LVCB: 8 + LVCMXSA: 1 + LVCMXSB: 8 + LVPA: 1 + LVPB: 1 + LVPMXSA: 32 + LVPMXSB: 32 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsBlockSizePerPadMXSA: 0 + LdsBlockSizePerPadMXSB: 0 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 122112 + LdsInitCVgprs: false + LdsNumBytes: 122112 + LdsNumElementsAlignedA: 49152 + LdsNumElementsAlignedB: 4096 + LdsNumElementsAlignedMXSA: 3072 + LdsNumElementsAlignedMXSB: 256 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 52480 + LdsOffsetB_Blk: 118016 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMXSA: 49152 + LdsOffsetMXSA_Blk: 114688 + LdsOffsetMXSB: 52224 + LdsOffsetMXSB_Blk: 117760 + LdsOffsetMetadata: 52480 + LdsOffsetMetadata_Blk: 118016 + LdsPadA: 0 + LdsPadB: 0 + LdsPadMXSA: 0 + LdsPadMXSB: 0 + LdsPadMetadata: 0 + LocalReadVectorWidth: -1 + LocalReadVectorWidthA: 32 + LocalReadVectorWidthB: 32 + LocalReadVectorWidthMXSA: 1 + LocalReadVectorWidthMXSB: 1 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LocalWriteUseSgprMXSA: true + LocalWriteUseSgprMXSB: true + LoopIters: 2 + LoopUnroll: 256 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + 
MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMXSA: 1 + MIInputPerThreadMXSB: 1 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [4, 1] + MIWaveTile: [6, 2] + MIWaveTileA: 6 + MIWaveTileB: 2 + MIWaveTileMXSA: 6 + MIWaveTileMXSB: 2 + MIWaveTileMetadata: 0 + MacroTile0: 384 + MacroTile1: 32 + MacroTileA: 384 + MacroTileB: 32 + MacroTileMXSA: 384 + MacroTileMXSB: 32 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + MinGRIncPerMfma: 3 + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonDTLTailLoopMXSA: true + NonDTLTailLoopMXSB: true + NonTemporal: -1 + NonTemporalA: 4 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 4 + NonTemporalE: 0 + NonTemporalMXSA: 0 + NonTemporalMXSB: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 48 + NumGlobalWriteVectorsPerThread: 48 + NumLdsBlk: 2 + NumLoadsA: 12 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsCoalescedMXSA: 1 + NumLoadsCoalescedMXSB: 1 + NumLoadsMXSA: 1 + NumLoadsMXSB: 1 + NumLoadsPerpendicularA: 12 + NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularMXSA: 1 + NumLoadsPerpendicularMXSB: 1 + NumThreads: 256 + NumTotalPackedLoadsA: -1 + NumTotalPackedLoadsB: -1 + NumTotalPackedLoadsMXSA: -1 + NumTotalPackedLoadsMXSB: -1 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGROverBarrier: 0 + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 40 + SolutionNameMin: 
Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT384x32x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT6_2_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM16_WGMXCC2_WGMXCCGn1 + SourceSwap: false + SpaceFillingAlgo: [] + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 4 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SubGroupMXSA: 16 + SubGroupMXSB: 16 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: 0 + TailloopInNll: false + ThreadTile: [1, 1] + ThreadTile0: 24 + ThreadTile1: 2 + ThreadTileA: 24 + ThreadTileB: 2 + ThreadTileMXSA: 24 + ThreadTileMXSB: 2 + TransposeLDS: 1 + TransposeLDSMetadata: -1 + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMXSA: true + UnrollMajorLDSMXSB: true + Use64bShadowLimit: false + Use64bShadowLimitMX: false + UseCustomMainLoopSchedule: 0 + UseDirect32XEmulation: false + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: false + UseGeneralizedNLCOneA: false + UseGeneralizedNLCOneB: false + UseGeneralizedNLCOneMXSA: false + UseGeneralizedNLCOneMXSB: false + UseGeneralizedNLCOneMetadata: false + UseInstOffsetForGRO: 0 + UseMFMAF32XEmulation: false + UsePLRPack: 0 + UseSgprForGRO: -1 + UseSubtileImpl: true + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 1 
+ VectorWidthMXSA: 1 + VectorWidthMXSB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMXSA: 0 + WaveSeparateGlobalReadMXSB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [64, 4, 1] + WorkGroupMapping: 16 + WorkGroupMappingXCC: 2 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 256 + _DepthUA: 256 + _DepthUB: 256 + _DepthUMXSA: 8 + _DepthUMXSB: 8 + _DepthUMetadata: 256 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + enableLDSTrMXSA: false + enableLDSTrMXSB: false + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + tailLoopOptMXSA: false + tailLoopOptMXSB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AdaptiveGemm: 0 + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1DivByMT1LowbitGT1: 0 + AssertFree1ElementMultiple: 1 + AssertKRingShiftTailWrapOnly: 0 + AssertSummationElementMultiple: 256 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BAddrInterleave: false + BaseName: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT38iWtaR8v4gfASTnshP-7Zt37fJ-3bIKngSBlzEMd0qgE= + BufferLoad: true + BufferStore: 1 + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 256 + DirectToLds: 1 + DirectToLdsA: true + DirectToLdsB: true + DirectToLdsMXSA: true + DirectToLdsMXSB: true + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprMXSA: false + DirectToVgprMXSB: false + DirectToVgprSparseMetadata: false + DtlPlusLdsBuf: 
0 + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: false + ExpertSchedulingMode: 0 + ExtraLatencyForLR: 0 + ExtraMiLatencyLeft: -1 + ForceDisableShadowInit: false + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 32 + GlobalReadVectorWidthB: 32 + GlobalReadVectorWidthMXSA: 12 + GlobalReadVectorWidthMXSB: 3 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + KRingShift: false + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT384x96x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT6_6_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1 + LDSTrInst: false + LSCA: 256 + LSCB: 256 + LSCMXSA: 8 + LSCMXSB: 8 + LSPA: 32 + LSPB: 32 + LSPMXSA: 384 + LSPMXSB: 96 + LVCA: 8 + LVCB: 8 + LVCMXSA: 1 + LVCMXSB: 3 + LVPA: 1 + LVPB: 1 + LVPMXSA: 32 + LVPMXSB: 32 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsBlockSizePerPadMXSA: 0 + LdsBlockSizePerPadMXSB: 0 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 130816 + LdsInitCVgprs: false + LdsNumBytes: 130816 + LdsNumElementsAlignedA: 49152 
+ LdsNumElementsAlignedB: 12288 + LdsNumElementsAlignedMXSA: 3072 + LdsNumElementsAlignedMXSB: 768 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 52992 + LdsOffsetB_Blk: 118528 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMXSA: 49152 + LdsOffsetMXSA_Blk: 114688 + LdsOffsetMXSB: 52224 + LdsOffsetMXSB_Blk: 117760 + LdsOffsetMetadata: 52992 + LdsOffsetMetadata_Blk: 118528 + LdsPadA: 0 + LdsPadB: 0 + LdsPadMXSA: 0 + LdsPadMXSB: 0 + LdsPadMetadata: 0 + LocalReadVectorWidth: -1 + LocalReadVectorWidthA: 32 + LocalReadVectorWidthB: 32 + LocalReadVectorWidthMXSA: 1 + LocalReadVectorWidthMXSB: 1 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LocalWriteUseSgprMXSA: true + LocalWriteUseSgprMXSB: true + LoopIters: 2 + LoopUnroll: 256 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMXSA: 1 + MIInputPerThreadMXSB: 1 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [4, 1] + MIWaveTile: [6, 6] + MIWaveTileA: 6 + MIWaveTileB: 6 + MIWaveTileMXSA: 6 + MIWaveTileMXSB: 6 + MIWaveTileMetadata: 0 + MacroTile0: 384 + MacroTile1: 96 + MacroTileA: 384 + MacroTileB: 96 + MacroTileMXSA: 384 + MacroTileMXSB: 96 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + MinGRIncPerMfma: 3 + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonDTLTailLoopMXSA: true + NonDTLTailLoopMXSB: true + NonTemporal: -1 + NonTemporalA: 4 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 4 + NonTemporalE: 0 + NonTemporalMXSA: 0 + 
NonTemporalMXSB: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 144 + NumGlobalWriteVectorsPerThread: 144 + NumLdsBlk: 2 + NumLoadsA: 12 + NumLoadsB: 3 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsCoalescedMXSA: 1 + NumLoadsCoalescedMXSB: 1 + NumLoadsMXSA: 1 + NumLoadsMXSB: 1 + NumLoadsPerpendicularA: 12 + NumLoadsPerpendicularB: 3 + NumLoadsPerpendicularMXSA: 1 + NumLoadsPerpendicularMXSB: 1 + NumThreads: 256 + NumTotalPackedLoadsA: -1 + NumTotalPackedLoadsB: -1 + NumTotalPackedLoadsMXSA: -1 + NumTotalPackedLoadsMXSB: -1 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGROverBarrier: 0 + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 41 + SolutionNameMin: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT384x96x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT6_6_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM16_WGMXCC2_WGMXCCGn1 + SourceSwap: false + SpaceFillingAlgo: [] + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 4 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SubGroupMXSA: 16 + 
SubGroupMXSB: 16 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: 0 + TailloopInNll: false + ThreadTile: [1, 1] + ThreadTile0: 24 + ThreadTile1: 6 + ThreadTileA: 24 + ThreadTileB: 6 + ThreadTileMXSA: 24 + ThreadTileMXSB: 6 + TransposeLDS: 1 + TransposeLDSMetadata: -1 + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMXSA: true + UnrollMajorLDSMXSB: true + Use64bShadowLimit: false + Use64bShadowLimitMX: false + UseCustomMainLoopSchedule: 0 + UseDirect32XEmulation: false + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: false + UseGeneralizedNLCOneA: false + UseGeneralizedNLCOneB: false + UseGeneralizedNLCOneMXSA: false + UseGeneralizedNLCOneMXSB: false + UseGeneralizedNLCOneMetadata: false + UseInstOffsetForGRO: 0 + UseMFMAF32XEmulation: false + UsePLRPack: 0 + UseSgprForGRO: -1 + UseSubtileImpl: true + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 1 + VectorWidthMXSA: 1 + VectorWidthMXSB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMXSA: 0 + WaveSeparateGlobalReadMXSB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [64, 4, 1] + WorkGroupMapping: 16 + WorkGroupMappingXCC: 2 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 256 + _DepthUA: 256 + _DepthUB: 256 + _DepthUMXSA: 8 + _DepthUMXSB: 8 + _DepthUMetadata: 256 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + enableLDSTrMXSA: false + enableLDSTrMXSB: false + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + tailLoopOptMXSA: false + tailLoopOptMXSB: false + - 1LDSBuffer: 0 + ActivationAlt: false + 
ActivationFuncCall: false + ActivationFused: true + AdaptiveGemm: 0 + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1DivByMT1LowbitGT1: 0 + AssertFree1ElementMultiple: 1 + AssertKRingShiftTailWrapOnly: 0 + AssertSummationElementMultiple: 256 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BAddrInterleave: false + BaseName: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT51My4B9dUWEKuVinUMAqyTn-9nVKbOgw4LZNuajpvbgno= + BufferLoad: true + BufferStore: 1 + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 256 + DirectToLds: 1 + DirectToLdsA: true + DirectToLdsB: true + DirectToLdsMXSA: true + DirectToLdsMXSB: true + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprMXSA: false + DirectToVgprMXSB: false + DirectToVgprSparseMetadata: false + DtlPlusLdsBuf: 0 + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: false + ExpertSchedulingMode: 0 + ExtraLatencyForLR: 0 + ExtraMiLatencyLeft: -1 + ForceDisableShadowInit: false + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 32 + GlobalReadVectorWidthB: 32 + GlobalReadVectorWidthMXSA: 16 + GlobalReadVectorWidthMXSB: 1 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + KRingShift: false + Kernel: true + KernelLanguage: Assembly + KernelNameMin: 
Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT512x32x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT8_2_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1 + LDSTrInst: false + LSCA: 256 + LSCB: 256 + LSCMXSA: 8 + LSCMXSB: 8 + LSPA: 32 + LSPB: 32 + LSPMXSA: 512 + LSPMXSB: 32 + LVCA: 8 + LVCB: 8 + LVCMXSA: 1 + LVCMXSB: 8 + LVPA: 1 + LVPB: 1 + LVPMXSA: 32 + LVPMXSB: 32 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsBlockSizePerPadMXSA: 0 + LdsBlockSizePerPadMXSB: 0 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 147968 + LdsInitCVgprs: false + LdsNumBytes: 147968 + LdsNumElementsAlignedA: 65536 + LdsNumElementsAlignedB: 4096 + LdsNumElementsAlignedMXSA: 4096 + LdsNumElementsAlignedMXSB: 256 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 73984 + LdsOffsetB: 69888 + LdsOffsetB_Blk: 143872 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMXSA: 65536 + LdsOffsetMXSA_Blk: 139520 + LdsOffsetMXSB: 69632 + LdsOffsetMXSB_Blk: 143616 + LdsOffsetMetadata: 69888 + LdsOffsetMetadata_Blk: 143872 + LdsPadA: 0 + LdsPadB: 0 + LdsPadMXSA: 0 + LdsPadMXSB: 0 + LdsPadMetadata: 0 + LocalReadVectorWidth: -1 + LocalReadVectorWidthA: 32 + LocalReadVectorWidthB: 32 + LocalReadVectorWidthMXSA: 1 + LocalReadVectorWidthMXSB: 1 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LocalWriteUseSgprMXSA: true + LocalWriteUseSgprMXSB: true + LoopIters: 2 + LoopUnroll: 256 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + 
MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMXSA: 1 + MIInputPerThreadMXSB: 1 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [4, 1] + MIWaveTile: [8, 2] + MIWaveTileA: 8 + MIWaveTileB: 2 + MIWaveTileMXSA: 8 + MIWaveTileMXSB: 2 + MIWaveTileMetadata: 0 + MacroTile0: 512 + MacroTile1: 32 + MacroTileA: 512 + MacroTileB: 32 + MacroTileMXSA: 512 + MacroTileMXSB: 32 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + MinGRIncPerMfma: 3 + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonDTLTailLoopMXSA: true + NonDTLTailLoopMXSB: true + NonTemporal: -1 + NonTemporalA: 4 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 4 + NonTemporalE: 0 + NonTemporalMXSA: 0 + NonTemporalMXSB: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 64 + NumLdsBlk: 2 + NumLoadsA: 16 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsCoalescedMXSA: 1 + NumLoadsCoalescedMXSB: 1 + NumLoadsMXSA: 1 + NumLoadsMXSB: 1 + NumLoadsPerpendicularA: 16 + NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularMXSA: 1 + NumLoadsPerpendicularMXSB: 1 + NumThreads: 256 + NumTotalPackedLoadsA: -1 + NumTotalPackedLoadsB: -1 + NumTotalPackedLoadsMXSA: -1 + NumTotalPackedLoadsMXSB: -1 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGROverBarrier: 0 + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 42 + SolutionNameMin: 
Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT512x32x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT8_2_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM16_WGMXCC2_WGMXCCGn1 + SourceSwap: false + SpaceFillingAlgo: [] + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: true + StoreSyncOpt: 0 + StoreVectorWidth: 4 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SubGroupMXSA: 16 + SubGroupMXSB: 16 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: 0 + TailloopInNll: false + ThreadTile: [1, 1] + ThreadTile0: 32 + ThreadTile1: 2 + ThreadTileA: 32 + ThreadTileB: 2 + ThreadTileMXSA: 32 + ThreadTileMXSB: 2 + TransposeLDS: 1 + TransposeLDSMetadata: -1 + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMXSA: true + UnrollMajorLDSMXSB: true + Use64bShadowLimit: false + Use64bShadowLimitMX: false + UseCustomMainLoopSchedule: 0 + UseDirect32XEmulation: false + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: false + UseGeneralizedNLCOneA: false + UseGeneralizedNLCOneB: false + UseGeneralizedNLCOneMXSA: false + UseGeneralizedNLCOneMXSB: false + UseGeneralizedNLCOneMetadata: false + UseInstOffsetForGRO: 0 + UseMFMAF32XEmulation: false + UsePLRPack: 0 + UseSgprForGRO: -1 + UseSubtileImpl: true + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 1 
+ VectorWidthMXSA: 1 + VectorWidthMXSB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMXSA: 0 + WaveSeparateGlobalReadMXSB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [64, 4, 1] + WorkGroupMapping: 16 + WorkGroupMappingXCC: 2 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 256 + _DepthUA: 256 + _DepthUB: 256 + _DepthUMXSA: 8 + _DepthUMXSB: 8 + _DepthUMetadata: 256 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + enableLDSTrMXSA: false + enableLDSTrMXSB: false + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + tailLoopOptMXSA: false + tailLoopOptMXSB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AdaptiveGemm: 0 + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1DivByMT1LowbitGT1: 0 + AssertFree1ElementMultiple: 1 + AssertKRingShiftTailWrapOnly: 0 + AssertSummationElementMultiple: 256 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BAddrInterleave: false + BaseName: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT32dv3nH_ug3zWfzXFJEJHJUvkpjXQjAQVml90QMZvwYvs= + BufferLoad: true + BufferStore: 1 + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 256 + DirectToLds: 1 + DirectToLdsA: true + DirectToLdsB: true + DirectToLdsMXSA: true + DirectToLdsMXSB: true + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprMXSA: false + DirectToVgprMXSB: false + DirectToVgprSparseMetadata: false + DtlPlusLdsBuf: 
0 + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: false + ExpertSchedulingMode: 0 + ExtraLatencyForLR: 0 + ExtraMiLatencyLeft: -1 + ForceDisableShadowInit: false + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 32 + GlobalReadVectorWidthB: 32 + GlobalReadVectorWidthMXSA: 1 + GlobalReadVectorWidthMXSB: 4 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + KRingShift: false + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT32x128x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT2_2_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_16_1 + LDSTrInst: false + LSCA: 256 + LSCB: 256 + LSCMXSA: 8 + LSCMXSB: 8 + LSPA: 32 + LSPB: 32 + LSPMXSA: 32 + LSPMXSB: 128 + LVCA: 8 + LVCB: 8 + LVCMXSA: 8 + LVCMXSB: 2 + LVPA: 1 + LVPB: 1 + LVPMXSA: 32 + LVPMXSB: 32 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsBlockSizePerPadMXSA: 0 + LdsBlockSizePerPadMXSB: 0 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 54528 + LdsInitCVgprs: false + LdsNumBytes: 54528 + LdsNumElementsAlignedA: 4096 + 
LdsNumElementsAlignedB: 16384 + LdsNumElementsAlignedMXSA: 256 + LdsNumElementsAlignedMXSB: 1024 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 32768 + LdsOffsetB: 5376 + LdsOffsetB_Blk: 38144 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMXSA: 4096 + LdsOffsetMXSA_Blk: 36864 + LdsOffsetMXSB: 4352 + LdsOffsetMXSB_Blk: 37120 + LdsOffsetMetadata: 5376 + LdsOffsetMetadata_Blk: 38144 + LdsPadA: 0 + LdsPadB: 0 + LdsPadMXSA: 0 + LdsPadMXSB: 0 + LdsPadMetadata: 0 + LocalReadVectorWidth: -1 + LocalReadVectorWidthA: 32 + LocalReadVectorWidthB: 32 + LocalReadVectorWidthMXSA: 1 + LocalReadVectorWidthMXSB: 1 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LocalWriteUseSgprMXSA: true + LocalWriteUseSgprMXSB: true + LoopIters: 2 + LoopUnroll: 256 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMXSA: 1 + MIInputPerThreadMXSB: 1 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [1, 4] + MIWaveTile: [2, 2] + MIWaveTileA: 2 + MIWaveTileB: 2 + MIWaveTileMXSA: 2 + MIWaveTileMXSB: 2 + MIWaveTileMetadata: 0 + MacroTile0: 32 + MacroTile1: 128 + MacroTileA: 32 + MacroTileB: 128 + MacroTileMXSA: 32 + MacroTileMXSB: 128 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + MinGRIncPerMfma: 3 + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonDTLTailLoopMXSA: true + NonDTLTailLoopMXSB: true + NonTemporal: -1 + NonTemporalA: 4 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 4 + NonTemporalE: 0 + NonTemporalMXSA: 0 + 
NonTemporalMXSB: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 16 + NumLdsBlk: 2 + NumLoadsA: 1 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsCoalescedMXSA: 1 + NumLoadsCoalescedMXSB: 1 + NumLoadsMXSA: 1 + NumLoadsMXSB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 4 + NumLoadsPerpendicularMXSA: 1 + NumLoadsPerpendicularMXSB: 1 + NumThreads: 256 + NumTotalPackedLoadsA: -1 + NumTotalPackedLoadsB: -1 + NumTotalPackedLoadsMXSA: -1 + NumTotalPackedLoadsMXSB: -1 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGROverBarrier: 0 + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 43 + SolutionNameMin: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT32x128x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT2_2_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM16_WGMXCC2_WGMXCCGn1 + SourceSwap: false + SpaceFillingAlgo: [] + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 4 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 4 + SubGroup1: 64 + SubGroupA: 4 + SubGroupB: 64 + SubGroupMXSA: 4 + SubGroupMXSB: 64 + 
SuppressNoLoadLoop: false + SwapGlobalReadOrder: 0 + TailloopInNll: false + ThreadTile: [1, 1] + ThreadTile0: 8 + ThreadTile1: 2 + ThreadTileA: 8 + ThreadTileB: 2 + ThreadTileMXSA: 8 + ThreadTileMXSB: 2 + TransposeLDS: 1 + TransposeLDSMetadata: -1 + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMXSA: true + UnrollMajorLDSMXSB: true + Use64bShadowLimit: false + Use64bShadowLimitMX: false + UseCustomMainLoopSchedule: 0 + UseDirect32XEmulation: false + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: false + UseGeneralizedNLCOneA: false + UseGeneralizedNLCOneB: false + UseGeneralizedNLCOneMXSA: false + UseGeneralizedNLCOneMXSB: false + UseGeneralizedNLCOneMetadata: false + UseInstOffsetForGRO: 0 + UseMFMAF32XEmulation: false + UsePLRPack: 0 + UseSgprForGRO: -1 + UseSubtileImpl: true + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 1 + VectorWidthMXSA: 1 + VectorWidthMXSB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMXSA: 0 + WaveSeparateGlobalReadMXSB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 16 + WorkGroupMappingXCC: 2 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 256 + _DepthUA: 256 + _DepthUB: 256 + _DepthUMXSA: 8 + _DepthUMXSB: 8 + _DepthUMetadata: 256 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + enableLDSTrMXSA: false + enableLDSTrMXSB: false + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + tailLoopOptMXSA: false + tailLoopOptMXSB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: false + 
ActivationFused: true + AdaptiveGemm: 0 + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1DivByMT1LowbitGT1: 0 + AssertFree1ElementMultiple: 1 + AssertKRingShiftTailWrapOnly: 0 + AssertSummationElementMultiple: 256 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BAddrInterleave: false + BaseName: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT96VdJ139l5ExTu5vrbSyr9YIDnqGsFY6rZMfGBgYGwyok= + BufferLoad: true + BufferStore: 1 + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 256 + DirectToLds: 1 + DirectToLdsA: true + DirectToLdsB: true + DirectToLdsMXSA: true + DirectToLdsMXSB: true + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprMXSA: false + DirectToVgprMXSB: false + DirectToVgprSparseMetadata: false + DtlPlusLdsBuf: 0 + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: false + ExpertSchedulingMode: 0 + ExtraLatencyForLR: 0 + ExtraMiLatencyLeft: -1 + ForceDisableShadowInit: false + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 32 + GlobalReadVectorWidthB: 32 + GlobalReadVectorWidthMXSA: 3 + GlobalReadVectorWidthMXSB: 4 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + KRingShift: false + Kernel: true + KernelLanguage: Assembly + KernelNameMin: 
Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT96x128x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT6_2_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_16_1 + LDSTrInst: false + LSCA: 256 + LSCB: 256 + LSCMXSA: 8 + LSCMXSB: 8 + LSPA: 32 + LSPB: 32 + LSPMXSA: 96 + LSPMXSB: 128 + LVCA: 8 + LVCB: 8 + LVCMXSA: 3 + LVCMXSB: 2 + LVPA: 1 + LVPB: 1 + LVPMXSA: 32 + LVPMXSB: 32 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsBlockSizePerPadMXSA: 0 + LdsBlockSizePerPadMXSB: 0 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 63232 + LdsInitCVgprs: false + LdsNumBytes: 63232 + LdsNumElementsAlignedA: 12288 + LdsNumElementsAlignedB: 16384 + LdsNumElementsAlignedMXSA: 768 + LdsNumElementsAlignedMXSB: 1024 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 32768 + LdsOffsetB: 14080 + LdsOffsetB_Blk: 46848 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMXSA: 12288 + LdsOffsetMXSA_Blk: 45056 + LdsOffsetMXSB: 13056 + LdsOffsetMXSB_Blk: 45824 + LdsOffsetMetadata: 14080 + LdsOffsetMetadata_Blk: 46848 + LdsPadA: 0 + LdsPadB: 0 + LdsPadMXSA: 0 + LdsPadMXSB: 0 + LdsPadMetadata: 0 + LocalReadVectorWidth: -1 + LocalReadVectorWidthA: 32 + LocalReadVectorWidthB: 32 + LocalReadVectorWidthMXSA: 1 + LocalReadVectorWidthMXSB: 1 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LocalWriteUseSgprMXSA: true + LocalWriteUseSgprMXSB: true + LoopIters: 2 + LoopUnroll: 256 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + 
MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMXSA: 1 + MIInputPerThreadMXSB: 1 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [1, 4] + MIWaveTile: [6, 2] + MIWaveTileA: 6 + MIWaveTileB: 2 + MIWaveTileMXSA: 6 + MIWaveTileMXSB: 2 + MIWaveTileMetadata: 0 + MacroTile0: 96 + MacroTile1: 128 + MacroTileA: 96 + MacroTileB: 128 + MacroTileMXSA: 96 + MacroTileMXSB: 128 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + MinGRIncPerMfma: 3 + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonDTLTailLoopMXSA: true + NonDTLTailLoopMXSB: true + NonTemporal: -1 + NonTemporalA: 4 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 4 + NonTemporalE: 0 + NonTemporalMXSA: 0 + NonTemporalMXSB: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 48 + NumGlobalWriteVectorsPerThread: 48 + NumLdsBlk: 2 + NumLoadsA: 3 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsCoalescedMXSA: 1 + NumLoadsCoalescedMXSB: 1 + NumLoadsMXSA: 1 + NumLoadsMXSB: 1 + NumLoadsPerpendicularA: 3 + NumLoadsPerpendicularB: 4 + NumLoadsPerpendicularMXSA: 1 + NumLoadsPerpendicularMXSB: 1 + NumThreads: 256 + NumTotalPackedLoadsA: -1 + NumTotalPackedLoadsB: -1 + NumTotalPackedLoadsMXSA: -1 + NumTotalPackedLoadsMXSB: -1 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGROverBarrier: 0 + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 44 + SolutionNameMin: 
Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT96x128x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT6_2_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM16_WGMXCC2_WGMXCCGn1 + SourceSwap: false + SpaceFillingAlgo: [] + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 4 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 4 + SubGroup1: 64 + SubGroupA: 4 + SubGroupB: 64 + SubGroupMXSA: 4 + SubGroupMXSB: 64 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: 0 + TailloopInNll: false + ThreadTile: [1, 1] + ThreadTile0: 24 + ThreadTile1: 2 + ThreadTileA: 24 + ThreadTileB: 2 + ThreadTileMXSA: 24 + ThreadTileMXSB: 2 + TransposeLDS: 1 + TransposeLDSMetadata: -1 + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMXSA: true + UnrollMajorLDSMXSB: true + Use64bShadowLimit: false + Use64bShadowLimitMX: false + UseCustomMainLoopSchedule: 0 + UseDirect32XEmulation: false + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: false + UseGeneralizedNLCOneA: false + UseGeneralizedNLCOneB: false + UseGeneralizedNLCOneMXSA: false + UseGeneralizedNLCOneMXSB: false + UseGeneralizedNLCOneMetadata: false + UseInstOffsetForGRO: 0 + UseMFMAF32XEmulation: false + UsePLRPack: 0 + UseSgprForGRO: -1 + UseSubtileImpl: true + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 1 + 
VectorWidthMXSA: 1 + VectorWidthMXSB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMXSA: 0 + WaveSeparateGlobalReadMXSB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 16 + WorkGroupMappingXCC: 2 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 256 + _DepthUA: 256 + _DepthUB: 256 + _DepthUMXSA: 8 + _DepthUMXSB: 8 + _DepthUMetadata: 256 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + enableLDSTrMXSA: false + enableLDSTrMXSB: false + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + tailLoopOptMXSA: false + tailLoopOptMXSB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AdaptiveGemm: 0 + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1DivByMT1LowbitGT1: 0 + AssertFree1ElementMultiple: 1 + AssertKRingShiftTailWrapOnly: 0 + AssertSummationElementMultiple: 256 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BAddrInterleave: false + BaseName: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT32pDTSO9Ei8iprYUZVwZ8MVK8XTu8S8ugrlkQJj7C1FJQ= + BufferLoad: true + BufferStore: 1 + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 256 + DirectToLds: 1 + DirectToLdsA: true + DirectToLdsB: true + DirectToLdsMXSA: true + DirectToLdsMXSB: true + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprMXSA: false + DirectToVgprMXSB: false + DirectToVgprSparseMetadata: false + DtlPlusLdsBuf: 0 
+ EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: false + ExpertSchedulingMode: 0 + ExtraLatencyForLR: 0 + ExtraMiLatencyLeft: -1 + ForceDisableShadowInit: false + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 32 + GlobalReadVectorWidthB: 32 + GlobalReadVectorWidthMXSA: 1 + GlobalReadVectorWidthMXSB: 4 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + KRingShift: false + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT32x256x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT2_4_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_16_1 + LDSTrInst: false + LSCA: 256 + LSCB: 256 + LSCMXSA: 8 + LSCMXSB: 8 + LSPA: 32 + LSPB: 32 + LSPMXSA: 32 + LSPMXSB: 128 + LVCA: 8 + LVCB: 8 + LVCMXSA: 8 + LVCMXSB: 2 + LVPA: 1 + LVPB: 1 + LVPMXSA: 32 + LVPMXSB: 32 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsBlockSizePerPadMXSA: 0 + LdsBlockSizePerPadMXSB: 0 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 104704 + LdsInitCVgprs: false + LdsNumBytes: 104704 + LdsNumElementsAlignedA: 4096 + 
LdsNumElementsAlignedB: 32768 + LdsNumElementsAlignedMXSA: 256 + LdsNumElementsAlignedMXSB: 2048 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 6400 + LdsOffsetB_Blk: 71936 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMXSA: 4096 + LdsOffsetMXSA_Blk: 69632 + LdsOffsetMXSB: 4352 + LdsOffsetMXSB_Blk: 69888 + LdsOffsetMetadata: 6400 + LdsOffsetMetadata_Blk: 71936 + LdsPadA: 0 + LdsPadB: 0 + LdsPadMXSA: 0 + LdsPadMXSB: 0 + LdsPadMetadata: 0 + LocalReadVectorWidth: -1 + LocalReadVectorWidthA: 32 + LocalReadVectorWidthB: 32 + LocalReadVectorWidthMXSA: 1 + LocalReadVectorWidthMXSB: 1 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LocalWriteUseSgprMXSA: true + LocalWriteUseSgprMXSB: true + LoopIters: 2 + LoopUnroll: 256 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMXSA: 1 + MIInputPerThreadMXSB: 1 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [1, 4] + MIWaveTile: [2, 4] + MIWaveTileA: 2 + MIWaveTileB: 4 + MIWaveTileMXSA: 2 + MIWaveTileMXSB: 4 + MIWaveTileMetadata: 0 + MacroTile0: 32 + MacroTile1: 256 + MacroTileA: 32 + MacroTileB: 256 + MacroTileMXSA: 32 + MacroTileMXSB: 256 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + MinGRIncPerMfma: 3 + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonDTLTailLoopMXSA: true + NonDTLTailLoopMXSB: true + NonTemporal: -1 + NonTemporalA: 4 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 4 + NonTemporalE: 0 + NonTemporalMXSA: 0 + 
NonTemporalMXSB: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 32 + NumLdsBlk: 2 + NumLoadsA: 1 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsCoalescedMXSA: 1 + NumLoadsCoalescedMXSB: 1 + NumLoadsMXSA: 1 + NumLoadsMXSB: 2 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 8 + NumLoadsPerpendicularMXSA: 1 + NumLoadsPerpendicularMXSB: 2 + NumThreads: 256 + NumTotalPackedLoadsA: -1 + NumTotalPackedLoadsB: -1 + NumTotalPackedLoadsMXSA: -1 + NumTotalPackedLoadsMXSB: -1 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGROverBarrier: 0 + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 45 + SolutionNameMin: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT32x256x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT2_4_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM16_WGMXCC2_WGMXCCGn1 + SourceSwap: false + SpaceFillingAlgo: [] + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 4 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 4 + SubGroup1: 64 + SubGroupA: 4 + SubGroupB: 64 + SubGroupMXSA: 4 + SubGroupMXSB: 64 + 
SuppressNoLoadLoop: false + SwapGlobalReadOrder: 0 + TailloopInNll: false + ThreadTile: [1, 1] + ThreadTile0: 8 + ThreadTile1: 4 + ThreadTileA: 8 + ThreadTileB: 4 + ThreadTileMXSA: 8 + ThreadTileMXSB: 4 + TransposeLDS: 1 + TransposeLDSMetadata: -1 + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMXSA: true + UnrollMajorLDSMXSB: true + Use64bShadowLimit: false + Use64bShadowLimitMX: false + UseCustomMainLoopSchedule: 0 + UseDirect32XEmulation: false + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: false + UseGeneralizedNLCOneA: false + UseGeneralizedNLCOneB: false + UseGeneralizedNLCOneMXSA: false + UseGeneralizedNLCOneMXSB: false + UseGeneralizedNLCOneMetadata: false + UseInstOffsetForGRO: 0 + UseMFMAF32XEmulation: false + UsePLRPack: 0 + UseSgprForGRO: -1 + UseSubtileImpl: true + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 1 + VectorWidthMXSA: 1 + VectorWidthMXSB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMXSA: 0 + WaveSeparateGlobalReadMXSB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 16 + WorkGroupMappingXCC: 2 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 256 + _DepthUA: 256 + _DepthUB: 256 + _DepthUMXSA: 8 + _DepthUMXSB: 8 + _DepthUMetadata: 256 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + enableLDSTrMXSA: false + enableLDSTrMXSB: false + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + tailLoopOptMXSA: false + tailLoopOptMXSB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: false + 
ActivationFused: true + AdaptiveGemm: 0 + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1DivByMT1LowbitGT1: 0 + AssertFree1ElementMultiple: 1 + AssertKRingShiftTailWrapOnly: 0 + AssertSummationElementMultiple: 256 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BAddrInterleave: false + BaseName: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT96Cj2Z80w1PkWDv8Q4NtOmErGoRYnlYoT1xjNw40Qn6y8= + BufferLoad: true + BufferStore: 1 + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 256 + DirectToLds: 1 + DirectToLdsA: true + DirectToLdsB: true + DirectToLdsMXSA: true + DirectToLdsMXSB: true + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprMXSA: false + DirectToVgprMXSB: false + DirectToVgprSparseMetadata: false + DtlPlusLdsBuf: 0 + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: false + ExpertSchedulingMode: 0 + ExtraLatencyForLR: 0 + ExtraMiLatencyLeft: -1 + ForceDisableShadowInit: false + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 32 + GlobalReadVectorWidthB: 32 + GlobalReadVectorWidthMXSA: 3 + GlobalReadVectorWidthMXSB: 4 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + KRingShift: false + Kernel: true + KernelLanguage: Assembly + KernelNameMin: 
Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT96x256x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT6_4_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_16_1 + LDSTrInst: false + LSCA: 256 + LSCB: 256 + LSCMXSA: 8 + LSCMXSB: 8 + LSPA: 32 + LSPB: 32 + LSPMXSA: 96 + LSPMXSB: 128 + LVCA: 8 + LVCB: 8 + LVCMXSA: 3 + LVCMXSB: 2 + LVPA: 1 + LVPB: 1 + LVPMXSA: 32 + LVPMXSB: 32 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsBlockSizePerPadMXSA: 0 + LdsBlockSizePerPadMXSB: 0 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 113408 + LdsInitCVgprs: false + LdsNumBytes: 113408 + LdsNumElementsAlignedA: 12288 + LdsNumElementsAlignedB: 32768 + LdsNumElementsAlignedMXSA: 768 + LdsNumElementsAlignedMXSB: 2048 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 15104 + LdsOffsetB_Blk: 80640 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMXSA: 12288 + LdsOffsetMXSA_Blk: 77824 + LdsOffsetMXSB: 13056 + LdsOffsetMXSB_Blk: 78592 + LdsOffsetMetadata: 15104 + LdsOffsetMetadata_Blk: 80640 + LdsPadA: 0 + LdsPadB: 0 + LdsPadMXSA: 0 + LdsPadMXSB: 0 + LdsPadMetadata: 0 + LocalReadVectorWidth: -1 + LocalReadVectorWidthA: 32 + LocalReadVectorWidthB: 32 + LocalReadVectorWidthMXSA: 1 + LocalReadVectorWidthMXSB: 1 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LocalWriteUseSgprMXSA: true + LocalWriteUseSgprMXSB: true + LoopIters: 2 + LoopUnroll: 256 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + 
MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMXSA: 1 + MIInputPerThreadMXSB: 1 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [1, 4] + MIWaveTile: [6, 4] + MIWaveTileA: 6 + MIWaveTileB: 4 + MIWaveTileMXSA: 6 + MIWaveTileMXSB: 4 + MIWaveTileMetadata: 0 + MacroTile0: 96 + MacroTile1: 256 + MacroTileA: 96 + MacroTileB: 256 + MacroTileMXSA: 96 + MacroTileMXSB: 256 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + MinGRIncPerMfma: 3 + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonDTLTailLoopMXSA: true + NonDTLTailLoopMXSB: true + NonTemporal: -1 + NonTemporalA: 4 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 4 + NonTemporalE: 0 + NonTemporalMXSA: 0 + NonTemporalMXSB: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 96 + NumGlobalWriteVectorsPerThread: 96 + NumLdsBlk: 2 + NumLoadsA: 3 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsCoalescedMXSA: 1 + NumLoadsCoalescedMXSB: 1 + NumLoadsMXSA: 1 + NumLoadsMXSB: 2 + NumLoadsPerpendicularA: 3 + NumLoadsPerpendicularB: 8 + NumLoadsPerpendicularMXSA: 1 + NumLoadsPerpendicularMXSB: 2 + NumThreads: 256 + NumTotalPackedLoadsA: -1 + NumTotalPackedLoadsB: -1 + NumTotalPackedLoadsMXSA: -1 + NumTotalPackedLoadsMXSB: -1 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGROverBarrier: 0 + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 46 + SolutionNameMin: 
Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT96x256x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT6_4_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM16_WGMXCC2_WGMXCCGn1 + SourceSwap: false + SpaceFillingAlgo: [] + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 4 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 4 + SubGroup1: 64 + SubGroupA: 4 + SubGroupB: 64 + SubGroupMXSA: 4 + SubGroupMXSB: 64 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: 0 + TailloopInNll: false + ThreadTile: [1, 1] + ThreadTile0: 24 + ThreadTile1: 4 + ThreadTileA: 24 + ThreadTileB: 4 + ThreadTileMXSA: 24 + ThreadTileMXSB: 4 + TransposeLDS: 1 + TransposeLDSMetadata: -1 + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMXSA: true + UnrollMajorLDSMXSB: true + Use64bShadowLimit: false + Use64bShadowLimitMX: false + UseCustomMainLoopSchedule: 0 + UseDirect32XEmulation: false + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: false + UseGeneralizedNLCOneA: false + UseGeneralizedNLCOneB: false + UseGeneralizedNLCOneMXSA: false + UseGeneralizedNLCOneMXSB: false + UseGeneralizedNLCOneMetadata: false + UseInstOffsetForGRO: 0 + UseMFMAF32XEmulation: false + UsePLRPack: 0 + UseSgprForGRO: -1 + UseSubtileImpl: true + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 1 + 
VectorWidthMXSA: 1 + VectorWidthMXSB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMXSA: 0 + WaveSeparateGlobalReadMXSB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 16 + WorkGroupMappingXCC: 2 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 256 + _DepthUA: 256 + _DepthUB: 256 + _DepthUMXSA: 8 + _DepthUMXSB: 8 + _DepthUMetadata: 256 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + enableLDSTrMXSA: false + enableLDSTrMXSB: false + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + tailLoopOptMXSA: false + tailLoopOptMXSB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AdaptiveGemm: 0 + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1DivByMT1LowbitGT1: 0 + AssertFree1ElementMultiple: 1 + AssertKRingShiftTailWrapOnly: 0 + AssertSummationElementMultiple: 256 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BAddrInterleave: false + BaseName: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT16rkdIx0xJ9AdmxSLTSuPL_HwvXURN3d9JBsT0i2N2r4A= + BufferLoad: true + BufferStore: 1 + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 256 + DirectToLds: 1 + DirectToLdsA: true + DirectToLdsB: true + DirectToLdsMXSA: true + DirectToLdsMXSB: true + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprMXSA: false + DirectToVgprMXSB: false + DirectToVgprSparseMetadata: false + DtlPlusLdsBuf: 0 
+ EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: false + ExpertSchedulingMode: 0 + ExtraLatencyForLR: 0 + ExtraMiLatencyLeft: -1 + ForceDisableShadowInit: false + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 32 + GlobalReadVectorWidthB: 32 + GlobalReadVectorWidthMXSA: 5 + GlobalReadVectorWidthMXSB: 4 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + KRingShift: false + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT160x256x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT10_4_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_16_1 + LDSTrInst: false + LSCA: 256 + LSCB: 256 + LSCMXSA: 8 + LSCMXSB: 8 + LSPA: 32 + LSPB: 32 + LSPMXSA: 160 + LSPMXSB: 128 + LVCA: 8 + LVCB: 8 + LVCMXSA: 2 + LVCMXSB: 2 + LVPA: 1 + LVPB: 1 + LVPMXSA: 32 + LVPMXSB: 32 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsBlockSizePerPadMXSA: 0 + LdsBlockSizePerPadMXSB: 0 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 122112 + LdsInitCVgprs: false + LdsNumBytes: 122112 + LdsNumElementsAlignedA: 20480 
+ LdsNumElementsAlignedB: 32768 + LdsNumElementsAlignedMXSA: 1280 + LdsNumElementsAlignedMXSB: 2048 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 23808 + LdsOffsetB_Blk: 89344 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMXSA: 20480 + LdsOffsetMXSA_Blk: 86016 + LdsOffsetMXSB: 21760 + LdsOffsetMXSB_Blk: 87296 + LdsOffsetMetadata: 23808 + LdsOffsetMetadata_Blk: 89344 + LdsPadA: 0 + LdsPadB: 0 + LdsPadMXSA: 0 + LdsPadMXSB: 0 + LdsPadMetadata: 0 + LocalReadVectorWidth: -1 + LocalReadVectorWidthA: 32 + LocalReadVectorWidthB: 32 + LocalReadVectorWidthMXSA: 1 + LocalReadVectorWidthMXSB: 1 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LocalWriteUseSgprMXSA: true + LocalWriteUseSgprMXSB: true + LoopIters: 2 + LoopUnroll: 256 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMXSA: 1 + MIInputPerThreadMXSB: 1 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [1, 4] + MIWaveTile: [10, 4] + MIWaveTileA: 10 + MIWaveTileB: 4 + MIWaveTileMXSA: 10 + MIWaveTileMXSB: 4 + MIWaveTileMetadata: 0 + MacroTile0: 160 + MacroTile1: 256 + MacroTileA: 160 + MacroTileB: 256 + MacroTileMXSA: 160 + MacroTileMXSB: 256 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + MinGRIncPerMfma: 3 + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonDTLTailLoopMXSA: true + NonDTLTailLoopMXSB: true + NonTemporal: -1 + NonTemporalA: 4 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 4 + NonTemporalE: 0 + NonTemporalMXSA: 0 + 
NonTemporalMXSB: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 160 + NumGlobalWriteVectorsPerThread: 160 + NumLdsBlk: 2 + NumLoadsA: 5 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsCoalescedMXSA: 1 + NumLoadsCoalescedMXSB: 1 + NumLoadsMXSA: 1 + NumLoadsMXSB: 2 + NumLoadsPerpendicularA: 5 + NumLoadsPerpendicularB: 8 + NumLoadsPerpendicularMXSA: 1 + NumLoadsPerpendicularMXSB: 2 + NumThreads: 256 + NumTotalPackedLoadsA: -1 + NumTotalPackedLoadsB: -1 + NumTotalPackedLoadsMXSA: -1 + NumTotalPackedLoadsMXSB: -1 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGROverBarrier: 0 + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 47 + SolutionNameMin: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT160x256x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT10_4_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM16_WGMXCC2_WGMXCCGn1 + SourceSwap: false + SpaceFillingAlgo: [] + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 4 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 4 + SubGroup1: 64 + SubGroupA: 4 + SubGroupB: 64 + SubGroupMXSA: 4 + SubGroupMXSB: 
64 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: 0 + TailloopInNll: false + ThreadTile: [1, 1] + ThreadTile0: 40 + ThreadTile1: 4 + ThreadTileA: 40 + ThreadTileB: 4 + ThreadTileMXSA: 40 + ThreadTileMXSB: 4 + TransposeLDS: 1 + TransposeLDSMetadata: -1 + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMXSA: true + UnrollMajorLDSMXSB: true + Use64bShadowLimit: false + Use64bShadowLimitMX: false + UseCustomMainLoopSchedule: 0 + UseDirect32XEmulation: false + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: false + UseGeneralizedNLCOneA: false + UseGeneralizedNLCOneB: false + UseGeneralizedNLCOneMXSA: false + UseGeneralizedNLCOneMXSB: false + UseGeneralizedNLCOneMetadata: false + UseInstOffsetForGRO: 0 + UseMFMAF32XEmulation: false + UsePLRPack: 0 + UseSgprForGRO: -1 + UseSubtileImpl: true + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 1 + VectorWidthMXSA: 1 + VectorWidthMXSB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMXSA: 0 + WaveSeparateGlobalReadMXSB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 16 + WorkGroupMappingXCC: 2 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 256 + _DepthUA: 256 + _DepthUB: 256 + _DepthUMXSA: 8 + _DepthUMXSB: 8 + _DepthUMetadata: 256 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + enableLDSTrMXSA: false + enableLDSTrMXSB: false + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + tailLoopOptMXSA: false + tailLoopOptMXSB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: 
false + ActivationFused: true + AdaptiveGemm: 0 + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1DivByMT1LowbitGT1: 0 + AssertFree1ElementMultiple: 1 + AssertKRingShiftTailWrapOnly: 0 + AssertSummationElementMultiple: 256 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BAddrInterleave: false + BaseName: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT28HX9AX04H5gx2Hnt9CFkwE4Io5BNr8Him7sxPkQ6FxSI= + BufferLoad: true + BufferStore: 1 + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 256 + DirectToLds: 1 + DirectToLdsA: true + DirectToLdsB: true + DirectToLdsMXSA: true + DirectToLdsMXSB: true + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprMXSA: false + DirectToVgprMXSB: false + DirectToVgprSparseMetadata: false + DtlPlusLdsBuf: 0 + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: false + ExpertSchedulingMode: 0 + ExtraLatencyForLR: 0 + ExtraMiLatencyLeft: -1 + ForceDisableShadowInit: false + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 32 + GlobalReadVectorWidthB: 32 + GlobalReadVectorWidthMXSA: 9 + GlobalReadVectorWidthMXSB: 4 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + KRingShift: false + Kernel: true + KernelLanguage: Assembly + KernelNameMin: 
Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT288x256x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT18_4_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_16_1 + LDSTrInst: false + LSCA: 256 + LSCB: 256 + LSCMXSA: 8 + LSCMXSB: 8 + LSPA: 32 + LSPB: 32 + LSPMXSA: 288 + LSPMXSB: 128 + LVCA: 8 + LVCB: 8 + LVCMXSA: 1 + LVCMXSB: 2 + LVPA: 1 + LVPB: 1 + LVPMXSA: 32 + LVPMXSB: 32 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsBlockSizePerPadMXSA: 0 + LdsBlockSizePerPadMXSB: 0 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 147968 + LdsInitCVgprs: false + LdsNumBytes: 147968 + LdsNumElementsAlignedA: 36864 + LdsNumElementsAlignedB: 32768 + LdsNumElementsAlignedMXSA: 2304 + LdsNumElementsAlignedMXSB: 2048 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 73984 + LdsOffsetB: 41216 + LdsOffsetB_Blk: 115200 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMXSA: 36864 + LdsOffsetMXSA_Blk: 110848 + LdsOffsetMXSB: 39168 + LdsOffsetMXSB_Blk: 113152 + LdsOffsetMetadata: 41216 + LdsOffsetMetadata_Blk: 115200 + LdsPadA: 0 + LdsPadB: 0 + LdsPadMXSA: 0 + LdsPadMXSB: 0 + LdsPadMetadata: 0 + LocalReadVectorWidth: -1 + LocalReadVectorWidthA: 32 + LocalReadVectorWidthB: 32 + LocalReadVectorWidthMXSA: 1 + LocalReadVectorWidthMXSB: 1 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LocalWriteUseSgprMXSA: true + LocalWriteUseSgprMXSB: true + LoopIters: 2 + LoopUnroll: 256 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + 
MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMXSA: 1 + MIInputPerThreadMXSB: 1 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [1, 4] + MIWaveTile: [18, 4] + MIWaveTileA: 18 + MIWaveTileB: 4 + MIWaveTileMXSA: 18 + MIWaveTileMXSB: 4 + MIWaveTileMetadata: 0 + MacroTile0: 288 + MacroTile1: 256 + MacroTileA: 288 + MacroTileB: 256 + MacroTileMXSA: 288 + MacroTileMXSB: 256 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + MinGRIncPerMfma: 3 + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonDTLTailLoopMXSA: true + NonDTLTailLoopMXSB: true + NonTemporal: -1 + NonTemporalA: 4 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 4 + NonTemporalE: 0 + NonTemporalMXSA: 0 + NonTemporalMXSB: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 288 + NumGlobalWriteVectorsPerThread: 288 + NumLdsBlk: 2 + NumLoadsA: 9 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsCoalescedMXSA: 1 + NumLoadsCoalescedMXSB: 1 + NumLoadsMXSA: 1 + NumLoadsMXSB: 2 + NumLoadsPerpendicularA: 9 + NumLoadsPerpendicularB: 8 + NumLoadsPerpendicularMXSA: 1 + NumLoadsPerpendicularMXSB: 2 + NumThreads: 256 + NumTotalPackedLoadsA: -1 + NumTotalPackedLoadsB: -1 + NumTotalPackedLoadsMXSA: -1 + NumTotalPackedLoadsMXSB: -1 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGROverBarrier: 0 + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 48 + 
SolutionNameMin: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT288x256x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT18_4_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM16_WGMXCC2_WGMXCCGn1 + SourceSwap: false + SpaceFillingAlgo: [] + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: true + StoreSyncOpt: 0 + StoreVectorWidth: 4 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 4 + SubGroup1: 64 + SubGroupA: 4 + SubGroupB: 64 + SubGroupMXSA: 4 + SubGroupMXSB: 64 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: 0 + TailloopInNll: false + ThreadTile: [1, 1] + ThreadTile0: 72 + ThreadTile1: 4 + ThreadTileA: 72 + ThreadTileB: 4 + ThreadTileMXSA: 72 + ThreadTileMXSB: 4 + TransposeLDS: 1 + TransposeLDSMetadata: -1 + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMXSA: true + UnrollMajorLDSMXSB: true + Use64bShadowLimit: false + Use64bShadowLimitMX: false + UseCustomMainLoopSchedule: 0 + UseDirect32XEmulation: false + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: false + UseGeneralizedNLCOneA: false + UseGeneralizedNLCOneB: false + UseGeneralizedNLCOneMXSA: false + UseGeneralizedNLCOneMXSB: false + UseGeneralizedNLCOneMetadata: false + UseInstOffsetForGRO: 0 + UseMFMAF32XEmulation: false + UsePLRPack: 0 + UseSgprForGRO: -1 + UseSubtileImpl: true + Valid: true + VectorStore: -1 + VectorWidthA: 1 + 
VectorWidthB: 1 + VectorWidthMXSA: 1 + VectorWidthMXSB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMXSA: 0 + WaveSeparateGlobalReadMXSB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 16 + WorkGroupMappingXCC: 2 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 256 + _DepthUA: 256 + _DepthUB: 256 + _DepthUMXSA: 8 + _DepthUMXSB: 8 + _DepthUMetadata: 256 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + enableLDSTrMXSA: false + enableLDSTrMXSB: false + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + tailLoopOptMXSA: false + tailLoopOptMXSB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AdaptiveGemm: 0 + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1DivByMT1LowbitGT1: 0 + AssertFree1ElementMultiple: 1 + AssertKRingShiftTailWrapOnly: 0 + AssertSummationElementMultiple: 256 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BAddrInterleave: false + BaseName: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT323cjBEjqTj-Mu9ZF1RsqJ73WqkJOeA3kpk9e2rgOyJEQ= + BufferLoad: true + BufferStore: 1 + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 256 + DirectToLds: 1 + DirectToLdsA: true + DirectToLdsB: true + DirectToLdsMXSA: true + DirectToLdsMXSB: true + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprMXSA: false + DirectToVgprMXSB: false + DirectToVgprSparseMetadata: false 
+ DtlPlusLdsBuf: 0 + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: false + ExpertSchedulingMode: 0 + ExtraLatencyForLR: 0 + ExtraMiLatencyLeft: -1 + ForceDisableShadowInit: false + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 32 + GlobalReadVectorWidthB: 32 + GlobalReadVectorWidthMXSA: 1 + GlobalReadVectorWidthMXSB: 12 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + KRingShift: false + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT32x384x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT2_6_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_16_1 + LDSTrInst: false + LSCA: 256 + LSCB: 256 + LSCMXSA: 8 + LSCMXSB: 8 + LSPA: 32 + LSPB: 32 + LSPMXSA: 32 + LSPMXSB: 384 + LVCA: 8 + LVCB: 8 + LVCMXSA: 8 + LVCMXSB: 1 + LVPA: 1 + LVPB: 1 + LVPMXSA: 32 + LVPMXSB: 32 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsBlockSizePerPadMXSA: 0 + LdsBlockSizePerPadMXSB: 0 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 122112 + LdsInitCVgprs: false + LdsNumBytes: 122112 + 
LdsNumElementsAlignedA: 4096 + LdsNumElementsAlignedB: 49152 + LdsNumElementsAlignedMXSA: 256 + LdsNumElementsAlignedMXSB: 3072 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 7424 + LdsOffsetB_Blk: 72960 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMXSA: 4096 + LdsOffsetMXSA_Blk: 69632 + LdsOffsetMXSB: 4352 + LdsOffsetMXSB_Blk: 69888 + LdsOffsetMetadata: 7424 + LdsOffsetMetadata_Blk: 72960 + LdsPadA: 0 + LdsPadB: 0 + LdsPadMXSA: 0 + LdsPadMXSB: 0 + LdsPadMetadata: 0 + LocalReadVectorWidth: -1 + LocalReadVectorWidthA: 32 + LocalReadVectorWidthB: 32 + LocalReadVectorWidthMXSA: 1 + LocalReadVectorWidthMXSB: 1 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LocalWriteUseSgprMXSA: true + LocalWriteUseSgprMXSB: true + LoopIters: 2 + LoopUnroll: 256 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMXSA: 1 + MIInputPerThreadMXSB: 1 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [1, 4] + MIWaveTile: [2, 6] + MIWaveTileA: 2 + MIWaveTileB: 6 + MIWaveTileMXSA: 2 + MIWaveTileMXSB: 6 + MIWaveTileMetadata: 0 + MacroTile0: 32 + MacroTile1: 384 + MacroTileA: 32 + MacroTileB: 384 + MacroTileMXSA: 32 + MacroTileMXSB: 384 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + MinGRIncPerMfma: 3 + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonDTLTailLoopMXSA: true + NonDTLTailLoopMXSB: true + NonTemporal: -1 + NonTemporalA: 4 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 4 + NonTemporalE: 0 + 
NonTemporalMXSA: 0 + NonTemporalMXSB: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 48 + NumGlobalWriteVectorsPerThread: 48 + NumLdsBlk: 2 + NumLoadsA: 1 + NumLoadsB: 12 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsCoalescedMXSA: 1 + NumLoadsCoalescedMXSB: 1 + NumLoadsMXSA: 1 + NumLoadsMXSB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 12 + NumLoadsPerpendicularMXSA: 1 + NumLoadsPerpendicularMXSB: 1 + NumThreads: 256 + NumTotalPackedLoadsA: -1 + NumTotalPackedLoadsB: -1 + NumTotalPackedLoadsMXSA: -1 + NumTotalPackedLoadsMXSB: -1 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGROverBarrier: 0 + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 49 + SolutionNameMin: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT32x384x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT2_6_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM16_WGMXCC2_WGMXCCGn1 + SourceSwap: false + SpaceFillingAlgo: [] + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 4 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 4 + SubGroup1: 64 + SubGroupA: 4 + SubGroupB: 64 + SubGroupMXSA: 
4 + SubGroupMXSB: 64 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: 0 + TailloopInNll: false + ThreadTile: [1, 1] + ThreadTile0: 8 + ThreadTile1: 6 + ThreadTileA: 8 + ThreadTileB: 6 + ThreadTileMXSA: 8 + ThreadTileMXSB: 6 + TransposeLDS: 1 + TransposeLDSMetadata: -1 + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMXSA: true + UnrollMajorLDSMXSB: true + Use64bShadowLimit: false + Use64bShadowLimitMX: false + UseCustomMainLoopSchedule: 0 + UseDirect32XEmulation: false + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: false + UseGeneralizedNLCOneA: false + UseGeneralizedNLCOneB: false + UseGeneralizedNLCOneMXSA: false + UseGeneralizedNLCOneMXSB: false + UseGeneralizedNLCOneMetadata: false + UseInstOffsetForGRO: 0 + UseMFMAF32XEmulation: false + UsePLRPack: 0 + UseSgprForGRO: -1 + UseSubtileImpl: true + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 1 + VectorWidthMXSA: 1 + VectorWidthMXSB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMXSA: 0 + WaveSeparateGlobalReadMXSB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 16 + WorkGroupMappingXCC: 2 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 256 + _DepthUA: 256 + _DepthUB: 256 + _DepthUMXSA: 8 + _DepthUMXSB: 8 + _DepthUMetadata: 256 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + enableLDSTrMXSA: false + enableLDSTrMXSB: false + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + tailLoopOptMXSA: false + tailLoopOptMXSB: false + - 1LDSBuffer: 0 + ActivationAlt: false + 
ActivationFuncCall: false + ActivationFused: true + AdaptiveGemm: 0 + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1DivByMT1LowbitGT1: 0 + AssertFree1ElementMultiple: 1 + AssertKRingShiftTailWrapOnly: 0 + AssertSummationElementMultiple: 256 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BAddrInterleave: false + BaseName: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT32k1L8EqrYflL4kvotLP49gssYPXLs5aDdnaskyCeVRw4= + BufferLoad: true + BufferStore: 1 + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 256 + DirectToLds: 1 + DirectToLdsA: true + DirectToLdsB: true + DirectToLdsMXSA: true + DirectToLdsMXSB: true + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprMXSA: false + DirectToVgprMXSB: false + DirectToVgprSparseMetadata: false + DtlPlusLdsBuf: 0 + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: false + ExpertSchedulingMode: 0 + ExtraLatencyForLR: 0 + ExtraMiLatencyLeft: -1 + ForceDisableShadowInit: false + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 32 + GlobalReadVectorWidthB: 32 + GlobalReadVectorWidthMXSA: 1 + GlobalReadVectorWidthMXSB: 16 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + KRingShift: false + Kernel: true + KernelLanguage: Assembly + KernelNameMin: 
Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT32x512x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT2_8_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_16_1 + LDSTrInst: false + LSCA: 256 + LSCB: 256 + LSCMXSA: 8 + LSCMXSB: 8 + LSPA: 32 + LSPB: 32 + LSPMXSA: 32 + LSPMXSB: 512 + LVCA: 8 + LVCB: 8 + LVCMXSA: 8 + LVCMXSB: 1 + LVPA: 1 + LVPB: 1 + LVPMXSA: 32 + LVPMXSB: 32 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsBlockSizePerPadMXSA: 0 + LdsBlockSizePerPadMXSB: 0 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 147968 + LdsInitCVgprs: false + LdsNumBytes: 147968 + LdsNumElementsAlignedA: 4096 + LdsNumElementsAlignedB: 65536 + LdsNumElementsAlignedMXSA: 256 + LdsNumElementsAlignedMXSB: 4096 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 73984 + LdsOffsetB: 8448 + LdsOffsetB_Blk: 82432 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMXSA: 4096 + LdsOffsetMXSA_Blk: 78080 + LdsOffsetMXSB: 4352 + LdsOffsetMXSB_Blk: 78336 + LdsOffsetMetadata: 8448 + LdsOffsetMetadata_Blk: 82432 + LdsPadA: 0 + LdsPadB: 0 + LdsPadMXSA: 0 + LdsPadMXSB: 0 + LdsPadMetadata: 0 + LocalReadVectorWidth: -1 + LocalReadVectorWidthA: 32 + LocalReadVectorWidthB: 32 + LocalReadVectorWidthMXSA: 1 + LocalReadVectorWidthMXSB: 1 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LocalWriteUseSgprMXSA: true + LocalWriteUseSgprMXSB: true + LoopIters: 2 + LoopUnroll: 256 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + 
MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMXSA: 1 + MIInputPerThreadMXSB: 1 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [1, 4] + MIWaveTile: [2, 8] + MIWaveTileA: 2 + MIWaveTileB: 8 + MIWaveTileMXSA: 2 + MIWaveTileMXSB: 8 + MIWaveTileMetadata: 0 + MacroTile0: 32 + MacroTile1: 512 + MacroTileA: 32 + MacroTileB: 512 + MacroTileMXSA: 32 + MacroTileMXSB: 512 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + MinGRIncPerMfma: 3 + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonDTLTailLoopMXSA: true + NonDTLTailLoopMXSB: true + NonTemporal: -1 + NonTemporalA: 4 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 4 + NonTemporalE: 0 + NonTemporalMXSA: 0 + NonTemporalMXSB: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 64 + NumLdsBlk: 2 + NumLoadsA: 1 + NumLoadsB: 16 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsCoalescedMXSA: 1 + NumLoadsCoalescedMXSB: 1 + NumLoadsMXSA: 1 + NumLoadsMXSB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 16 + NumLoadsPerpendicularMXSA: 1 + NumLoadsPerpendicularMXSB: 1 + NumThreads: 256 + NumTotalPackedLoadsA: -1 + NumTotalPackedLoadsB: -1 + NumTotalPackedLoadsMXSA: -1 + NumTotalPackedLoadsMXSB: -1 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGROverBarrier: 0 + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 50 + SolutionNameMin: 
Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT32x512x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT2_8_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM16_WGMXCC2_WGMXCCGn1 + SourceSwap: false + SpaceFillingAlgo: [] + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: true + StoreSyncOpt: 0 + StoreVectorWidth: 4 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 4 + SubGroup1: 64 + SubGroupA: 4 + SubGroupB: 64 + SubGroupMXSA: 4 + SubGroupMXSB: 64 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: 0 + TailloopInNll: false + ThreadTile: [1, 1] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + ThreadTileMXSA: 8 + ThreadTileMXSB: 8 + TransposeLDS: 1 + TransposeLDSMetadata: -1 + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMXSA: true + UnrollMajorLDSMXSB: true + Use64bShadowLimit: false + Use64bShadowLimitMX: false + UseCustomMainLoopSchedule: 0 + UseDirect32XEmulation: false + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: false + UseGeneralizedNLCOneA: false + UseGeneralizedNLCOneB: false + UseGeneralizedNLCOneMXSA: false + UseGeneralizedNLCOneMXSB: false + UseGeneralizedNLCOneMetadata: false + UseInstOffsetForGRO: 0 + UseMFMAF32XEmulation: false + UsePLRPack: 0 + UseSgprForGRO: -1 + UseSubtileImpl: true + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 1 + 
VectorWidthMXSA: 1 + VectorWidthMXSB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMXSA: 0 + WaveSeparateGlobalReadMXSB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 16 + WorkGroupMappingXCC: 2 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 256 + _DepthUA: 256 + _DepthUB: 256 + _DepthUMXSA: 8 + _DepthUMXSB: 8 + _DepthUMetadata: 256 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + enableLDSTrMXSA: false + enableLDSTrMXSB: false + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + tailLoopOptMXSA: false + tailLoopOptMXSB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AdaptiveGemm: 0 + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1DivByMT1LowbitGT1: 0 + AssertFree1ElementMultiple: 1 + AssertKRingShiftTailWrapOnly: 0 + AssertSummationElementMultiple: 256 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BAddrInterleave: false + BaseName: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT9680-vsketBKOwATFxdutsK5i_NcnvLOonACpwOS25BUY= + BufferLoad: true + BufferStore: 1 + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 256 + DirectToLds: 1 + DirectToLdsA: true + DirectToLdsB: true + DirectToLdsMXSA: true + DirectToLdsMXSB: true + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprMXSA: false + DirectToVgprMXSB: false + DirectToVgprSparseMetadata: false + DtlPlusLdsBuf: 0 
+ EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: false + ExpertSchedulingMode: 0 + ExtraLatencyForLR: 0 + ExtraMiLatencyLeft: -1 + ForceDisableShadowInit: false + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 32 + GlobalReadVectorWidthB: 32 + GlobalReadVectorWidthMXSA: 3 + GlobalReadVectorWidthMXSB: 12 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + KRingShift: false + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT96x384x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT6_6_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_16_1 + LDSTrInst: false + LSCA: 256 + LSCB: 256 + LSCMXSA: 8 + LSCMXSB: 8 + LSPA: 32 + LSPB: 32 + LSPMXSA: 96 + LSPMXSB: 384 + LVCA: 8 + LVCB: 8 + LVCMXSA: 3 + LVCMXSB: 1 + LVPA: 1 + LVPB: 1 + LVPMXSA: 32 + LVPMXSB: 32 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsBlockSizePerPadMXSA: 0 + LdsBlockSizePerPadMXSB: 0 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 130816 + LdsInitCVgprs: false + LdsNumBytes: 130816 + LdsNumElementsAlignedA: 12288 + 
LdsNumElementsAlignedB: 49152 + LdsNumElementsAlignedMXSA: 768 + LdsNumElementsAlignedMXSB: 3072 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 16128 + LdsOffsetB_Blk: 81664 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMXSA: 12288 + LdsOffsetMXSA_Blk: 77824 + LdsOffsetMXSB: 13056 + LdsOffsetMXSB_Blk: 78592 + LdsOffsetMetadata: 16128 + LdsOffsetMetadata_Blk: 81664 + LdsPadA: 0 + LdsPadB: 0 + LdsPadMXSA: 0 + LdsPadMXSB: 0 + LdsPadMetadata: 0 + LocalReadVectorWidth: -1 + LocalReadVectorWidthA: 32 + LocalReadVectorWidthB: 32 + LocalReadVectorWidthMXSA: 1 + LocalReadVectorWidthMXSB: 1 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LocalWriteUseSgprMXSA: true + LocalWriteUseSgprMXSB: true + LoopIters: 2 + LoopUnroll: 256 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMXSA: 1 + MIInputPerThreadMXSB: 1 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [1, 4] + MIWaveTile: [6, 6] + MIWaveTileA: 6 + MIWaveTileB: 6 + MIWaveTileMXSA: 6 + MIWaveTileMXSB: 6 + MIWaveTileMetadata: 0 + MacroTile0: 96 + MacroTile1: 384 + MacroTileA: 96 + MacroTileB: 384 + MacroTileMXSA: 96 + MacroTileMXSB: 384 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + MinGRIncPerMfma: 3 + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonDTLTailLoopMXSA: true + NonDTLTailLoopMXSB: true + NonTemporal: -1 + NonTemporalA: 4 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 4 + NonTemporalE: 0 + NonTemporalMXSA: 0 + 
NonTemporalMXSB: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 144 + NumGlobalWriteVectorsPerThread: 144 + NumLdsBlk: 2 + NumLoadsA: 3 + NumLoadsB: 12 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsCoalescedMXSA: 1 + NumLoadsCoalescedMXSB: 1 + NumLoadsMXSA: 1 + NumLoadsMXSB: 1 + NumLoadsPerpendicularA: 3 + NumLoadsPerpendicularB: 12 + NumLoadsPerpendicularMXSA: 1 + NumLoadsPerpendicularMXSB: 1 + NumThreads: 256 + NumTotalPackedLoadsA: -1 + NumTotalPackedLoadsB: -1 + NumTotalPackedLoadsMXSA: -1 + NumTotalPackedLoadsMXSB: -1 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGROverBarrier: 0 + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 51 + SolutionNameMin: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT96x384x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT6_6_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM16_WGMXCC2_WGMXCCGn1 + SourceSwap: false + SpaceFillingAlgo: [] + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 4 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 4 + SubGroup1: 64 + SubGroupA: 4 + SubGroupB: 64 + SubGroupMXSA: 4 + SubGroupMXSB: 
64 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: 0 + TailloopInNll: false + ThreadTile: [1, 1] + ThreadTile0: 24 + ThreadTile1: 6 + ThreadTileA: 24 + ThreadTileB: 6 + ThreadTileMXSA: 24 + ThreadTileMXSB: 6 + TransposeLDS: 1 + TransposeLDSMetadata: -1 + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMXSA: true + UnrollMajorLDSMXSB: true + Use64bShadowLimit: false + Use64bShadowLimitMX: false + UseCustomMainLoopSchedule: 0 + UseDirect32XEmulation: false + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: false + UseGeneralizedNLCOneA: false + UseGeneralizedNLCOneB: false + UseGeneralizedNLCOneMXSA: false + UseGeneralizedNLCOneMXSB: false + UseGeneralizedNLCOneMetadata: false + UseInstOffsetForGRO: 0 + UseMFMAF32XEmulation: false + UsePLRPack: 0 + UseSgprForGRO: -1 + UseSubtileImpl: true + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 1 + VectorWidthMXSA: 1 + VectorWidthMXSB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMXSA: 0 + WaveSeparateGlobalReadMXSB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 16 + WorkGroupMappingXCC: 2 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 256 + _DepthUA: 256 + _DepthUB: 256 + _DepthUMXSA: 8 + _DepthUMXSB: 8 + _DepthUMetadata: 256 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + enableLDSTrMXSA: false + enableLDSTrMXSB: false + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + tailLoopOptMXSA: false + tailLoopOptMXSB: false +- null +- null +- null +- null +- DeviceEfficiency +- 
Prediction diff --git a/projects/hipblaslt/library/src/amd_detail/rocblaslt/src/Tensile/Logic/asm_full/gfx950/gfx950/Origami/Origami_nta4/gfx950_Cijk_Alik_Bljk_F4SS_MXA32_MXB32_BH_BiasS_HAS_SAB_SAV_UserArgs.yaml b/projects/hipblaslt/library/src/amd_detail/rocblaslt/src/Tensile/Logic/asm_full/gfx950/gfx950/Origami/Origami_nta4/gfx950_Cijk_Alik_Bljk_F4SS_MXA32_MXB32_BH_BiasS_HAS_SAB_SAV_UserArgs.yaml new file mode 100644 index 00000000000..8ed8a54a7a9 --- /dev/null +++ b/projects/hipblaslt/library/src/amd_detail/rocblaslt/src/Tensile/Logic/asm_full/gfx950/gfx950/Origami/Origami_nta4/gfx950_Cijk_Alik_Bljk_F4SS_MXA32_MXB32_BH_BiasS_HAS_SAB_SAV_UserArgs.yaml @@ -0,0 +1,16950 @@ +- {MinimumRequiredVersion: 5.0.0} +- gfx950 +- gfx950 +- [Device 75a0] +- Activation: true + ActivationComputeDataType: 0 + ActivationNoGuard: false + ActivationType: hipblaslt_all + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + BetaOnlyUseBias: false + BiasDataTypeList: [0] + BiasSrc: D + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 21 + DataTypeA: 21 + DataTypeAmaxD: 0 + DataTypeB: 21 + DataTypeE: 0 + DataTypeMXSA: 22 + DataTypeMXSB: 22 + DestDataType: 0 + F32XdlMathOp: 0 + Gradient: false + GroupedGemm: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index01MXSA: 0 + Index01MXSB: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexAssignmentsMXSA: [3, 0, 2] + IndexAssignmentsMXSB: [3, 1, 2] + IndexAssignmentsMetadata: [3, 0, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndexUnrollM: 0 + IndexUnrollMXSA: 0 + IndexUnrollMXSB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MXBlockA: 32 + MXBlockB: 32 + MacDataTypeA: 21 + MacDataTypeB: 21 + MirrorDimsA: [] + MirrorDimsB: [] + MirrorDimsMXSA: [] + MirrorDimsMXSB: [] + MirrorDimsMetadata: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 
+ NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + OutputAmaxD: false + SetConstStrideA: [] + SetConstStrideB: [] + SetConstStrideBias: [] + SilentHighPrecisionAccumulate: false + Sparse: 0 + StochasticRounding: false + StridedBatched: true + SupportUserArgs: true + SwizzleTensorA: false + SwizzleTensorB: false + TLUA: false + TLUB: false + TLUMXSA: false + TLUMXSB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: 1 + TransposeB: 0 + UseBeta: true + UseBias: 1 + UseE: false + UseInitialStridesAB: false + UseInitialStridesCD: false + UseScaleAB: Scalar + UseScaleAlphaVec: 1 + UseScaleCD: false +- - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AdaptiveGemm: 0 + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1DivByMT1LowbitGT1: 0 + AssertFree1ElementMultiple: 1 + AssertKRingShiftTailWrapOnly: 0 + AssertSummationElementMultiple: 256 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BAddrInterleave: false + BaseName: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT64hIsnhFT2L5uw_T8ryWf3mri6PtKpwJ1E97MC3rNtxpw= + BufferLoad: true + BufferStore: 1 + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 256 + DirectToLds: 1 + DirectToLdsA: true + DirectToLdsB: true + DirectToLdsMXSA: true + DirectToLdsMXSB: true + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprMXSA: false + DirectToVgprMXSB: false + DirectToVgprSparseMetadata: false + DtlPlusLdsBuf: 0 + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: false + ExpertSchedulingMode: 0 + ExtraLatencyForLR: 0 + ExtraMiLatencyLeft: -1 + ForceDisableShadowInit: false + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + 
GlobalReadVectorWidthA: 32 + GlobalReadVectorWidthB: 32 + GlobalReadVectorWidthMXSA: 2 + GlobalReadVectorWidthMXSB: 2 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + KRingShift: false + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT64x64x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT2_2_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 256 + LSCB: 256 + LSCMXSA: 8 + LSCMXSB: 8 + LSPA: 32 + LSPB: 32 + LSPMXSA: 64 + LSPMXSB: 64 + LVCA: 8 + LVCB: 8 + LVCMXSA: 4 + LVCMXSB: 4 + LVPA: 1 + LVPB: 1 + LVPMXSA: 32 + LVPMXSB: 32 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsBlockSizePerPadMXSA: 0 + LdsBlockSizePerPadMXSB: 0 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 50176 + LdsInitCVgprs: false + LdsNumBytes: 50176 + LdsNumElementsAlignedA: 8192 + LdsNumElementsAlignedB: 8192 + LdsNumElementsAlignedMXSA: 512 + LdsNumElementsAlignedMXSB: 512 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 32768 + LdsOffsetB: 9216 + LdsOffsetB_Blk: 41984 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + 
LdsOffsetBiasNonGSU: 0 + LdsOffsetMXSA: 8192 + LdsOffsetMXSA_Blk: 40960 + LdsOffsetMXSB: 8704 + LdsOffsetMXSB_Blk: 41472 + LdsOffsetMetadata: 9216 + LdsOffsetMetadata_Blk: 41984 + LdsPadA: 0 + LdsPadB: 0 + LdsPadMXSA: 0 + LdsPadMXSB: 0 + LdsPadMetadata: 0 + LocalReadVectorWidth: -1 + LocalReadVectorWidthA: 32 + LocalReadVectorWidthB: 32 + LocalReadVectorWidthMXSA: 1 + LocalReadVectorWidthMXSB: 1 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LocalWriteUseSgprMXSA: true + LocalWriteUseSgprMXSB: true + LoopIters: 2 + LoopUnroll: 256 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMXSA: 1 + MIInputPerThreadMXSB: 1 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [2, 2] + MIWaveTileA: 2 + MIWaveTileB: 2 + MIWaveTileMXSA: 2 + MIWaveTileMXSB: 2 + MIWaveTileMetadata: 0 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileMXSA: 64 + MacroTileMXSB: 64 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + MinGRIncPerMfma: 3 + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonDTLTailLoopMXSA: true + NonDTLTailLoopMXSB: true + NonTemporal: -1 + NonTemporalA: 4 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 4 + NonTemporalE: 0 + NonTemporalMXSA: 0 + NonTemporalMXSB: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 16 + NumLdsBlk: 2 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsCoalescedMXSA: 1 
+ NumLoadsCoalescedMXSB: 1 + NumLoadsMXSA: 1 + NumLoadsMXSB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularMXSA: 1 + NumLoadsPerpendicularMXSB: 1 + NumThreads: 256 + NumTotalPackedLoadsA: -1 + NumTotalPackedLoadsB: -1 + NumTotalPackedLoadsMXSA: -1 + NumTotalPackedLoadsMXSB: -1 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGROverBarrier: 0 + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 0 + SolutionNameMin: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT64x64x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT2_2_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM16_WGMXCC2_WGMXCCGn1 + SourceSwap: false + SpaceFillingAlgo: [] + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 4 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SubGroupMXSA: 8 + SubGroupMXSB: 32 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: 0 + TailloopInNll: false + ThreadTile: [1, 1] + ThreadTile0: 8 + ThreadTile1: 2 + ThreadTileA: 8 + ThreadTileB: 2 + ThreadTileMXSA: 8 + ThreadTileMXSB: 2 + TransposeLDS: 1 + TransposeLDSMetadata: -1 + ULSGRODoubleG2L: 0 + 
UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMXSA: true + UnrollMajorLDSMXSB: true + Use64bShadowLimit: false + Use64bShadowLimitMX: false + UseCustomMainLoopSchedule: 0 + UseDirect32XEmulation: false + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: false + UseGeneralizedNLCOneA: false + UseGeneralizedNLCOneB: false + UseGeneralizedNLCOneMXSA: false + UseGeneralizedNLCOneMXSB: false + UseGeneralizedNLCOneMetadata: false + UseInstOffsetForGRO: 0 + UseMFMAF32XEmulation: false + UsePLRPack: 0 + UseSgprForGRO: -1 + UseSubtileImpl: true + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 1 + VectorWidthMXSA: 1 + VectorWidthMXSB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMXSA: 0 + WaveSeparateGlobalReadMXSB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 16 + WorkGroupMappingXCC: 2 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 256 + _DepthUA: 256 + _DepthUB: 256 + _DepthUMXSA: 8 + _DepthUMXSB: 8 + _DepthUMetadata: 256 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + enableLDSTrMXSA: false + enableLDSTrMXSB: false + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + tailLoopOptMXSA: false + tailLoopOptMXSB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AdaptiveGemm: 0 + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1DivByMT1LowbitGT1: 0 + AssertFree1ElementMultiple: 1 + AssertKRingShiftTailWrapOnly: 0 + AssertSummationElementMultiple: 256 + 
AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BAddrInterleave: false + BaseName: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT128AMjoHHGnVSJ1fjvuIJeKPVtejro7lpRL6TixtWnDac= + BufferLoad: true + BufferStore: 1 + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 256 + DirectToLds: 1 + DirectToLdsA: true + DirectToLdsB: true + DirectToLdsMXSA: true + DirectToLdsMXSB: true + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprMXSA: false + DirectToVgprMXSB: false + DirectToVgprSparseMetadata: false + DtlPlusLdsBuf: 0 + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: false + ExpertSchedulingMode: 0 + ExtraLatencyForLR: 0 + ExtraMiLatencyLeft: -1 + ForceDisableShadowInit: false + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 32 + GlobalReadVectorWidthB: 32 + GlobalReadVectorWidthMXSA: 4 + GlobalReadVectorWidthMXSB: 4 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + KRingShift: false + Kernel: true + KernelLanguage: Assembly + KernelNameMin: 
Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT128x128x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT4_4_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 256 + LSCB: 256 + LSCMXSA: 8 + LSCMXSB: 8 + LSPA: 32 + LSPB: 32 + LSPMXSA: 128 + LSPMXSB: 128 + LVCA: 8 + LVCB: 8 + LVCMXSA: 2 + LVCMXSB: 2 + LVPA: 1 + LVPB: 1 + LVPMXSA: 32 + LVPMXSB: 32 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsBlockSizePerPadMXSA: 0 + LdsBlockSizePerPadMXSB: 0 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 100352 + LdsInitCVgprs: false + LdsNumBytes: 100352 + LdsNumElementsAlignedA: 16384 + LdsNumElementsAlignedB: 16384 + LdsNumElementsAlignedMXSA: 1024 + LdsNumElementsAlignedMXSB: 1024 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 18432 + LdsOffsetB_Blk: 83968 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMXSA: 16384 + LdsOffsetMXSA_Blk: 81920 + LdsOffsetMXSB: 17408 + LdsOffsetMXSB_Blk: 82944 + LdsOffsetMetadata: 18432 + LdsOffsetMetadata_Blk: 83968 + LdsPadA: 0 + LdsPadB: 0 + LdsPadMXSA: 0 + LdsPadMXSB: 0 + LdsPadMetadata: 0 + LocalReadVectorWidth: -1 + LocalReadVectorWidthA: 32 + LocalReadVectorWidthB: 32 + LocalReadVectorWidthMXSA: 1 + LocalReadVectorWidthMXSB: 1 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LocalWriteUseSgprMXSA: true + LocalWriteUseSgprMXSB: true + LoopIters: 2 + LoopUnroll: 256 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + 
MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMXSA: 1 + MIInputPerThreadMXSB: 1 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MIWaveTileMXSA: 4 + MIWaveTileMXSB: 4 + MIWaveTileMetadata: 0 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileMXSA: 128 + MacroTileMXSB: 128 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + MinGRIncPerMfma: 3 + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonDTLTailLoopMXSA: true + NonDTLTailLoopMXSB: true + NonTemporal: -1 + NonTemporalA: 4 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 4 + NonTemporalE: 0 + NonTemporalMXSA: 0 + NonTemporalMXSB: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 64 + NumLdsBlk: 2 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsCoalescedMXSA: 1 + NumLoadsCoalescedMXSB: 1 + NumLoadsMXSA: 1 + NumLoadsMXSB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumLoadsPerpendicularMXSA: 1 + NumLoadsPerpendicularMXSB: 1 + NumThreads: 256 + NumTotalPackedLoadsA: -1 + NumTotalPackedLoadsB: -1 + NumTotalPackedLoadsMXSA: -1 + NumTotalPackedLoadsMXSB: -1 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGROverBarrier: 0 + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 1 + SolutionNameMin: 
Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT128x128x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT4_4_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM16_WGMXCC2_WGMXCCGn1 + SourceSwap: false + SpaceFillingAlgo: [] + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 4 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SubGroupMXSA: 8 + SubGroupMXSB: 32 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: 0 + TailloopInNll: false + ThreadTile: [1, 1] + ThreadTile0: 16 + ThreadTile1: 4 + ThreadTileA: 16 + ThreadTileB: 4 + ThreadTileMXSA: 16 + ThreadTileMXSB: 4 + TransposeLDS: 1 + TransposeLDSMetadata: -1 + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMXSA: true + UnrollMajorLDSMXSB: true + Use64bShadowLimit: false + Use64bShadowLimitMX: false + UseCustomMainLoopSchedule: 0 + UseDirect32XEmulation: false + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: false + UseGeneralizedNLCOneA: false + UseGeneralizedNLCOneB: false + UseGeneralizedNLCOneMXSA: false + UseGeneralizedNLCOneMXSB: false + UseGeneralizedNLCOneMetadata: false + UseInstOffsetForGRO: 0 + UseMFMAF32XEmulation: false + UsePLRPack: 0 + UseSgprForGRO: -1 + UseSubtileImpl: true + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 1 + 
VectorWidthMXSA: 1 + VectorWidthMXSB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMXSA: 0 + WaveSeparateGlobalReadMXSB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 16 + WorkGroupMappingXCC: 2 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 256 + _DepthUA: 256 + _DepthUB: 256 + _DepthUMXSA: 8 + _DepthUMXSB: 8 + _DepthUMetadata: 256 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + enableLDSTrMXSA: false + enableLDSTrMXSB: false + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + tailLoopOptMXSA: false + tailLoopOptMXSB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AdaptiveGemm: 0 + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1DivByMT1LowbitGT1: 0 + AssertFree1ElementMultiple: 1 + AssertKRingShiftTailWrapOnly: 0 + AssertSummationElementMultiple: 256 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BAddrInterleave: false + BaseName: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT19tVFE7Mochg2Q71HzQS4tIGctt0Td3p2SmmxHjjWSYTc= + BufferLoad: true + BufferStore: 1 + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 256 + DirectToLds: 1 + DirectToLdsA: true + DirectToLdsB: true + DirectToLdsMXSA: true + DirectToLdsMXSB: true + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprMXSA: false + DirectToVgprMXSB: false + DirectToVgprSparseMetadata: false + DtlPlusLdsBuf: 0 
+ EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: false + ExpertSchedulingMode: 0 + ExtraLatencyForLR: 0 + ExtraMiLatencyLeft: -1 + ForceDisableShadowInit: false + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 32 + GlobalReadVectorWidthB: 32 + GlobalReadVectorWidthMXSA: 6 + GlobalReadVectorWidthMXSB: 6 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + KRingShift: false + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT192x192x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT6_6_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 256 + LSCB: 256 + LSCMXSA: 8 + LSCMXSB: 8 + LSPA: 32 + LSPB: 32 + LSPMXSA: 192 + LSPMXSB: 192 + LVCA: 8 + LVCB: 8 + LVCMXSA: 2 + LVCMXSB: 2 + LVPA: 1 + LVPB: 1 + LVPMXSA: 32 + LVPMXSB: 32 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsBlockSizePerPadMXSA: 0 + LdsBlockSizePerPadMXSB: 0 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 117760 + LdsInitCVgprs: false + LdsNumBytes: 117760 + LdsNumElementsAlignedA: 24576 + 
LdsNumElementsAlignedB: 24576 + LdsNumElementsAlignedMXSA: 1536 + LdsNumElementsAlignedMXSB: 1536 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 27648 + LdsOffsetB_Blk: 93184 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMXSA: 24576 + LdsOffsetMXSA_Blk: 90112 + LdsOffsetMXSB: 26112 + LdsOffsetMXSB_Blk: 91648 + LdsOffsetMetadata: 27648 + LdsOffsetMetadata_Blk: 93184 + LdsPadA: 0 + LdsPadB: 0 + LdsPadMXSA: 0 + LdsPadMXSB: 0 + LdsPadMetadata: 0 + LocalReadVectorWidth: -1 + LocalReadVectorWidthA: 32 + LocalReadVectorWidthB: 32 + LocalReadVectorWidthMXSA: 1 + LocalReadVectorWidthMXSB: 1 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LocalWriteUseSgprMXSA: true + LocalWriteUseSgprMXSB: true + LoopIters: 2 + LoopUnroll: 256 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMXSA: 1 + MIInputPerThreadMXSB: 1 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [6, 6] + MIWaveTileA: 6 + MIWaveTileB: 6 + MIWaveTileMXSA: 6 + MIWaveTileMXSB: 6 + MIWaveTileMetadata: 0 + MacroTile0: 192 + MacroTile1: 192 + MacroTileA: 192 + MacroTileB: 192 + MacroTileMXSA: 192 + MacroTileMXSB: 192 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + MinGRIncPerMfma: 3 + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonDTLTailLoopMXSA: true + NonDTLTailLoopMXSB: true + NonTemporal: -1 + NonTemporalA: 4 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 4 + NonTemporalE: 0 + NonTemporalMXSA: 0 + 
NonTemporalMXSB: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 144 + NumGlobalWriteVectorsPerThread: 144 + NumLdsBlk: 2 + NumLoadsA: 6 + NumLoadsB: 6 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsCoalescedMXSA: 1 + NumLoadsCoalescedMXSB: 1 + NumLoadsMXSA: 1 + NumLoadsMXSB: 1 + NumLoadsPerpendicularA: 6 + NumLoadsPerpendicularB: 6 + NumLoadsPerpendicularMXSA: 1 + NumLoadsPerpendicularMXSB: 1 + NumThreads: 256 + NumTotalPackedLoadsA: -1 + NumTotalPackedLoadsB: -1 + NumTotalPackedLoadsMXSA: -1 + NumTotalPackedLoadsMXSB: -1 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGROverBarrier: 0 + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 2 + SolutionNameMin: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT192x192x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT6_6_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM16_WGMXCC2_WGMXCCGn1 + SourceSwap: false + SpaceFillingAlgo: [] + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 4 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SubGroupMXSA: 8 + SubGroupMXSB: 32 
+ SuppressNoLoadLoop: false + SwapGlobalReadOrder: 0 + TailloopInNll: false + ThreadTile: [1, 1] + ThreadTile0: 24 + ThreadTile1: 6 + ThreadTileA: 24 + ThreadTileB: 6 + ThreadTileMXSA: 24 + ThreadTileMXSB: 6 + TransposeLDS: 1 + TransposeLDSMetadata: -1 + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMXSA: true + UnrollMajorLDSMXSB: true + Use64bShadowLimit: false + Use64bShadowLimitMX: false + UseCustomMainLoopSchedule: 0 + UseDirect32XEmulation: false + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: false + UseGeneralizedNLCOneA: false + UseGeneralizedNLCOneB: false + UseGeneralizedNLCOneMXSA: false + UseGeneralizedNLCOneMXSB: false + UseGeneralizedNLCOneMetadata: false + UseInstOffsetForGRO: 0 + UseMFMAF32XEmulation: false + UsePLRPack: 0 + UseSgprForGRO: -1 + UseSubtileImpl: true + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 1 + VectorWidthMXSA: 1 + VectorWidthMXSB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMXSA: 0 + WaveSeparateGlobalReadMXSB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 16 + WorkGroupMappingXCC: 2 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 256 + _DepthUA: 256 + _DepthUB: 256 + _DepthUMXSA: 8 + _DepthUMXSB: 8 + _DepthUMetadata: 256 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + enableLDSTrMXSA: false + enableLDSTrMXSB: false + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + tailLoopOptMXSA: false + tailLoopOptMXSB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: 
false + ActivationFused: true + AdaptiveGemm: 0 + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1DivByMT1LowbitGT1: 0 + AssertFree1ElementMultiple: 1 + AssertKRingShiftTailWrapOnly: 0 + AssertSummationElementMultiple: 256 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BAddrInterleave: false + BaseName: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT25YgPR4XPaMfFjkFVdEJNxrCtJ2m9IQRE3ggDN5hyNKfg= + BufferLoad: true + BufferStore: 1 + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 256 + DirectToLds: 1 + DirectToLdsA: true + DirectToLdsB: true + DirectToLdsMXSA: true + DirectToLdsMXSB: true + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprMXSA: false + DirectToVgprMXSB: false + DirectToVgprSparseMetadata: false + DtlPlusLdsBuf: 0 + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: false + ExpertSchedulingMode: 0 + ExtraLatencyForLR: 0 + ExtraMiLatencyLeft: -1 + ForceDisableShadowInit: false + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 32 + GlobalReadVectorWidthB: 32 + GlobalReadVectorWidthMXSA: 4 + GlobalReadVectorWidthMXSB: 4 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + KRingShift: false + Kernel: true + KernelLanguage: Assembly + KernelNameMin: 
Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT256x256x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT8_8_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 256 + LSCB: 256 + LSCMXSA: 8 + LSCMXSB: 8 + LSPA: 32 + LSPB: 32 + LSPMXSA: 128 + LSPMXSB: 128 + LVCA: 8 + LVCB: 8 + LVCMXSA: 2 + LVCMXSB: 2 + LVPA: 1 + LVPB: 1 + LVPMXSA: 32 + LVPMXSB: 32 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsBlockSizePerPadMXSA: 0 + LdsBlockSizePerPadMXSB: 0 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 139264 + LdsInitCVgprs: false + LdsNumBytes: 139264 + LdsNumElementsAlignedA: 32768 + LdsNumElementsAlignedB: 32768 + LdsNumElementsAlignedMXSA: 2048 + LdsNumElementsAlignedMXSB: 2048 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 69632 + LdsOffsetB: 36864 + LdsOffsetB_Blk: 106496 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMXSA: 32768 + LdsOffsetMXSA_Blk: 102400 + LdsOffsetMXSB: 34816 + LdsOffsetMXSB_Blk: 104448 + LdsOffsetMetadata: 36864 + LdsOffsetMetadata_Blk: 106496 + LdsPadA: 0 + LdsPadB: 0 + LdsPadMXSA: 0 + LdsPadMXSB: 0 + LdsPadMetadata: 0 + LocalReadVectorWidth: -1 + LocalReadVectorWidthA: 32 + LocalReadVectorWidthB: 32 + LocalReadVectorWidthMXSA: 1 + LocalReadVectorWidthMXSB: 1 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LocalWriteUseSgprMXSA: true + LocalWriteUseSgprMXSB: true + LoopIters: 2 + LoopUnroll: 256 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + 
MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMXSA: 1 + MIInputPerThreadMXSB: 1 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [8, 8] + MIWaveTileA: 8 + MIWaveTileB: 8 + MIWaveTileMXSA: 8 + MIWaveTileMXSB: 8 + MIWaveTileMetadata: 0 + MacroTile0: 256 + MacroTile1: 256 + MacroTileA: 256 + MacroTileB: 256 + MacroTileMXSA: 256 + MacroTileMXSB: 256 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + MinGRIncPerMfma: 3 + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonDTLTailLoopMXSA: true + NonDTLTailLoopMXSB: true + NonTemporal: -1 + NonTemporalA: 4 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 4 + NonTemporalE: 0 + NonTemporalMXSA: 0 + NonTemporalMXSB: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 256 + NumGlobalWriteVectorsPerThread: 256 + NumLdsBlk: 2 + NumLoadsA: 8 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsCoalescedMXSA: 1 + NumLoadsCoalescedMXSB: 1 + NumLoadsMXSA: 2 + NumLoadsMXSB: 2 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 8 + NumLoadsPerpendicularMXSA: 2 + NumLoadsPerpendicularMXSB: 2 + NumThreads: 256 + NumTotalPackedLoadsA: -1 + NumTotalPackedLoadsB: -1 + NumTotalPackedLoadsMXSA: -1 + NumTotalPackedLoadsMXSB: -1 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGROverBarrier: 0 + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 3 + 
SolutionNameMin: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT256x256x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT8_8_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM16_WGMXCC2_WGMXCCGn1 + SourceSwap: false + SpaceFillingAlgo: [] + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: true + StoreSyncOpt: 0 + StoreVectorWidth: 4 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SubGroupMXSA: 8 + SubGroupMXSB: 32 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: 0 + TailloopInNll: false + ThreadTile: [1, 1] + ThreadTile0: 32 + ThreadTile1: 8 + ThreadTileA: 32 + ThreadTileB: 8 + ThreadTileMXSA: 32 + ThreadTileMXSB: 8 + TransposeLDS: 1 + TransposeLDSMetadata: -1 + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMXSA: true + UnrollMajorLDSMXSB: true + Use64bShadowLimit: false + Use64bShadowLimitMX: false + UseCustomMainLoopSchedule: 0 + UseDirect32XEmulation: false + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: false + UseGeneralizedNLCOneA: false + UseGeneralizedNLCOneB: false + UseGeneralizedNLCOneMXSA: false + UseGeneralizedNLCOneMXSB: false + UseGeneralizedNLCOneMetadata: false + UseInstOffsetForGRO: 0 + UseMFMAF32XEmulation: false + UsePLRPack: 0 + UseSgprForGRO: -1 + UseSubtileImpl: true + Valid: true + VectorStore: -1 + VectorWidthA: 1 + 
VectorWidthB: 1 + VectorWidthMXSA: 1 + VectorWidthMXSB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMXSA: 0 + WaveSeparateGlobalReadMXSB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 16 + WorkGroupMappingXCC: 2 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 256 + _DepthUA: 256 + _DepthUB: 256 + _DepthUMXSA: 8 + _DepthUMXSB: 8 + _DepthUMetadata: 256 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + enableLDSTrMXSA: false + enableLDSTrMXSB: false + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + tailLoopOptMXSA: false + tailLoopOptMXSB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AdaptiveGemm: 0 + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1DivByMT1LowbitGT1: 0 + AssertFree1ElementMultiple: 1 + AssertKRingShiftTailWrapOnly: 0 + AssertSummationElementMultiple: 256 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BAddrInterleave: false + BaseName: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT64yR3-UwlktRfU59ACgTfmpcg_Yk6f2Lb1knA4QU-BsnY= + BufferLoad: true + BufferStore: 1 + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 256 + DirectToLds: 1 + DirectToLdsA: true + DirectToLdsB: true + DirectToLdsMXSA: true + DirectToLdsMXSB: true + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprMXSA: false + DirectToVgprMXSB: false + DirectToVgprSparseMetadata: false + 
DtlPlusLdsBuf: 0 + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: false + ExpertSchedulingMode: 0 + ExtraLatencyForLR: 0 + ExtraMiLatencyLeft: -1 + ForceDisableShadowInit: false + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 32 + GlobalReadVectorWidthB: 32 + GlobalReadVectorWidthMXSA: 2 + GlobalReadVectorWidthMXSB: 4 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + KRingShift: false + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT64x128x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT2_4_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 256 + LSCB: 256 + LSCMXSA: 8 + LSCMXSB: 8 + LSPA: 32 + LSPB: 32 + LSPMXSA: 64 + LSPMXSB: 128 + LVCA: 8 + LVCB: 8 + LVCMXSA: 4 + LVCMXSB: 2 + LVPA: 1 + LVPB: 1 + LVPMXSA: 32 + LVPMXSB: 32 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsBlockSizePerPadMXSA: 0 + LdsBlockSizePerPadMXSB: 0 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 58880 + LdsInitCVgprs: false + LdsNumBytes: 58880 + 
LdsNumElementsAlignedA: 8192 + LdsNumElementsAlignedB: 16384 + LdsNumElementsAlignedMXSA: 512 + LdsNumElementsAlignedMXSB: 1024 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 32768 + LdsOffsetB: 9728 + LdsOffsetB_Blk: 42496 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMXSA: 8192 + LdsOffsetMXSA_Blk: 40960 + LdsOffsetMXSB: 8704 + LdsOffsetMXSB_Blk: 41472 + LdsOffsetMetadata: 9728 + LdsOffsetMetadata_Blk: 42496 + LdsPadA: 0 + LdsPadB: 0 + LdsPadMXSA: 0 + LdsPadMXSB: 0 + LdsPadMetadata: 0 + LocalReadVectorWidth: -1 + LocalReadVectorWidthA: 32 + LocalReadVectorWidthB: 32 + LocalReadVectorWidthMXSA: 1 + LocalReadVectorWidthMXSB: 1 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LocalWriteUseSgprMXSA: true + LocalWriteUseSgprMXSB: true + LoopIters: 2 + LoopUnroll: 256 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMXSA: 1 + MIInputPerThreadMXSB: 1 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [2, 4] + MIWaveTileA: 2 + MIWaveTileB: 4 + MIWaveTileMXSA: 2 + MIWaveTileMXSB: 4 + MIWaveTileMetadata: 0 + MacroTile0: 64 + MacroTile1: 128 + MacroTileA: 64 + MacroTileB: 128 + MacroTileMXSA: 64 + MacroTileMXSB: 128 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + MinGRIncPerMfma: 3 + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonDTLTailLoopMXSA: true + NonDTLTailLoopMXSB: true + NonTemporal: -1 + NonTemporalA: 4 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 4 + NonTemporalE: 0 + 
NonTemporalMXSA: 0 + NonTemporalMXSB: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 32 + NumLdsBlk: 2 + NumLoadsA: 2 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsCoalescedMXSA: 1 + NumLoadsCoalescedMXSB: 1 + NumLoadsMXSA: 1 + NumLoadsMXSB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 4 + NumLoadsPerpendicularMXSA: 1 + NumLoadsPerpendicularMXSB: 1 + NumThreads: 256 + NumTotalPackedLoadsA: -1 + NumTotalPackedLoadsB: -1 + NumTotalPackedLoadsMXSA: -1 + NumTotalPackedLoadsMXSB: -1 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGROverBarrier: 0 + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 4 + SolutionNameMin: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT64x128x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT2_4_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM16_WGMXCC2_WGMXCCGn1 + SourceSwap: false + SpaceFillingAlgo: [] + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 4 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SubGroupMXSA: 8 + 
SubGroupMXSB: 32 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: 0 + TailloopInNll: false + ThreadTile: [1, 1] + ThreadTile0: 8 + ThreadTile1: 4 + ThreadTileA: 8 + ThreadTileB: 4 + ThreadTileMXSA: 8 + ThreadTileMXSB: 4 + TransposeLDS: 1 + TransposeLDSMetadata: -1 + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMXSA: true + UnrollMajorLDSMXSB: true + Use64bShadowLimit: false + Use64bShadowLimitMX: false + UseCustomMainLoopSchedule: 0 + UseDirect32XEmulation: false + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: false + UseGeneralizedNLCOneA: false + UseGeneralizedNLCOneB: false + UseGeneralizedNLCOneMXSA: false + UseGeneralizedNLCOneMXSB: false + UseGeneralizedNLCOneMetadata: false + UseInstOffsetForGRO: 0 + UseMFMAF32XEmulation: false + UsePLRPack: 0 + UseSgprForGRO: -1 + UseSubtileImpl: true + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 1 + VectorWidthMXSA: 1 + VectorWidthMXSB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMXSA: 0 + WaveSeparateGlobalReadMXSB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 16 + WorkGroupMappingXCC: 2 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 256 + _DepthUA: 256 + _DepthUB: 256 + _DepthUMXSA: 8 + _DepthUMXSB: 8 + _DepthUMetadata: 256 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + enableLDSTrMXSA: false + enableLDSTrMXSB: false + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + tailLoopOptMXSA: false + tailLoopOptMXSB: false + - 1LDSBuffer: 0 + ActivationAlt: false + 
ActivationFuncCall: false + ActivationFused: true + AdaptiveGemm: 0 + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1DivByMT1LowbitGT1: 0 + AssertFree1ElementMultiple: 1 + AssertKRingShiftTailWrapOnly: 0 + AssertSummationElementMultiple: 256 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BAddrInterleave: false + BaseName: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT12ZclQEHdRRO1sBJL3TwidM33yIt-WYnDtDgY9P9JYRX4= + BufferLoad: true + BufferStore: 1 + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 256 + DirectToLds: 1 + DirectToLdsA: true + DirectToLdsB: true + DirectToLdsMXSA: true + DirectToLdsMXSB: true + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprMXSA: false + DirectToVgprMXSB: false + DirectToVgprSparseMetadata: false + DtlPlusLdsBuf: 0 + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: false + ExpertSchedulingMode: 0 + ExtraLatencyForLR: 0 + ExtraMiLatencyLeft: -1 + ForceDisableShadowInit: false + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 32 + GlobalReadVectorWidthB: 32 + GlobalReadVectorWidthMXSA: 4 + GlobalReadVectorWidthMXSB: 2 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + KRingShift: false + Kernel: true + KernelLanguage: Assembly + KernelNameMin: 
Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT128x64x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT4_2_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 256 + LSCB: 256 + LSCMXSA: 8 + LSCMXSB: 8 + LSPA: 32 + LSPB: 32 + LSPMXSA: 128 + LSPMXSB: 64 + LVCA: 8 + LVCB: 8 + LVCMXSA: 2 + LVCMXSB: 4 + LVPA: 1 + LVPB: 1 + LVPMXSA: 32 + LVPMXSB: 32 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsBlockSizePerPadMXSA: 0 + LdsBlockSizePerPadMXSB: 0 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 58880 + LdsInitCVgprs: false + LdsNumBytes: 58880 + LdsNumElementsAlignedA: 16384 + LdsNumElementsAlignedB: 8192 + LdsNumElementsAlignedMXSA: 1024 + LdsNumElementsAlignedMXSB: 512 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 32768 + LdsOffsetB: 17920 + LdsOffsetB_Blk: 50688 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMXSA: 16384 + LdsOffsetMXSA_Blk: 49152 + LdsOffsetMXSB: 17408 + LdsOffsetMXSB_Blk: 50176 + LdsOffsetMetadata: 17920 + LdsOffsetMetadata_Blk: 50688 + LdsPadA: 0 + LdsPadB: 0 + LdsPadMXSA: 0 + LdsPadMXSB: 0 + LdsPadMetadata: 0 + LocalReadVectorWidth: -1 + LocalReadVectorWidthA: 32 + LocalReadVectorWidthB: 32 + LocalReadVectorWidthMXSA: 1 + LocalReadVectorWidthMXSB: 1 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LocalWriteUseSgprMXSA: true + LocalWriteUseSgprMXSB: true + LoopIters: 2 + LoopUnroll: 256 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + 
MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMXSA: 1 + MIInputPerThreadMXSB: 1 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 2] + MIWaveTileA: 4 + MIWaveTileB: 2 + MIWaveTileMXSA: 4 + MIWaveTileMXSB: 2 + MIWaveTileMetadata: 0 + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileMXSA: 128 + MacroTileMXSB: 64 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + MinGRIncPerMfma: 3 + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonDTLTailLoopMXSA: true + NonDTLTailLoopMXSB: true + NonTemporal: -1 + NonTemporalA: 4 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 4 + NonTemporalE: 0 + NonTemporalMXSA: 0 + NonTemporalMXSB: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 32 + NumLdsBlk: 2 + NumLoadsA: 4 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsCoalescedMXSA: 1 + NumLoadsCoalescedMXSB: 1 + NumLoadsMXSA: 1 + NumLoadsMXSB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularMXSA: 1 + NumLoadsPerpendicularMXSB: 1 + NumThreads: 256 + NumTotalPackedLoadsA: -1 + NumTotalPackedLoadsB: -1 + NumTotalPackedLoadsMXSA: -1 + NumTotalPackedLoadsMXSB: -1 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGROverBarrier: 0 + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 5 + SolutionNameMin: 
Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT128x64x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT4_2_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM16_WGMXCC2_WGMXCCGn1 + SourceSwap: false + SpaceFillingAlgo: [] + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 4 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SubGroupMXSA: 8 + SubGroupMXSB: 32 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: 0 + TailloopInNll: false + ThreadTile: [1, 1] + ThreadTile0: 16 + ThreadTile1: 2 + ThreadTileA: 16 + ThreadTileB: 2 + ThreadTileMXSA: 16 + ThreadTileMXSB: 2 + TransposeLDS: 1 + TransposeLDSMetadata: -1 + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMXSA: true + UnrollMajorLDSMXSB: true + Use64bShadowLimit: false + Use64bShadowLimitMX: false + UseCustomMainLoopSchedule: 0 + UseDirect32XEmulation: false + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: false + UseGeneralizedNLCOneA: false + UseGeneralizedNLCOneB: false + UseGeneralizedNLCOneMXSA: false + UseGeneralizedNLCOneMXSB: false + UseGeneralizedNLCOneMetadata: false + UseInstOffsetForGRO: 0 + UseMFMAF32XEmulation: false + UsePLRPack: 0 + UseSgprForGRO: -1 + UseSubtileImpl: true + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 1 + 
VectorWidthMXSA: 1 + VectorWidthMXSB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMXSA: 0 + WaveSeparateGlobalReadMXSB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 16 + WorkGroupMappingXCC: 2 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 256 + _DepthUA: 256 + _DepthUB: 256 + _DepthUMXSA: 8 + _DepthUMXSB: 8 + _DepthUMetadata: 256 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + enableLDSTrMXSA: false + enableLDSTrMXSB: false + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + tailLoopOptMXSA: false + tailLoopOptMXSB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AdaptiveGemm: 0 + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1DivByMT1LowbitGT1: 0 + AssertFree1ElementMultiple: 1 + AssertKRingShiftTailWrapOnly: 0 + AssertSummationElementMultiple: 256 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BAddrInterleave: false + BaseName: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT64hJRTC6QzyytWkHaKB_tyuSMxvHfrHLHmDLGhNVVcILE= + BufferLoad: true + BufferStore: 1 + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 256 + DirectToLds: 1 + DirectToLdsA: true + DirectToLdsB: true + DirectToLdsMXSA: true + DirectToLdsMXSB: true + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprMXSA: false + DirectToVgprMXSB: false + DirectToVgprSparseMetadata: false + DtlPlusLdsBuf: 0 
+ EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: false + ExpertSchedulingMode: 0 + ExtraLatencyForLR: 0 + ExtraMiLatencyLeft: -1 + ForceDisableShadowInit: false + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 32 + GlobalReadVectorWidthB: 32 + GlobalReadVectorWidthMXSA: 2 + GlobalReadVectorWidthMXSB: 6 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + KRingShift: false + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT64x192x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT2_6_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 256 + LSCB: 256 + LSCMXSA: 8 + LSCMXSB: 8 + LSPA: 32 + LSPB: 32 + LSPMXSA: 64 + LSPMXSB: 192 + LVCA: 8 + LVCB: 8 + LVCMXSA: 4 + LVCMXSB: 2 + LVPA: 1 + LVPB: 1 + LVPMXSA: 32 + LVPMXSB: 32 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsBlockSizePerPadMXSA: 0 + LdsBlockSizePerPadMXSB: 0 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 100352 + LdsInitCVgprs: false + LdsNumBytes: 100352 + LdsNumElementsAlignedA: 8192 + 
LdsNumElementsAlignedB: 24576 + LdsNumElementsAlignedMXSA: 512 + LdsNumElementsAlignedMXSB: 1536 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 10240 + LdsOffsetB_Blk: 75776 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMXSA: 8192 + LdsOffsetMXSA_Blk: 73728 + LdsOffsetMXSB: 8704 + LdsOffsetMXSB_Blk: 74240 + LdsOffsetMetadata: 10240 + LdsOffsetMetadata_Blk: 75776 + LdsPadA: 0 + LdsPadB: 0 + LdsPadMXSA: 0 + LdsPadMXSB: 0 + LdsPadMetadata: 0 + LocalReadVectorWidth: -1 + LocalReadVectorWidthA: 32 + LocalReadVectorWidthB: 32 + LocalReadVectorWidthMXSA: 1 + LocalReadVectorWidthMXSB: 1 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LocalWriteUseSgprMXSA: true + LocalWriteUseSgprMXSB: true + LoopIters: 2 + LoopUnroll: 256 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMXSA: 1 + MIInputPerThreadMXSB: 1 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [2, 6] + MIWaveTileA: 2 + MIWaveTileB: 6 + MIWaveTileMXSA: 2 + MIWaveTileMXSB: 6 + MIWaveTileMetadata: 0 + MacroTile0: 64 + MacroTile1: 192 + MacroTileA: 64 + MacroTileB: 192 + MacroTileMXSA: 64 + MacroTileMXSB: 192 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + MinGRIncPerMfma: 3 + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonDTLTailLoopMXSA: true + NonDTLTailLoopMXSB: true + NonTemporal: -1 + NonTemporalA: 4 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 4 + NonTemporalE: 0 + NonTemporalMXSA: 0 + 
NonTemporalMXSB: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 48 + NumGlobalWriteVectorsPerThread: 48 + NumLdsBlk: 2 + NumLoadsA: 2 + NumLoadsB: 6 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsCoalescedMXSA: 1 + NumLoadsCoalescedMXSB: 1 + NumLoadsMXSA: 1 + NumLoadsMXSB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 6 + NumLoadsPerpendicularMXSA: 1 + NumLoadsPerpendicularMXSB: 1 + NumThreads: 256 + NumTotalPackedLoadsA: -1 + NumTotalPackedLoadsB: -1 + NumTotalPackedLoadsMXSA: -1 + NumTotalPackedLoadsMXSB: -1 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGROverBarrier: 0 + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 6 + SolutionNameMin: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT64x192x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT2_6_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM16_WGMXCC2_WGMXCCGn1 + SourceSwap: false + SpaceFillingAlgo: [] + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 4 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SubGroupMXSA: 8 + SubGroupMXSB: 32 + 
SuppressNoLoadLoop: false + SwapGlobalReadOrder: 0 + TailloopInNll: false + ThreadTile: [1, 1] + ThreadTile0: 8 + ThreadTile1: 6 + ThreadTileA: 8 + ThreadTileB: 6 + ThreadTileMXSA: 8 + ThreadTileMXSB: 6 + TransposeLDS: 1 + TransposeLDSMetadata: -1 + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMXSA: true + UnrollMajorLDSMXSB: true + Use64bShadowLimit: false + Use64bShadowLimitMX: false + UseCustomMainLoopSchedule: 0 + UseDirect32XEmulation: false + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: false + UseGeneralizedNLCOneA: false + UseGeneralizedNLCOneB: false + UseGeneralizedNLCOneMXSA: false + UseGeneralizedNLCOneMXSB: false + UseGeneralizedNLCOneMetadata: false + UseInstOffsetForGRO: 0 + UseMFMAF32XEmulation: false + UsePLRPack: 0 + UseSgprForGRO: -1 + UseSubtileImpl: true + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 1 + VectorWidthMXSA: 1 + VectorWidthMXSB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMXSA: 0 + WaveSeparateGlobalReadMXSB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 16 + WorkGroupMappingXCC: 2 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 256 + _DepthUA: 256 + _DepthUB: 256 + _DepthUMXSA: 8 + _DepthUMXSB: 8 + _DepthUMetadata: 256 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + enableLDSTrMXSA: false + enableLDSTrMXSB: false + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + tailLoopOptMXSA: false + tailLoopOptMXSB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: false + 
ActivationFused: true + AdaptiveGemm: 0 + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1DivByMT1LowbitGT1: 0 + AssertFree1ElementMultiple: 1 + AssertKRingShiftTailWrapOnly: 0 + AssertSummationElementMultiple: 256 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BAddrInterleave: false + BaseName: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT19UCUD2vak2YLHvH_Ztg5zXPDJUlX3NzBJWRuGhvKVzu8= + BufferLoad: true + BufferStore: 1 + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 256 + DirectToLds: 1 + DirectToLdsA: true + DirectToLdsB: true + DirectToLdsMXSA: true + DirectToLdsMXSB: true + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprMXSA: false + DirectToVgprMXSB: false + DirectToVgprSparseMetadata: false + DtlPlusLdsBuf: 0 + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: false + ExpertSchedulingMode: 0 + ExtraLatencyForLR: 0 + ExtraMiLatencyLeft: -1 + ForceDisableShadowInit: false + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 32 + GlobalReadVectorWidthB: 32 + GlobalReadVectorWidthMXSA: 6 + GlobalReadVectorWidthMXSB: 2 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + KRingShift: false + Kernel: true + KernelLanguage: Assembly + KernelNameMin: 
Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT192x64x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT6_2_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 256 + LSCB: 256 + LSCMXSA: 8 + LSCMXSB: 8 + LSPA: 32 + LSPB: 32 + LSPMXSA: 192 + LSPMXSB: 64 + LVCA: 8 + LVCB: 8 + LVCMXSA: 2 + LVCMXSB: 4 + LVPA: 1 + LVPB: 1 + LVPMXSA: 32 + LVPMXSB: 32 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsBlockSizePerPadMXSA: 0 + LdsBlockSizePerPadMXSB: 0 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 100352 + LdsInitCVgprs: false + LdsNumBytes: 100352 + LdsNumElementsAlignedA: 24576 + LdsNumElementsAlignedB: 8192 + LdsNumElementsAlignedMXSA: 1536 + LdsNumElementsAlignedMXSB: 512 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 26624 + LdsOffsetB_Blk: 92160 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMXSA: 24576 + LdsOffsetMXSA_Blk: 90112 + LdsOffsetMXSB: 26112 + LdsOffsetMXSB_Blk: 91648 + LdsOffsetMetadata: 26624 + LdsOffsetMetadata_Blk: 92160 + LdsPadA: 0 + LdsPadB: 0 + LdsPadMXSA: 0 + LdsPadMXSB: 0 + LdsPadMetadata: 0 + LocalReadVectorWidth: -1 + LocalReadVectorWidthA: 32 + LocalReadVectorWidthB: 32 + LocalReadVectorWidthMXSA: 1 + LocalReadVectorWidthMXSB: 1 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LocalWriteUseSgprMXSA: true + LocalWriteUseSgprMXSB: true + LoopIters: 2 + LoopUnroll: 256 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + 
MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMXSA: 1 + MIInputPerThreadMXSB: 1 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [6, 2] + MIWaveTileA: 6 + MIWaveTileB: 2 + MIWaveTileMXSA: 6 + MIWaveTileMXSB: 2 + MIWaveTileMetadata: 0 + MacroTile0: 192 + MacroTile1: 64 + MacroTileA: 192 + MacroTileB: 64 + MacroTileMXSA: 192 + MacroTileMXSB: 64 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + MinGRIncPerMfma: 3 + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonDTLTailLoopMXSA: true + NonDTLTailLoopMXSB: true + NonTemporal: -1 + NonTemporalA: 4 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 4 + NonTemporalE: 0 + NonTemporalMXSA: 0 + NonTemporalMXSB: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 48 + NumGlobalWriteVectorsPerThread: 48 + NumLdsBlk: 2 + NumLoadsA: 6 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsCoalescedMXSA: 1 + NumLoadsCoalescedMXSB: 1 + NumLoadsMXSA: 1 + NumLoadsMXSB: 1 + NumLoadsPerpendicularA: 6 + NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularMXSA: 1 + NumLoadsPerpendicularMXSB: 1 + NumThreads: 256 + NumTotalPackedLoadsA: -1 + NumTotalPackedLoadsB: -1 + NumTotalPackedLoadsMXSA: -1 + NumTotalPackedLoadsMXSB: -1 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGROverBarrier: 0 + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 7 + SolutionNameMin: 
Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT192x64x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT6_2_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM16_WGMXCC2_WGMXCCGn1 + SourceSwap: false + SpaceFillingAlgo: [] + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 4 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SubGroupMXSA: 8 + SubGroupMXSB: 32 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: 0 + TailloopInNll: false + ThreadTile: [1, 1] + ThreadTile0: 24 + ThreadTile1: 2 + ThreadTileA: 24 + ThreadTileB: 2 + ThreadTileMXSA: 24 + ThreadTileMXSB: 2 + TransposeLDS: 1 + TransposeLDSMetadata: -1 + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMXSA: true + UnrollMajorLDSMXSB: true + Use64bShadowLimit: false + Use64bShadowLimitMX: false + UseCustomMainLoopSchedule: 0 + UseDirect32XEmulation: false + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: false + UseGeneralizedNLCOneA: false + UseGeneralizedNLCOneB: false + UseGeneralizedNLCOneMXSA: false + UseGeneralizedNLCOneMXSB: false + UseGeneralizedNLCOneMetadata: false + UseInstOffsetForGRO: 0 + UseMFMAF32XEmulation: false + UsePLRPack: 0 + UseSgprForGRO: -1 + UseSubtileImpl: true + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 1 + 
VectorWidthMXSA: 1 + VectorWidthMXSB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMXSA: 0 + WaveSeparateGlobalReadMXSB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 16 + WorkGroupMappingXCC: 2 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 256 + _DepthUA: 256 + _DepthUB: 256 + _DepthUMXSA: 8 + _DepthUMXSB: 8 + _DepthUMetadata: 256 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + enableLDSTrMXSA: false + enableLDSTrMXSB: false + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + tailLoopOptMXSA: false + tailLoopOptMXSB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AdaptiveGemm: 0 + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1DivByMT1LowbitGT1: 0 + AssertFree1ElementMultiple: 1 + AssertKRingShiftTailWrapOnly: 0 + AssertSummationElementMultiple: 256 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BAddrInterleave: false + BaseName: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT64sCK37FdA-O3yvoTtVtg3QFjEpjbAv0RNmVuI_Ry968A= + BufferLoad: true + BufferStore: 1 + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 256 + DirectToLds: 1 + DirectToLdsA: true + DirectToLdsB: true + DirectToLdsMXSA: true + DirectToLdsMXSB: true + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprMXSA: false + DirectToVgprMXSB: false + DirectToVgprSparseMetadata: false + DtlPlusLdsBuf: 0 
+ EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: false + ExpertSchedulingMode: 0 + ExtraLatencyForLR: 0 + ExtraMiLatencyLeft: -1 + ForceDisableShadowInit: false + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 32 + GlobalReadVectorWidthB: 32 + GlobalReadVectorWidthMXSA: 2 + GlobalReadVectorWidthMXSB: 4 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + KRingShift: false + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT64x256x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT2_8_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 256 + LSCB: 256 + LSCMXSA: 8 + LSCMXSB: 8 + LSPA: 32 + LSPB: 32 + LSPMXSA: 64 + LSPMXSB: 128 + LVCA: 8 + LVCB: 8 + LVCMXSA: 4 + LVCMXSB: 2 + LVPA: 1 + LVPB: 1 + LVPMXSA: 32 + LVPMXSB: 32 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsBlockSizePerPadMXSA: 0 + LdsBlockSizePerPadMXSB: 0 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 109056 + LdsInitCVgprs: false + LdsNumBytes: 109056 + LdsNumElementsAlignedA: 8192 + 
LdsNumElementsAlignedB: 32768 + LdsNumElementsAlignedMXSA: 512 + LdsNumElementsAlignedMXSB: 2048 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 10752 + LdsOffsetB_Blk: 76288 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMXSA: 8192 + LdsOffsetMXSA_Blk: 73728 + LdsOffsetMXSB: 8704 + LdsOffsetMXSB_Blk: 74240 + LdsOffsetMetadata: 10752 + LdsOffsetMetadata_Blk: 76288 + LdsPadA: 0 + LdsPadB: 0 + LdsPadMXSA: 0 + LdsPadMXSB: 0 + LdsPadMetadata: 0 + LocalReadVectorWidth: -1 + LocalReadVectorWidthA: 32 + LocalReadVectorWidthB: 32 + LocalReadVectorWidthMXSA: 1 + LocalReadVectorWidthMXSB: 1 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LocalWriteUseSgprMXSA: true + LocalWriteUseSgprMXSB: true + LoopIters: 2 + LoopUnroll: 256 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMXSA: 1 + MIInputPerThreadMXSB: 1 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [2, 8] + MIWaveTileA: 2 + MIWaveTileB: 8 + MIWaveTileMXSA: 2 + MIWaveTileMXSB: 8 + MIWaveTileMetadata: 0 + MacroTile0: 64 + MacroTile1: 256 + MacroTileA: 64 + MacroTileB: 256 + MacroTileMXSA: 64 + MacroTileMXSB: 256 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + MinGRIncPerMfma: 3 + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonDTLTailLoopMXSA: true + NonDTLTailLoopMXSB: true + NonTemporal: -1 + NonTemporalA: 4 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 4 + NonTemporalE: 0 + NonTemporalMXSA: 0 + 
NonTemporalMXSB: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 64 + NumLdsBlk: 2 + NumLoadsA: 2 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsCoalescedMXSA: 1 + NumLoadsCoalescedMXSB: 1 + NumLoadsMXSA: 1 + NumLoadsMXSB: 2 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 8 + NumLoadsPerpendicularMXSA: 1 + NumLoadsPerpendicularMXSB: 2 + NumThreads: 256 + NumTotalPackedLoadsA: -1 + NumTotalPackedLoadsB: -1 + NumTotalPackedLoadsMXSA: -1 + NumTotalPackedLoadsMXSB: -1 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGROverBarrier: 0 + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 8 + SolutionNameMin: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT64x256x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT2_8_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM16_WGMXCC2_WGMXCCGn1 + SourceSwap: false + SpaceFillingAlgo: [] + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 4 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SubGroupMXSA: 8 + SubGroupMXSB: 32 + 
SuppressNoLoadLoop: false + SwapGlobalReadOrder: 0 + TailloopInNll: false + ThreadTile: [1, 1] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + ThreadTileMXSA: 8 + ThreadTileMXSB: 8 + TransposeLDS: 1 + TransposeLDSMetadata: -1 + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMXSA: true + UnrollMajorLDSMXSB: true + Use64bShadowLimit: false + Use64bShadowLimitMX: false + UseCustomMainLoopSchedule: 0 + UseDirect32XEmulation: false + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: false + UseGeneralizedNLCOneA: false + UseGeneralizedNLCOneB: false + UseGeneralizedNLCOneMXSA: false + UseGeneralizedNLCOneMXSB: false + UseGeneralizedNLCOneMetadata: false + UseInstOffsetForGRO: 0 + UseMFMAF32XEmulation: false + UsePLRPack: 0 + UseSgprForGRO: -1 + UseSubtileImpl: true + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 1 + VectorWidthMXSA: 1 + VectorWidthMXSB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMXSA: 0 + WaveSeparateGlobalReadMXSB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 16 + WorkGroupMappingXCC: 2 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 256 + _DepthUA: 256 + _DepthUB: 256 + _DepthUMXSA: 8 + _DepthUMXSB: 8 + _DepthUMetadata: 256 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + enableLDSTrMXSA: false + enableLDSTrMXSB: false + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + tailLoopOptMXSA: false + tailLoopOptMXSB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: false + 
ActivationFused: true + AdaptiveGemm: 0 + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1DivByMT1LowbitGT1: 0 + AssertFree1ElementMultiple: 1 + AssertKRingShiftTailWrapOnly: 0 + AssertSummationElementMultiple: 256 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BAddrInterleave: false + BaseName: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT25Tmym63Cp1OVwmuklAuBYLzHtSt0oUezVbK-tmV8hqYs= + BufferLoad: true + BufferStore: 1 + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 256 + DirectToLds: 1 + DirectToLdsA: true + DirectToLdsB: true + DirectToLdsMXSA: true + DirectToLdsMXSB: true + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprMXSA: false + DirectToVgprMXSB: false + DirectToVgprSparseMetadata: false + DtlPlusLdsBuf: 0 + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: false + ExpertSchedulingMode: 0 + ExtraLatencyForLR: 0 + ExtraMiLatencyLeft: -1 + ForceDisableShadowInit: false + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 32 + GlobalReadVectorWidthB: 32 + GlobalReadVectorWidthMXSA: 4 + GlobalReadVectorWidthMXSB: 2 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + KRingShift: false + Kernel: true + KernelLanguage: Assembly + KernelNameMin: 
Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT256x64x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT8_2_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 256 + LSCB: 256 + LSCMXSA: 8 + LSCMXSB: 8 + LSPA: 32 + LSPB: 32 + LSPMXSA: 128 + LSPMXSB: 64 + LVCA: 8 + LVCB: 8 + LVCMXSA: 2 + LVCMXSB: 4 + LVPA: 1 + LVPB: 1 + LVPMXSA: 32 + LVPMXSB: 32 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsBlockSizePerPadMXSA: 0 + LdsBlockSizePerPadMXSB: 0 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 109056 + LdsInitCVgprs: false + LdsNumBytes: 109056 + LdsNumElementsAlignedA: 32768 + LdsNumElementsAlignedB: 8192 + LdsNumElementsAlignedMXSA: 2048 + LdsNumElementsAlignedMXSB: 512 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 35328 + LdsOffsetB_Blk: 100864 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMXSA: 32768 + LdsOffsetMXSA_Blk: 98304 + LdsOffsetMXSB: 34816 + LdsOffsetMXSB_Blk: 100352 + LdsOffsetMetadata: 35328 + LdsOffsetMetadata_Blk: 100864 + LdsPadA: 0 + LdsPadB: 0 + LdsPadMXSA: 0 + LdsPadMXSB: 0 + LdsPadMetadata: 0 + LocalReadVectorWidth: -1 + LocalReadVectorWidthA: 32 + LocalReadVectorWidthB: 32 + LocalReadVectorWidthMXSA: 1 + LocalReadVectorWidthMXSB: 1 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LocalWriteUseSgprMXSA: true + LocalWriteUseSgprMXSB: true + LoopIters: 2 + LoopUnroll: 256 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + 
MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMXSA: 1 + MIInputPerThreadMXSB: 1 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [8, 2] + MIWaveTileA: 8 + MIWaveTileB: 2 + MIWaveTileMXSA: 8 + MIWaveTileMXSB: 2 + MIWaveTileMetadata: 0 + MacroTile0: 256 + MacroTile1: 64 + MacroTileA: 256 + MacroTileB: 64 + MacroTileMXSA: 256 + MacroTileMXSB: 64 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + MinGRIncPerMfma: 3 + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonDTLTailLoopMXSA: true + NonDTLTailLoopMXSB: true + NonTemporal: -1 + NonTemporalA: 4 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 4 + NonTemporalE: 0 + NonTemporalMXSA: 0 + NonTemporalMXSB: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 64 + NumLdsBlk: 2 + NumLoadsA: 8 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsCoalescedMXSA: 1 + NumLoadsCoalescedMXSB: 1 + NumLoadsMXSA: 2 + NumLoadsMXSB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularMXSA: 2 + NumLoadsPerpendicularMXSB: 1 + NumThreads: 256 + NumTotalPackedLoadsA: -1 + NumTotalPackedLoadsB: -1 + NumTotalPackedLoadsMXSA: -1 + NumTotalPackedLoadsMXSB: -1 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGROverBarrier: 0 + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 9 + SolutionNameMin: 
Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT256x64x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT8_2_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM16_WGMXCC2_WGMXCCGn1 + SourceSwap: false + SpaceFillingAlgo: [] + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 4 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SubGroupMXSA: 8 + SubGroupMXSB: 32 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: 0 + TailloopInNll: false + ThreadTile: [1, 1] + ThreadTile0: 32 + ThreadTile1: 2 + ThreadTileA: 32 + ThreadTileB: 2 + ThreadTileMXSA: 32 + ThreadTileMXSB: 2 + TransposeLDS: 1 + TransposeLDSMetadata: -1 + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMXSA: true + UnrollMajorLDSMXSB: true + Use64bShadowLimit: false + Use64bShadowLimitMX: false + UseCustomMainLoopSchedule: 0 + UseDirect32XEmulation: false + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: false + UseGeneralizedNLCOneA: false + UseGeneralizedNLCOneB: false + UseGeneralizedNLCOneMXSA: false + UseGeneralizedNLCOneMXSB: false + UseGeneralizedNLCOneMetadata: false + UseInstOffsetForGRO: 0 + UseMFMAF32XEmulation: false + UsePLRPack: 0 + UseSgprForGRO: -1 + UseSubtileImpl: true + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 1 + 
VectorWidthMXSA: 1 + VectorWidthMXSB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMXSA: 0 + WaveSeparateGlobalReadMXSB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 16 + WorkGroupMappingXCC: 2 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 256 + _DepthUA: 256 + _DepthUB: 256 + _DepthUMXSA: 8 + _DepthUMXSB: 8 + _DepthUMetadata: 256 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + enableLDSTrMXSA: false + enableLDSTrMXSB: false + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + tailLoopOptMXSA: false + tailLoopOptMXSB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AdaptiveGemm: 0 + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1DivByMT1LowbitGT1: 0 + AssertFree1ElementMultiple: 1 + AssertKRingShiftTailWrapOnly: 0 + AssertSummationElementMultiple: 256 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BAddrInterleave: false + BaseName: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT12Qs31lTSyFaUbDqXQHGpUViUgkCHLEWWfNifVlN3xPcA= + BufferLoad: true + BufferStore: 1 + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 256 + DirectToLds: 1 + DirectToLdsA: true + DirectToLdsB: true + DirectToLdsMXSA: true + DirectToLdsMXSB: true + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprMXSA: false + DirectToVgprMXSB: false + DirectToVgprSparseMetadata: false + DtlPlusLdsBuf: 0 
+ EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: false + ExpertSchedulingMode: 0 + ExtraLatencyForLR: 0 + ExtraMiLatencyLeft: -1 + ForceDisableShadowInit: false + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 32 + GlobalReadVectorWidthB: 32 + GlobalReadVectorWidthMXSA: 4 + GlobalReadVectorWidthMXSB: 6 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + KRingShift: false + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT128x192x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT4_6_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 256 + LSCB: 256 + LSCMXSA: 8 + LSCMXSB: 8 + LSPA: 32 + LSPB: 32 + LSPMXSA: 128 + LSPMXSB: 192 + LVCA: 8 + LVCB: 8 + LVCMXSA: 2 + LVCMXSB: 2 + LVPA: 1 + LVPB: 1 + LVPMXSA: 32 + LVPMXSB: 32 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsBlockSizePerPadMXSA: 0 + LdsBlockSizePerPadMXSB: 0 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 109056 + LdsInitCVgprs: false + LdsNumBytes: 109056 + LdsNumElementsAlignedA: 16384 + 
LdsNumElementsAlignedB: 24576 + LdsNumElementsAlignedMXSA: 1024 + LdsNumElementsAlignedMXSB: 1536 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 18944 + LdsOffsetB_Blk: 84480 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMXSA: 16384 + LdsOffsetMXSA_Blk: 81920 + LdsOffsetMXSB: 17408 + LdsOffsetMXSB_Blk: 82944 + LdsOffsetMetadata: 18944 + LdsOffsetMetadata_Blk: 84480 + LdsPadA: 0 + LdsPadB: 0 + LdsPadMXSA: 0 + LdsPadMXSB: 0 + LdsPadMetadata: 0 + LocalReadVectorWidth: -1 + LocalReadVectorWidthA: 32 + LocalReadVectorWidthB: 32 + LocalReadVectorWidthMXSA: 1 + LocalReadVectorWidthMXSB: 1 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LocalWriteUseSgprMXSA: true + LocalWriteUseSgprMXSB: true + LoopIters: 2 + LoopUnroll: 256 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMXSA: 1 + MIInputPerThreadMXSB: 1 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 6] + MIWaveTileA: 4 + MIWaveTileB: 6 + MIWaveTileMXSA: 4 + MIWaveTileMXSB: 6 + MIWaveTileMetadata: 0 + MacroTile0: 128 + MacroTile1: 192 + MacroTileA: 128 + MacroTileB: 192 + MacroTileMXSA: 128 + MacroTileMXSB: 192 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + MinGRIncPerMfma: 3 + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonDTLTailLoopMXSA: true + NonDTLTailLoopMXSB: true + NonTemporal: -1 + NonTemporalA: 4 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 4 + NonTemporalE: 0 + NonTemporalMXSA: 0 + 
NonTemporalMXSB: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 96 + NumGlobalWriteVectorsPerThread: 96 + NumLdsBlk: 2 + NumLoadsA: 4 + NumLoadsB: 6 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsCoalescedMXSA: 1 + NumLoadsCoalescedMXSB: 1 + NumLoadsMXSA: 1 + NumLoadsMXSB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 6 + NumLoadsPerpendicularMXSA: 1 + NumLoadsPerpendicularMXSB: 1 + NumThreads: 256 + NumTotalPackedLoadsA: -1 + NumTotalPackedLoadsB: -1 + NumTotalPackedLoadsMXSA: -1 + NumTotalPackedLoadsMXSB: -1 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGROverBarrier: 0 + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 10 + SolutionNameMin: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT128x192x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT4_6_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM16_WGMXCC2_WGMXCCGn1 + SourceSwap: false + SpaceFillingAlgo: [] + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 4 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SubGroupMXSA: 8 + SubGroupMXSB: 32 + 
SuppressNoLoadLoop: false + SwapGlobalReadOrder: 0 + TailloopInNll: false + ThreadTile: [1, 1] + ThreadTile0: 16 + ThreadTile1: 6 + ThreadTileA: 16 + ThreadTileB: 6 + ThreadTileMXSA: 16 + ThreadTileMXSB: 6 + TransposeLDS: 1 + TransposeLDSMetadata: -1 + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMXSA: true + UnrollMajorLDSMXSB: true + Use64bShadowLimit: false + Use64bShadowLimitMX: false + UseCustomMainLoopSchedule: 0 + UseDirect32XEmulation: false + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: false + UseGeneralizedNLCOneA: false + UseGeneralizedNLCOneB: false + UseGeneralizedNLCOneMXSA: false + UseGeneralizedNLCOneMXSB: false + UseGeneralizedNLCOneMetadata: false + UseInstOffsetForGRO: 0 + UseMFMAF32XEmulation: false + UsePLRPack: 0 + UseSgprForGRO: -1 + UseSubtileImpl: true + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 1 + VectorWidthMXSA: 1 + VectorWidthMXSB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMXSA: 0 + WaveSeparateGlobalReadMXSB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 16 + WorkGroupMappingXCC: 2 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 256 + _DepthUA: 256 + _DepthUB: 256 + _DepthUMXSA: 8 + _DepthUMXSB: 8 + _DepthUMetadata: 256 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + enableLDSTrMXSA: false + enableLDSTrMXSB: false + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + tailLoopOptMXSA: false + tailLoopOptMXSB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: false 
+ ActivationFused: true + AdaptiveGemm: 0 + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1DivByMT1LowbitGT1: 0 + AssertFree1ElementMultiple: 1 + AssertKRingShiftTailWrapOnly: 0 + AssertSummationElementMultiple: 256 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BAddrInterleave: false + BaseName: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT191Q3CWl85SU-4agRfSbxDDRYJaTIDmYF_adG8G4wdfNo= + BufferLoad: true + BufferStore: 1 + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 256 + DirectToLds: 1 + DirectToLdsA: true + DirectToLdsB: true + DirectToLdsMXSA: true + DirectToLdsMXSB: true + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprMXSA: false + DirectToVgprMXSB: false + DirectToVgprSparseMetadata: false + DtlPlusLdsBuf: 0 + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: false + ExpertSchedulingMode: 0 + ExtraLatencyForLR: 0 + ExtraMiLatencyLeft: -1 + ForceDisableShadowInit: false + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 32 + GlobalReadVectorWidthB: 32 + GlobalReadVectorWidthMXSA: 6 + GlobalReadVectorWidthMXSB: 4 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + KRingShift: false + Kernel: true + KernelLanguage: Assembly + KernelNameMin: 
Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT192x128x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT6_4_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 256 + LSCB: 256 + LSCMXSA: 8 + LSCMXSB: 8 + LSPA: 32 + LSPB: 32 + LSPMXSA: 192 + LSPMXSB: 128 + LVCA: 8 + LVCB: 8 + LVCMXSA: 2 + LVCMXSB: 2 + LVPA: 1 + LVPB: 1 + LVPMXSA: 32 + LVPMXSB: 32 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsBlockSizePerPadMXSA: 0 + LdsBlockSizePerPadMXSB: 0 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 109056 + LdsInitCVgprs: false + LdsNumBytes: 109056 + LdsNumElementsAlignedA: 24576 + LdsNumElementsAlignedB: 16384 + LdsNumElementsAlignedMXSA: 1536 + LdsNumElementsAlignedMXSB: 1024 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 27136 + LdsOffsetB_Blk: 92672 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMXSA: 24576 + LdsOffsetMXSA_Blk: 90112 + LdsOffsetMXSB: 26112 + LdsOffsetMXSB_Blk: 91648 + LdsOffsetMetadata: 27136 + LdsOffsetMetadata_Blk: 92672 + LdsPadA: 0 + LdsPadB: 0 + LdsPadMXSA: 0 + LdsPadMXSB: 0 + LdsPadMetadata: 0 + LocalReadVectorWidth: -1 + LocalReadVectorWidthA: 32 + LocalReadVectorWidthB: 32 + LocalReadVectorWidthMXSA: 1 + LocalReadVectorWidthMXSB: 1 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LocalWriteUseSgprMXSA: true + LocalWriteUseSgprMXSB: true + LoopIters: 2 + LoopUnroll: 256 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + 
MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMXSA: 1 + MIInputPerThreadMXSB: 1 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [6, 4] + MIWaveTileA: 6 + MIWaveTileB: 4 + MIWaveTileMXSA: 6 + MIWaveTileMXSB: 4 + MIWaveTileMetadata: 0 + MacroTile0: 192 + MacroTile1: 128 + MacroTileA: 192 + MacroTileB: 128 + MacroTileMXSA: 192 + MacroTileMXSB: 128 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + MinGRIncPerMfma: 3 + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonDTLTailLoopMXSA: true + NonDTLTailLoopMXSB: true + NonTemporal: -1 + NonTemporalA: 4 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 4 + NonTemporalE: 0 + NonTemporalMXSA: 0 + NonTemporalMXSB: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 96 + NumGlobalWriteVectorsPerThread: 96 + NumLdsBlk: 2 + NumLoadsA: 6 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsCoalescedMXSA: 1 + NumLoadsCoalescedMXSB: 1 + NumLoadsMXSA: 1 + NumLoadsMXSB: 1 + NumLoadsPerpendicularA: 6 + NumLoadsPerpendicularB: 4 + NumLoadsPerpendicularMXSA: 1 + NumLoadsPerpendicularMXSB: 1 + NumThreads: 256 + NumTotalPackedLoadsA: -1 + NumTotalPackedLoadsB: -1 + NumTotalPackedLoadsMXSA: -1 + NumTotalPackedLoadsMXSB: -1 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGROverBarrier: 0 + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 11 + 
SolutionNameMin: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT192x128x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT6_4_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM16_WGMXCC2_WGMXCCGn1 + SourceSwap: false + SpaceFillingAlgo: [] + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 4 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SubGroupMXSA: 8 + SubGroupMXSB: 32 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: 0 + TailloopInNll: false + ThreadTile: [1, 1] + ThreadTile0: 24 + ThreadTile1: 4 + ThreadTileA: 24 + ThreadTileB: 4 + ThreadTileMXSA: 24 + ThreadTileMXSB: 4 + TransposeLDS: 1 + TransposeLDSMetadata: -1 + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMXSA: true + UnrollMajorLDSMXSB: true + Use64bShadowLimit: false + Use64bShadowLimitMX: false + UseCustomMainLoopSchedule: 0 + UseDirect32XEmulation: false + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: false + UseGeneralizedNLCOneA: false + UseGeneralizedNLCOneB: false + UseGeneralizedNLCOneMXSA: false + UseGeneralizedNLCOneMXSB: false + UseGeneralizedNLCOneMetadata: false + UseInstOffsetForGRO: 0 + UseMFMAF32XEmulation: false + UsePLRPack: 0 + UseSgprForGRO: -1 + UseSubtileImpl: true + Valid: true + VectorStore: -1 + VectorWidthA: 1 + 
VectorWidthB: 1 + VectorWidthMXSA: 1 + VectorWidthMXSB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMXSA: 0 + WaveSeparateGlobalReadMXSB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 16 + WorkGroupMappingXCC: 2 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 256 + _DepthUA: 256 + _DepthUB: 256 + _DepthUMXSA: 8 + _DepthUMXSB: 8 + _DepthUMetadata: 256 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + enableLDSTrMXSA: false + enableLDSTrMXSB: false + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + tailLoopOptMXSA: false + tailLoopOptMXSB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AdaptiveGemm: 0 + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1DivByMT1LowbitGT1: 0 + AssertFree1ElementMultiple: 1 + AssertKRingShiftTailWrapOnly: 0 + AssertSummationElementMultiple: 256 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BAddrInterleave: false + BaseName: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT12kwdCHpnp9d41ufLMEwnWbNNXbmlKzUrCttuc0YSlgSY= + BufferLoad: true + BufferStore: 1 + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 256 + DirectToLds: 1 + DirectToLdsA: true + DirectToLdsB: true + DirectToLdsMXSA: true + DirectToLdsMXSB: true + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprMXSA: false + DirectToVgprMXSB: false + DirectToVgprSparseMetadata: false + 
DtlPlusLdsBuf: 0 + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: false + ExpertSchedulingMode: 0 + ExtraLatencyForLR: 0 + ExtraMiLatencyLeft: -1 + ForceDisableShadowInit: false + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 32 + GlobalReadVectorWidthB: 32 + GlobalReadVectorWidthMXSA: 4 + GlobalReadVectorWidthMXSB: 4 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + KRingShift: false + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT128x256x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT4_8_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 256 + LSCB: 256 + LSCMXSA: 8 + LSCMXSB: 8 + LSPA: 32 + LSPB: 32 + LSPMXSA: 128 + LSPMXSB: 128 + LVCA: 8 + LVCB: 8 + LVCMXSA: 2 + LVCMXSB: 2 + LVPA: 1 + LVPB: 1 + LVPMXSA: 32 + LVPMXSB: 32 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsBlockSizePerPadMXSA: 0 + LdsBlockSizePerPadMXSB: 0 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 117760 + LdsInitCVgprs: false + LdsNumBytes: 117760 + 
LdsNumElementsAlignedA: 16384 + LdsNumElementsAlignedB: 32768 + LdsNumElementsAlignedMXSA: 1024 + LdsNumElementsAlignedMXSB: 2048 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 19456 + LdsOffsetB_Blk: 84992 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMXSA: 16384 + LdsOffsetMXSA_Blk: 81920 + LdsOffsetMXSB: 17408 + LdsOffsetMXSB_Blk: 82944 + LdsOffsetMetadata: 19456 + LdsOffsetMetadata_Blk: 84992 + LdsPadA: 0 + LdsPadB: 0 + LdsPadMXSA: 0 + LdsPadMXSB: 0 + LdsPadMetadata: 0 + LocalReadVectorWidth: -1 + LocalReadVectorWidthA: 32 + LocalReadVectorWidthB: 32 + LocalReadVectorWidthMXSA: 1 + LocalReadVectorWidthMXSB: 1 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LocalWriteUseSgprMXSA: true + LocalWriteUseSgprMXSB: true + LoopIters: 2 + LoopUnroll: 256 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMXSA: 1 + MIInputPerThreadMXSB: 1 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 8] + MIWaveTileA: 4 + MIWaveTileB: 8 + MIWaveTileMXSA: 4 + MIWaveTileMXSB: 8 + MIWaveTileMetadata: 0 + MacroTile0: 128 + MacroTile1: 256 + MacroTileA: 128 + MacroTileB: 256 + MacroTileMXSA: 128 + MacroTileMXSB: 256 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + MinGRIncPerMfma: 3 + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonDTLTailLoopMXSA: true + NonDTLTailLoopMXSB: true + NonTemporal: -1 + NonTemporalA: 4 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 4 + NonTemporalE: 
0 + NonTemporalMXSA: 0 + NonTemporalMXSB: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 128 + NumLdsBlk: 2 + NumLoadsA: 4 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsCoalescedMXSA: 1 + NumLoadsCoalescedMXSB: 1 + NumLoadsMXSA: 1 + NumLoadsMXSB: 2 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 8 + NumLoadsPerpendicularMXSA: 1 + NumLoadsPerpendicularMXSB: 2 + NumThreads: 256 + NumTotalPackedLoadsA: -1 + NumTotalPackedLoadsB: -1 + NumTotalPackedLoadsMXSA: -1 + NumTotalPackedLoadsMXSB: -1 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGROverBarrier: 0 + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 12 + SolutionNameMin: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT128x256x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT4_8_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM16_WGMXCC2_WGMXCCGn1 + SourceSwap: false + SpaceFillingAlgo: [] + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 4 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + 
SubGroupMXSA: 8 + SubGroupMXSB: 32 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: 0 + TailloopInNll: false + ThreadTile: [1, 1] + ThreadTile0: 16 + ThreadTile1: 8 + ThreadTileA: 16 + ThreadTileB: 8 + ThreadTileMXSA: 16 + ThreadTileMXSB: 8 + TransposeLDS: 1 + TransposeLDSMetadata: -1 + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMXSA: true + UnrollMajorLDSMXSB: true + Use64bShadowLimit: false + Use64bShadowLimitMX: false + UseCustomMainLoopSchedule: 0 + UseDirect32XEmulation: false + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: false + UseGeneralizedNLCOneA: false + UseGeneralizedNLCOneB: false + UseGeneralizedNLCOneMXSA: false + UseGeneralizedNLCOneMXSB: false + UseGeneralizedNLCOneMetadata: false + UseInstOffsetForGRO: 0 + UseMFMAF32XEmulation: false + UsePLRPack: 0 + UseSgprForGRO: -1 + UseSubtileImpl: true + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 1 + VectorWidthMXSA: 1 + VectorWidthMXSB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMXSA: 0 + WaveSeparateGlobalReadMXSB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 16 + WorkGroupMappingXCC: 2 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 256 + _DepthUA: 256 + _DepthUB: 256 + _DepthUMXSA: 8 + _DepthUMXSB: 8 + _DepthUMetadata: 256 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + enableLDSTrMXSA: false + enableLDSTrMXSB: false + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + tailLoopOptMXSA: false + tailLoopOptMXSB: false + - 1LDSBuffer: 0 + 
ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AdaptiveGemm: 0 + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1DivByMT1LowbitGT1: 0 + AssertFree1ElementMultiple: 1 + AssertKRingShiftTailWrapOnly: 0 + AssertSummationElementMultiple: 256 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BAddrInterleave: false + BaseName: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT25s2SqPqjqQTrutnl4Fjsck3utFxYLnrt2GxRGoMabbq0= + BufferLoad: true + BufferStore: 1 + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 256 + DirectToLds: 1 + DirectToLdsA: true + DirectToLdsB: true + DirectToLdsMXSA: true + DirectToLdsMXSB: true + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprMXSA: false + DirectToVgprMXSB: false + DirectToVgprSparseMetadata: false + DtlPlusLdsBuf: 0 + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: false + ExpertSchedulingMode: 0 + ExtraLatencyForLR: 0 + ExtraMiLatencyLeft: -1 + ForceDisableShadowInit: false + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 32 + GlobalReadVectorWidthB: 32 + GlobalReadVectorWidthMXSA: 4 + GlobalReadVectorWidthMXSB: 4 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + KRingShift: false + Kernel: true + KernelLanguage: Assembly + KernelNameMin: 
Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT256x128x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT8_4_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 256 + LSCB: 256 + LSCMXSA: 8 + LSCMXSB: 8 + LSPA: 32 + LSPB: 32 + LSPMXSA: 128 + LSPMXSB: 128 + LVCA: 8 + LVCB: 8 + LVCMXSA: 2 + LVCMXSB: 2 + LVPA: 1 + LVPB: 1 + LVPMXSA: 32 + LVPMXSB: 32 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsBlockSizePerPadMXSA: 0 + LdsBlockSizePerPadMXSB: 0 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 117760 + LdsInitCVgprs: false + LdsNumBytes: 117760 + LdsNumElementsAlignedA: 32768 + LdsNumElementsAlignedB: 16384 + LdsNumElementsAlignedMXSA: 2048 + LdsNumElementsAlignedMXSB: 1024 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 35840 + LdsOffsetB_Blk: 101376 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMXSA: 32768 + LdsOffsetMXSA_Blk: 98304 + LdsOffsetMXSB: 34816 + LdsOffsetMXSB_Blk: 100352 + LdsOffsetMetadata: 35840 + LdsOffsetMetadata_Blk: 101376 + LdsPadA: 0 + LdsPadB: 0 + LdsPadMXSA: 0 + LdsPadMXSB: 0 + LdsPadMetadata: 0 + LocalReadVectorWidth: -1 + LocalReadVectorWidthA: 32 + LocalReadVectorWidthB: 32 + LocalReadVectorWidthMXSA: 1 + LocalReadVectorWidthMXSB: 1 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LocalWriteUseSgprMXSA: true + LocalWriteUseSgprMXSB: true + LoopIters: 2 + LoopUnroll: 256 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + 
MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMXSA: 1 + MIInputPerThreadMXSB: 1 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [8, 4] + MIWaveTileA: 8 + MIWaveTileB: 4 + MIWaveTileMXSA: 8 + MIWaveTileMXSB: 4 + MIWaveTileMetadata: 0 + MacroTile0: 256 + MacroTile1: 128 + MacroTileA: 256 + MacroTileB: 128 + MacroTileMXSA: 256 + MacroTileMXSB: 128 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + MinGRIncPerMfma: 3 + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonDTLTailLoopMXSA: true + NonDTLTailLoopMXSB: true + NonTemporal: -1 + NonTemporalA: 4 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 4 + NonTemporalE: 0 + NonTemporalMXSA: 0 + NonTemporalMXSB: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 128 + NumLdsBlk: 2 + NumLoadsA: 8 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsCoalescedMXSA: 1 + NumLoadsCoalescedMXSB: 1 + NumLoadsMXSA: 2 + NumLoadsMXSB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 4 + NumLoadsPerpendicularMXSA: 2 + NumLoadsPerpendicularMXSB: 1 + NumThreads: 256 + NumTotalPackedLoadsA: -1 + NumTotalPackedLoadsB: -1 + NumTotalPackedLoadsMXSA: -1 + NumTotalPackedLoadsMXSB: -1 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGROverBarrier: 0 + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 13 + 
SolutionNameMin: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT256x128x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT8_4_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM16_WGMXCC2_WGMXCCGn1 + SourceSwap: false + SpaceFillingAlgo: [] + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 4 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SubGroupMXSA: 8 + SubGroupMXSB: 32 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: 0 + TailloopInNll: false + ThreadTile: [1, 1] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + ThreadTileMXSA: 32 + ThreadTileMXSB: 4 + TransposeLDS: 1 + TransposeLDSMetadata: -1 + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMXSA: true + UnrollMajorLDSMXSB: true + Use64bShadowLimit: false + Use64bShadowLimitMX: false + UseCustomMainLoopSchedule: 0 + UseDirect32XEmulation: false + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: false + UseGeneralizedNLCOneA: false + UseGeneralizedNLCOneB: false + UseGeneralizedNLCOneMXSA: false + UseGeneralizedNLCOneMXSB: false + UseGeneralizedNLCOneMetadata: false + UseInstOffsetForGRO: 0 + UseMFMAF32XEmulation: false + UsePLRPack: 0 + UseSgprForGRO: -1 + UseSubtileImpl: true + Valid: true + VectorStore: -1 + VectorWidthA: 1 + 
VectorWidthB: 1 + VectorWidthMXSA: 1 + VectorWidthMXSB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMXSA: 0 + WaveSeparateGlobalReadMXSB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 16 + WorkGroupMappingXCC: 2 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 256 + _DepthUA: 256 + _DepthUB: 256 + _DepthUMXSA: 8 + _DepthUMXSB: 8 + _DepthUMetadata: 256 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + enableLDSTrMXSA: false + enableLDSTrMXSB: false + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + tailLoopOptMXSA: false + tailLoopOptMXSB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AdaptiveGemm: 0 + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1DivByMT1LowbitGT1: 0 + AssertFree1ElementMultiple: 1 + AssertKRingShiftTailWrapOnly: 0 + AssertSummationElementMultiple: 256 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BAddrInterleave: false + BaseName: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT19--ULatdGkaEyMxugPoO0m7KB92W8g0H0S_IE47lq-mE= + BufferLoad: true + BufferStore: 1 + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 256 + DirectToLds: 1 + DirectToLdsA: true + DirectToLdsB: true + DirectToLdsMXSA: true + DirectToLdsMXSB: true + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprMXSA: false + DirectToVgprMXSB: false + DirectToVgprSparseMetadata: false + 
DtlPlusLdsBuf: 0 + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: false + ExpertSchedulingMode: 0 + ExtraLatencyForLR: 0 + ExtraMiLatencyLeft: -1 + ForceDisableShadowInit: false + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 32 + GlobalReadVectorWidthB: 32 + GlobalReadVectorWidthMXSA: 6 + GlobalReadVectorWidthMXSB: 4 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + KRingShift: false + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT192x256x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT6_8_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 256 + LSCB: 256 + LSCMXSA: 8 + LSCMXSB: 8 + LSPA: 32 + LSPB: 32 + LSPMXSA: 192 + LSPMXSB: 128 + LVCA: 8 + LVCB: 8 + LVCMXSA: 2 + LVCMXSB: 2 + LVPA: 1 + LVPB: 1 + LVPMXSA: 32 + LVPMXSB: 32 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsBlockSizePerPadMXSA: 0 + LdsBlockSizePerPadMXSB: 0 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 126464 + LdsInitCVgprs: false + LdsNumBytes: 126464 + 
LdsNumElementsAlignedA: 24576 + LdsNumElementsAlignedB: 32768 + LdsNumElementsAlignedMXSA: 1536 + LdsNumElementsAlignedMXSB: 2048 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 28160 + LdsOffsetB_Blk: 93696 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMXSA: 24576 + LdsOffsetMXSA_Blk: 90112 + LdsOffsetMXSB: 26112 + LdsOffsetMXSB_Blk: 91648 + LdsOffsetMetadata: 28160 + LdsOffsetMetadata_Blk: 93696 + LdsPadA: 0 + LdsPadB: 0 + LdsPadMXSA: 0 + LdsPadMXSB: 0 + LdsPadMetadata: 0 + LocalReadVectorWidth: -1 + LocalReadVectorWidthA: 32 + LocalReadVectorWidthB: 32 + LocalReadVectorWidthMXSA: 1 + LocalReadVectorWidthMXSB: 1 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LocalWriteUseSgprMXSA: true + LocalWriteUseSgprMXSB: true + LoopIters: 2 + LoopUnroll: 256 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMXSA: 1 + MIInputPerThreadMXSB: 1 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [6, 8] + MIWaveTileA: 6 + MIWaveTileB: 8 + MIWaveTileMXSA: 6 + MIWaveTileMXSB: 8 + MIWaveTileMetadata: 0 + MacroTile0: 192 + MacroTile1: 256 + MacroTileA: 192 + MacroTileB: 256 + MacroTileMXSA: 192 + MacroTileMXSB: 256 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + MinGRIncPerMfma: 3 + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonDTLTailLoopMXSA: true + NonDTLTailLoopMXSB: true + NonTemporal: -1 + NonTemporalA: 4 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 4 + NonTemporalE: 
0 + NonTemporalMXSA: 0 + NonTemporalMXSB: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 192 + NumGlobalWriteVectorsPerThread: 192 + NumLdsBlk: 2 + NumLoadsA: 6 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsCoalescedMXSA: 1 + NumLoadsCoalescedMXSB: 1 + NumLoadsMXSA: 1 + NumLoadsMXSB: 2 + NumLoadsPerpendicularA: 6 + NumLoadsPerpendicularB: 8 + NumLoadsPerpendicularMXSA: 1 + NumLoadsPerpendicularMXSB: 2 + NumThreads: 256 + NumTotalPackedLoadsA: -1 + NumTotalPackedLoadsB: -1 + NumTotalPackedLoadsMXSA: -1 + NumTotalPackedLoadsMXSB: -1 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGROverBarrier: 0 + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 14 + SolutionNameMin: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT192x256x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT6_8_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM16_WGMXCC2_WGMXCCGn1 + SourceSwap: false + SpaceFillingAlgo: [] + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 4 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + 
SubGroupMXSA: 8 + SubGroupMXSB: 32 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: 0 + TailloopInNll: false + ThreadTile: [1, 1] + ThreadTile0: 24 + ThreadTile1: 8 + ThreadTileA: 24 + ThreadTileB: 8 + ThreadTileMXSA: 24 + ThreadTileMXSB: 8 + TransposeLDS: 1 + TransposeLDSMetadata: -1 + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMXSA: true + UnrollMajorLDSMXSB: true + Use64bShadowLimit: false + Use64bShadowLimitMX: false + UseCustomMainLoopSchedule: 0 + UseDirect32XEmulation: false + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: false + UseGeneralizedNLCOneA: false + UseGeneralizedNLCOneB: false + UseGeneralizedNLCOneMXSA: false + UseGeneralizedNLCOneMXSB: false + UseGeneralizedNLCOneMetadata: false + UseInstOffsetForGRO: 0 + UseMFMAF32XEmulation: false + UsePLRPack: 0 + UseSgprForGRO: -1 + UseSubtileImpl: true + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 1 + VectorWidthMXSA: 1 + VectorWidthMXSB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMXSA: 0 + WaveSeparateGlobalReadMXSB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 16 + WorkGroupMappingXCC: 2 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 256 + _DepthUA: 256 + _DepthUB: 256 + _DepthUMXSA: 8 + _DepthUMXSB: 8 + _DepthUMetadata: 256 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + enableLDSTrMXSA: false + enableLDSTrMXSB: false + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + tailLoopOptMXSA: false + tailLoopOptMXSB: false + - 1LDSBuffer: 0 + 
ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AdaptiveGemm: 0 + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1DivByMT1LowbitGT1: 0 + AssertFree1ElementMultiple: 1 + AssertKRingShiftTailWrapOnly: 0 + AssertSummationElementMultiple: 256 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BAddrInterleave: false + BaseName: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT25H-6yQcCGxSKwn5gmGJAHA93oK-ekIRJm5JqDOya5ASk= + BufferLoad: true + BufferStore: 1 + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 256 + DirectToLds: 1 + DirectToLdsA: true + DirectToLdsB: true + DirectToLdsMXSA: true + DirectToLdsMXSB: true + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprMXSA: false + DirectToVgprMXSB: false + DirectToVgprSparseMetadata: false + DtlPlusLdsBuf: 0 + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: false + ExpertSchedulingMode: 0 + ExtraLatencyForLR: 0 + ExtraMiLatencyLeft: -1 + ForceDisableShadowInit: false + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 32 + GlobalReadVectorWidthB: 32 + GlobalReadVectorWidthMXSA: 4 + GlobalReadVectorWidthMXSB: 6 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + KRingShift: false + Kernel: true + KernelLanguage: Assembly + KernelNameMin: 
Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT256x192x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT8_6_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 256 + LSCB: 256 + LSCMXSA: 8 + LSCMXSB: 8 + LSPA: 32 + LSPB: 32 + LSPMXSA: 128 + LSPMXSB: 192 + LVCA: 8 + LVCB: 8 + LVCMXSA: 2 + LVCMXSB: 2 + LVPA: 1 + LVPB: 1 + LVPMXSA: 32 + LVPMXSB: 32 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsBlockSizePerPadMXSA: 0 + LdsBlockSizePerPadMXSB: 0 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 126464 + LdsInitCVgprs: false + LdsNumBytes: 126464 + LdsNumElementsAlignedA: 32768 + LdsNumElementsAlignedB: 24576 + LdsNumElementsAlignedMXSA: 2048 + LdsNumElementsAlignedMXSB: 1536 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 36352 + LdsOffsetB_Blk: 101888 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMXSA: 32768 + LdsOffsetMXSA_Blk: 98304 + LdsOffsetMXSB: 34816 + LdsOffsetMXSB_Blk: 100352 + LdsOffsetMetadata: 36352 + LdsOffsetMetadata_Blk: 101888 + LdsPadA: 0 + LdsPadB: 0 + LdsPadMXSA: 0 + LdsPadMXSB: 0 + LdsPadMetadata: 0 + LocalReadVectorWidth: -1 + LocalReadVectorWidthA: 32 + LocalReadVectorWidthB: 32 + LocalReadVectorWidthMXSA: 1 + LocalReadVectorWidthMXSB: 1 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LocalWriteUseSgprMXSA: true + LocalWriteUseSgprMXSB: true + LoopIters: 2 + LoopUnroll: 256 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + 
MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMXSA: 1 + MIInputPerThreadMXSB: 1 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [8, 6] + MIWaveTileA: 8 + MIWaveTileB: 6 + MIWaveTileMXSA: 8 + MIWaveTileMXSB: 6 + MIWaveTileMetadata: 0 + MacroTile0: 256 + MacroTile1: 192 + MacroTileA: 256 + MacroTileB: 192 + MacroTileMXSA: 256 + MacroTileMXSB: 192 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + MinGRIncPerMfma: 3 + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonDTLTailLoopMXSA: true + NonDTLTailLoopMXSB: true + NonTemporal: -1 + NonTemporalA: 4 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 4 + NonTemporalE: 0 + NonTemporalMXSA: 0 + NonTemporalMXSB: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 192 + NumGlobalWriteVectorsPerThread: 192 + NumLdsBlk: 2 + NumLoadsA: 8 + NumLoadsB: 6 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsCoalescedMXSA: 1 + NumLoadsCoalescedMXSB: 1 + NumLoadsMXSA: 2 + NumLoadsMXSB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 6 + NumLoadsPerpendicularMXSA: 2 + NumLoadsPerpendicularMXSB: 1 + NumThreads: 256 + NumTotalPackedLoadsA: -1 + NumTotalPackedLoadsB: -1 + NumTotalPackedLoadsMXSA: -1 + NumTotalPackedLoadsMXSB: -1 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGROverBarrier: 0 + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 15 + 
SolutionNameMin: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT256x192x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT8_6_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM16_WGMXCC2_WGMXCCGn1 + SourceSwap: false + SpaceFillingAlgo: [] + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 4 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SubGroupMXSA: 8 + SubGroupMXSB: 32 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: 0 + TailloopInNll: false + ThreadTile: [1, 1] + ThreadTile0: 32 + ThreadTile1: 6 + ThreadTileA: 32 + ThreadTileB: 6 + ThreadTileMXSA: 32 + ThreadTileMXSB: 6 + TransposeLDS: 1 + TransposeLDSMetadata: -1 + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMXSA: true + UnrollMajorLDSMXSB: true + Use64bShadowLimit: false + Use64bShadowLimitMX: false + UseCustomMainLoopSchedule: 0 + UseDirect32XEmulation: false + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: false + UseGeneralizedNLCOneA: false + UseGeneralizedNLCOneB: false + UseGeneralizedNLCOneMXSA: false + UseGeneralizedNLCOneMXSB: false + UseGeneralizedNLCOneMetadata: false + UseInstOffsetForGRO: 0 + UseMFMAF32XEmulation: false + UsePLRPack: 0 + UseSgprForGRO: -1 + UseSubtileImpl: true + Valid: true + VectorStore: -1 + VectorWidthA: 1 + 
VectorWidthB: 1 + VectorWidthMXSA: 1 + VectorWidthMXSB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMXSA: 0 + WaveSeparateGlobalReadMXSB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 16 + WorkGroupMappingXCC: 2 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 256 + _DepthUA: 256 + _DepthUB: 256 + _DepthUMXSA: 8 + _DepthUMXSB: 8 + _DepthUMetadata: 256 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + enableLDSTrMXSA: false + enableLDSTrMXSB: false + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + tailLoopOptMXSA: false + tailLoopOptMXSB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AdaptiveGemm: 0 + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1DivByMT1LowbitGT1: 0 + AssertFree1ElementMultiple: 1 + AssertKRingShiftTailWrapOnly: 0 + AssertSummationElementMultiple: 256 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BAddrInterleave: false + BaseName: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT649Xx58paO0aNamHRyhpgRWxjDG0kBNJ5sAnNiMy2USyI= + BufferLoad: true + BufferStore: 1 + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 256 + DirectToLds: 1 + DirectToLdsA: true + DirectToLdsB: true + DirectToLdsMXSA: true + DirectToLdsMXSB: true + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprMXSA: false + DirectToVgprMXSB: false + DirectToVgprSparseMetadata: false + 
DtlPlusLdsBuf: 0 + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: false + ExpertSchedulingMode: 0 + ExtraLatencyForLR: 0 + ExtraMiLatencyLeft: -1 + ForceDisableShadowInit: false + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 32 + GlobalReadVectorWidthB: 32 + GlobalReadVectorWidthMXSA: 2 + GlobalReadVectorWidthMXSB: 10 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + KRingShift: false + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT64x320x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT2_10_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 256 + LSCB: 256 + LSCMXSA: 8 + LSCMXSB: 8 + LSPA: 32 + LSPB: 32 + LSPMXSA: 64 + LSPMXSB: 320 + LVCA: 8 + LVCB: 8 + LVCMXSA: 4 + LVCMXSB: 1 + LVPA: 1 + LVPB: 1 + LVPMXSA: 32 + LVPMXSB: 32 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsBlockSizePerPadMXSA: 0 + LdsBlockSizePerPadMXSB: 0 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 117760 + LdsInitCVgprs: false + LdsNumBytes: 117760 + 
LdsNumElementsAlignedA: 8192 + LdsNumElementsAlignedB: 40960 + LdsNumElementsAlignedMXSA: 512 + LdsNumElementsAlignedMXSB: 2560 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 11264 + LdsOffsetB_Blk: 76800 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMXSA: 8192 + LdsOffsetMXSA_Blk: 73728 + LdsOffsetMXSB: 8704 + LdsOffsetMXSB_Blk: 74240 + LdsOffsetMetadata: 11264 + LdsOffsetMetadata_Blk: 76800 + LdsPadA: 0 + LdsPadB: 0 + LdsPadMXSA: 0 + LdsPadMXSB: 0 + LdsPadMetadata: 0 + LocalReadVectorWidth: -1 + LocalReadVectorWidthA: 32 + LocalReadVectorWidthB: 32 + LocalReadVectorWidthMXSA: 1 + LocalReadVectorWidthMXSB: 1 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LocalWriteUseSgprMXSA: true + LocalWriteUseSgprMXSB: true + LoopIters: 2 + LoopUnroll: 256 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMXSA: 1 + MIInputPerThreadMXSB: 1 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [2, 10] + MIWaveTileA: 2 + MIWaveTileB: 10 + MIWaveTileMXSA: 2 + MIWaveTileMXSB: 10 + MIWaveTileMetadata: 0 + MacroTile0: 64 + MacroTile1: 320 + MacroTileA: 64 + MacroTileB: 320 + MacroTileMXSA: 64 + MacroTileMXSB: 320 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + MinGRIncPerMfma: 3 + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonDTLTailLoopMXSA: true + NonDTLTailLoopMXSB: true + NonTemporal: -1 + NonTemporalA: 4 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 4 + NonTemporalE: 0 + 
NonTemporalMXSA: 0 + NonTemporalMXSB: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 80 + NumGlobalWriteVectorsPerThread: 80 + NumLdsBlk: 2 + NumLoadsA: 2 + NumLoadsB: 10 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsCoalescedMXSA: 1 + NumLoadsCoalescedMXSB: 1 + NumLoadsMXSA: 1 + NumLoadsMXSB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 10 + NumLoadsPerpendicularMXSA: 1 + NumLoadsPerpendicularMXSB: 1 + NumThreads: 256 + NumTotalPackedLoadsA: -1 + NumTotalPackedLoadsB: -1 + NumTotalPackedLoadsMXSA: -1 + NumTotalPackedLoadsMXSB: -1 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGROverBarrier: 0 + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 16 + SolutionNameMin: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT64x320x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT2_10_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM16_WGMXCC2_WGMXCCGn1 + SourceSwap: false + SpaceFillingAlgo: [] + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 4 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SubGroupMXSA: 
8 + SubGroupMXSB: 32 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: 0 + TailloopInNll: false + ThreadTile: [1, 1] + ThreadTile0: 8 + ThreadTile1: 10 + ThreadTileA: 8 + ThreadTileB: 10 + ThreadTileMXSA: 8 + ThreadTileMXSB: 10 + TransposeLDS: 1 + TransposeLDSMetadata: -1 + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMXSA: true + UnrollMajorLDSMXSB: true + Use64bShadowLimit: false + Use64bShadowLimitMX: false + UseCustomMainLoopSchedule: 0 + UseDirect32XEmulation: false + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: false + UseGeneralizedNLCOneA: false + UseGeneralizedNLCOneB: false + UseGeneralizedNLCOneMXSA: false + UseGeneralizedNLCOneMXSB: false + UseGeneralizedNLCOneMetadata: false + UseInstOffsetForGRO: 0 + UseMFMAF32XEmulation: false + UsePLRPack: 0 + UseSgprForGRO: -1 + UseSubtileImpl: true + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 1 + VectorWidthMXSA: 1 + VectorWidthMXSB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMXSA: 0 + WaveSeparateGlobalReadMXSB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 16 + WorkGroupMappingXCC: 2 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 256 + _DepthUA: 256 + _DepthUB: 256 + _DepthUMXSA: 8 + _DepthUMXSB: 8 + _DepthUMetadata: 256 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + enableLDSTrMXSA: false + enableLDSTrMXSB: false + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + tailLoopOptMXSA: false + tailLoopOptMXSB: false + - 1LDSBuffer: 0 + ActivationAlt: false + 
ActivationFuncCall: false + ActivationFused: true + AdaptiveGemm: 0 + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1DivByMT1LowbitGT1: 0 + AssertFree1ElementMultiple: 1 + AssertKRingShiftTailWrapOnly: 0 + AssertSummationElementMultiple: 256 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BAddrInterleave: false + BaseName: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT32yn2lSZhgGG39-O0tN9XB-c6Nr5kcbdLKPNrKxqSEhVE= + BufferLoad: true + BufferStore: 1 + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 256 + DirectToLds: 1 + DirectToLdsA: true + DirectToLdsB: true + DirectToLdsMXSA: true + DirectToLdsMXSB: true + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprMXSA: false + DirectToVgprMXSB: false + DirectToVgprSparseMetadata: false + DtlPlusLdsBuf: 0 + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: false + ExpertSchedulingMode: 0 + ExtraLatencyForLR: 0 + ExtraMiLatencyLeft: -1 + ForceDisableShadowInit: false + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 32 + GlobalReadVectorWidthB: 32 + GlobalReadVectorWidthMXSA: 10 + GlobalReadVectorWidthMXSB: 2 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + KRingShift: false + Kernel: true + KernelLanguage: Assembly + KernelNameMin: 
Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT320x64x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT10_2_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 256 + LSCB: 256 + LSCMXSA: 8 + LSCMXSB: 8 + LSPA: 32 + LSPB: 32 + LSPMXSA: 320 + LSPMXSB: 64 + LVCA: 8 + LVCB: 8 + LVCMXSA: 1 + LVCMXSB: 4 + LVPA: 1 + LVPB: 1 + LVPMXSA: 32 + LVPMXSB: 32 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsBlockSizePerPadMXSA: 0 + LdsBlockSizePerPadMXSB: 0 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 117760 + LdsInitCVgprs: false + LdsNumBytes: 117760 + LdsNumElementsAlignedA: 40960 + LdsNumElementsAlignedB: 8192 + LdsNumElementsAlignedMXSA: 2560 + LdsNumElementsAlignedMXSB: 512 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 44032 + LdsOffsetB_Blk: 109568 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMXSA: 40960 + LdsOffsetMXSA_Blk: 106496 + LdsOffsetMXSB: 43520 + LdsOffsetMXSB_Blk: 109056 + LdsOffsetMetadata: 44032 + LdsOffsetMetadata_Blk: 109568 + LdsPadA: 0 + LdsPadB: 0 + LdsPadMXSA: 0 + LdsPadMXSB: 0 + LdsPadMetadata: 0 + LocalReadVectorWidth: -1 + LocalReadVectorWidthA: 32 + LocalReadVectorWidthB: 32 + LocalReadVectorWidthMXSA: 1 + LocalReadVectorWidthMXSB: 1 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LocalWriteUseSgprMXSA: true + LocalWriteUseSgprMXSB: true + LoopIters: 2 + LoopUnroll: 256 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + 
MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMXSA: 1 + MIInputPerThreadMXSB: 1 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [10, 2] + MIWaveTileA: 10 + MIWaveTileB: 2 + MIWaveTileMXSA: 10 + MIWaveTileMXSB: 2 + MIWaveTileMetadata: 0 + MacroTile0: 320 + MacroTile1: 64 + MacroTileA: 320 + MacroTileB: 64 + MacroTileMXSA: 320 + MacroTileMXSB: 64 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + MinGRIncPerMfma: 3 + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonDTLTailLoopMXSA: true + NonDTLTailLoopMXSB: true + NonTemporal: -1 + NonTemporalA: 4 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 4 + NonTemporalE: 0 + NonTemporalMXSA: 0 + NonTemporalMXSB: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 80 + NumGlobalWriteVectorsPerThread: 80 + NumLdsBlk: 2 + NumLoadsA: 10 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsCoalescedMXSA: 1 + NumLoadsCoalescedMXSB: 1 + NumLoadsMXSA: 1 + NumLoadsMXSB: 1 + NumLoadsPerpendicularA: 10 + NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularMXSA: 1 + NumLoadsPerpendicularMXSB: 1 + NumThreads: 256 + NumTotalPackedLoadsA: -1 + NumTotalPackedLoadsB: -1 + NumTotalPackedLoadsMXSA: -1 + NumTotalPackedLoadsMXSB: -1 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGROverBarrier: 0 + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 17 + 
SolutionNameMin: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT320x64x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT10_2_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM16_WGMXCC2_WGMXCCGn1 + SourceSwap: false + SpaceFillingAlgo: [] + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 4 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SubGroupMXSA: 8 + SubGroupMXSB: 32 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: 0 + TailloopInNll: false + ThreadTile: [1, 1] + ThreadTile0: 40 + ThreadTile1: 2 + ThreadTileA: 40 + ThreadTileB: 2 + ThreadTileMXSA: 40 + ThreadTileMXSB: 2 + TransposeLDS: 1 + TransposeLDSMetadata: -1 + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMXSA: true + UnrollMajorLDSMXSB: true + Use64bShadowLimit: false + Use64bShadowLimitMX: false + UseCustomMainLoopSchedule: 0 + UseDirect32XEmulation: false + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: false + UseGeneralizedNLCOneA: false + UseGeneralizedNLCOneB: false + UseGeneralizedNLCOneMXSA: false + UseGeneralizedNLCOneMXSB: false + UseGeneralizedNLCOneMetadata: false + UseInstOffsetForGRO: 0 + UseMFMAF32XEmulation: false + UsePLRPack: 0 + UseSgprForGRO: -1 + UseSubtileImpl: true + Valid: true + VectorStore: -1 + VectorWidthA: 1 + 
VectorWidthB: 1 + VectorWidthMXSA: 1 + VectorWidthMXSB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMXSA: 0 + WaveSeparateGlobalReadMXSB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 16 + WorkGroupMappingXCC: 2 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 256 + _DepthUA: 256 + _DepthUB: 256 + _DepthUMXSA: 8 + _DepthUMXSB: 8 + _DepthUMetadata: 256 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + enableLDSTrMXSA: false + enableLDSTrMXSB: false + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + tailLoopOptMXSA: false + tailLoopOptMXSB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AdaptiveGemm: 0 + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1DivByMT1LowbitGT1: 0 + AssertFree1ElementMultiple: 1 + AssertKRingShiftTailWrapOnly: 0 + AssertSummationElementMultiple: 256 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BAddrInterleave: false + BaseName: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT64cx8jpYzIAVvV24Gqg4hoX8QdixxbIDmUlvaleHLuajs= + BufferLoad: true + BufferStore: 1 + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 256 + DirectToLds: 1 + DirectToLdsA: true + DirectToLdsB: true + DirectToLdsMXSA: true + DirectToLdsMXSB: true + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprMXSA: false + DirectToVgprMXSB: false + DirectToVgprSparseMetadata: false + 
DtlPlusLdsBuf: 0 + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: false + ExpertSchedulingMode: 0 + ExtraLatencyForLR: 0 + ExtraMiLatencyLeft: -1 + ForceDisableShadowInit: false + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 32 + GlobalReadVectorWidthB: 32 + GlobalReadVectorWidthMXSA: 2 + GlobalReadVectorWidthMXSB: 12 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + KRingShift: false + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT64x384x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT2_12_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 256 + LSCB: 256 + LSCMXSA: 8 + LSCMXSB: 8 + LSPA: 32 + LSPB: 32 + LSPMXSA: 64 + LSPMXSB: 384 + LVCA: 8 + LVCB: 8 + LVCMXSA: 4 + LVCMXSB: 1 + LVPA: 1 + LVPB: 1 + LVPMXSA: 32 + LVPMXSB: 32 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsBlockSizePerPadMXSA: 0 + LdsBlockSizePerPadMXSB: 0 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 126464 + LdsInitCVgprs: false + LdsNumBytes: 126464 + 
LdsNumElementsAlignedA: 8192 + LdsNumElementsAlignedB: 49152 + LdsNumElementsAlignedMXSA: 512 + LdsNumElementsAlignedMXSB: 3072 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 11776 + LdsOffsetB_Blk: 77312 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMXSA: 8192 + LdsOffsetMXSA_Blk: 73728 + LdsOffsetMXSB: 8704 + LdsOffsetMXSB_Blk: 74240 + LdsOffsetMetadata: 11776 + LdsOffsetMetadata_Blk: 77312 + LdsPadA: 0 + LdsPadB: 0 + LdsPadMXSA: 0 + LdsPadMXSB: 0 + LdsPadMetadata: 0 + LocalReadVectorWidth: -1 + LocalReadVectorWidthA: 32 + LocalReadVectorWidthB: 32 + LocalReadVectorWidthMXSA: 1 + LocalReadVectorWidthMXSB: 1 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LocalWriteUseSgprMXSA: true + LocalWriteUseSgprMXSB: true + LoopIters: 2 + LoopUnroll: 256 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMXSA: 1 + MIInputPerThreadMXSB: 1 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [2, 12] + MIWaveTileA: 2 + MIWaveTileB: 12 + MIWaveTileMXSA: 2 + MIWaveTileMXSB: 12 + MIWaveTileMetadata: 0 + MacroTile0: 64 + MacroTile1: 384 + MacroTileA: 64 + MacroTileB: 384 + MacroTileMXSA: 64 + MacroTileMXSB: 384 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + MinGRIncPerMfma: 3 + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonDTLTailLoopMXSA: true + NonDTLTailLoopMXSB: true + NonTemporal: -1 + NonTemporalA: 4 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 4 + NonTemporalE: 0 + 
NonTemporalMXSA: 0 + NonTemporalMXSB: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 96 + NumGlobalWriteVectorsPerThread: 96 + NumLdsBlk: 2 + NumLoadsA: 2 + NumLoadsB: 12 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsCoalescedMXSA: 1 + NumLoadsCoalescedMXSB: 1 + NumLoadsMXSA: 1 + NumLoadsMXSB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 12 + NumLoadsPerpendicularMXSA: 1 + NumLoadsPerpendicularMXSB: 1 + NumThreads: 256 + NumTotalPackedLoadsA: -1 + NumTotalPackedLoadsB: -1 + NumTotalPackedLoadsMXSA: -1 + NumTotalPackedLoadsMXSB: -1 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGROverBarrier: 0 + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 18 + SolutionNameMin: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT64x384x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT2_12_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM16_WGMXCC2_WGMXCCGn1 + SourceSwap: false + SpaceFillingAlgo: [] + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 4 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SubGroupMXSA: 
8 + SubGroupMXSB: 32 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: 0 + TailloopInNll: false + ThreadTile: [1, 1] + ThreadTile0: 8 + ThreadTile1: 12 + ThreadTileA: 8 + ThreadTileB: 12 + ThreadTileMXSA: 8 + ThreadTileMXSB: 12 + TransposeLDS: 1 + TransposeLDSMetadata: -1 + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMXSA: true + UnrollMajorLDSMXSB: true + Use64bShadowLimit: false + Use64bShadowLimitMX: false + UseCustomMainLoopSchedule: 0 + UseDirect32XEmulation: false + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: false + UseGeneralizedNLCOneA: false + UseGeneralizedNLCOneB: false + UseGeneralizedNLCOneMXSA: false + UseGeneralizedNLCOneMXSB: false + UseGeneralizedNLCOneMetadata: false + UseInstOffsetForGRO: 0 + UseMFMAF32XEmulation: false + UsePLRPack: 0 + UseSgprForGRO: -1 + UseSubtileImpl: true + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 1 + VectorWidthMXSA: 1 + VectorWidthMXSB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMXSA: 0 + WaveSeparateGlobalReadMXSB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 16 + WorkGroupMappingXCC: 2 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 256 + _DepthUA: 256 + _DepthUB: 256 + _DepthUMXSA: 8 + _DepthUMXSB: 8 + _DepthUMetadata: 256 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + enableLDSTrMXSA: false + enableLDSTrMXSB: false + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + tailLoopOptMXSA: false + tailLoopOptMXSB: false + - 1LDSBuffer: 0 + ActivationAlt: false + 
ActivationFuncCall: false + ActivationFused: true + AdaptiveGemm: 0 + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1DivByMT1LowbitGT1: 0 + AssertFree1ElementMultiple: 1 + AssertKRingShiftTailWrapOnly: 0 + AssertSummationElementMultiple: 256 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BAddrInterleave: false + BaseName: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT38m0Bm4hH2y_fb6-eo4XZRKb5dkkBhjv6q0Rd-CAzTq-Q= + BufferLoad: true + BufferStore: 1 + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 256 + DirectToLds: 1 + DirectToLdsA: true + DirectToLdsB: true + DirectToLdsMXSA: true + DirectToLdsMXSB: true + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprMXSA: false + DirectToVgprMXSB: false + DirectToVgprSparseMetadata: false + DtlPlusLdsBuf: 0 + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: false + ExpertSchedulingMode: 0 + ExtraLatencyForLR: 0 + ExtraMiLatencyLeft: -1 + ForceDisableShadowInit: false + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 32 + GlobalReadVectorWidthB: 32 + GlobalReadVectorWidthMXSA: 12 + GlobalReadVectorWidthMXSB: 2 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + KRingShift: false + Kernel: true + KernelLanguage: Assembly + KernelNameMin: 
Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT384x64x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT12_2_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 256 + LSCB: 256 + LSCMXSA: 8 + LSCMXSB: 8 + LSPA: 32 + LSPB: 32 + LSPMXSA: 384 + LSPMXSB: 64 + LVCA: 8 + LVCB: 8 + LVCMXSA: 1 + LVCMXSB: 4 + LVPA: 1 + LVPB: 1 + LVPMXSA: 32 + LVPMXSB: 32 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsBlockSizePerPadMXSA: 0 + LdsBlockSizePerPadMXSB: 0 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 126464 + LdsInitCVgprs: false + LdsNumBytes: 126464 + LdsNumElementsAlignedA: 49152 + LdsNumElementsAlignedB: 8192 + LdsNumElementsAlignedMXSA: 3072 + LdsNumElementsAlignedMXSB: 512 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 52736 + LdsOffsetB_Blk: 118272 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMXSA: 49152 + LdsOffsetMXSA_Blk: 114688 + LdsOffsetMXSB: 52224 + LdsOffsetMXSB_Blk: 117760 + LdsOffsetMetadata: 52736 + LdsOffsetMetadata_Blk: 118272 + LdsPadA: 0 + LdsPadB: 0 + LdsPadMXSA: 0 + LdsPadMXSB: 0 + LdsPadMetadata: 0 + LocalReadVectorWidth: -1 + LocalReadVectorWidthA: 32 + LocalReadVectorWidthB: 32 + LocalReadVectorWidthMXSA: 1 + LocalReadVectorWidthMXSB: 1 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LocalWriteUseSgprMXSA: true + LocalWriteUseSgprMXSB: true + LoopIters: 2 + LoopUnroll: 256 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + 
MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMXSA: 1 + MIInputPerThreadMXSB: 1 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [12, 2] + MIWaveTileA: 12 + MIWaveTileB: 2 + MIWaveTileMXSA: 12 + MIWaveTileMXSB: 2 + MIWaveTileMetadata: 0 + MacroTile0: 384 + MacroTile1: 64 + MacroTileA: 384 + MacroTileB: 64 + MacroTileMXSA: 384 + MacroTileMXSB: 64 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + MinGRIncPerMfma: 3 + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonDTLTailLoopMXSA: true + NonDTLTailLoopMXSB: true + NonTemporal: -1 + NonTemporalA: 4 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 4 + NonTemporalE: 0 + NonTemporalMXSA: 0 + NonTemporalMXSB: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 96 + NumGlobalWriteVectorsPerThread: 96 + NumLdsBlk: 2 + NumLoadsA: 12 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsCoalescedMXSA: 1 + NumLoadsCoalescedMXSB: 1 + NumLoadsMXSA: 1 + NumLoadsMXSB: 1 + NumLoadsPerpendicularA: 12 + NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularMXSA: 1 + NumLoadsPerpendicularMXSB: 1 + NumThreads: 256 + NumTotalPackedLoadsA: -1 + NumTotalPackedLoadsB: -1 + NumTotalPackedLoadsMXSA: -1 + NumTotalPackedLoadsMXSB: -1 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGROverBarrier: 0 + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 19 + 
SolutionNameMin: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT384x64x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT12_2_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM16_WGMXCC2_WGMXCCGn1 + SourceSwap: false + SpaceFillingAlgo: [] + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 4 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SubGroupMXSA: 8 + SubGroupMXSB: 32 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: 0 + TailloopInNll: false + ThreadTile: [1, 1] + ThreadTile0: 48 + ThreadTile1: 2 + ThreadTileA: 48 + ThreadTileB: 2 + ThreadTileMXSA: 48 + ThreadTileMXSB: 2 + TransposeLDS: 1 + TransposeLDSMetadata: -1 + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMXSA: true + UnrollMajorLDSMXSB: true + Use64bShadowLimit: false + Use64bShadowLimitMX: false + UseCustomMainLoopSchedule: 0 + UseDirect32XEmulation: false + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: false + UseGeneralizedNLCOneA: false + UseGeneralizedNLCOneB: false + UseGeneralizedNLCOneMXSA: false + UseGeneralizedNLCOneMXSB: false + UseGeneralizedNLCOneMetadata: false + UseInstOffsetForGRO: 0 + UseMFMAF32XEmulation: false + UsePLRPack: 0 + UseSgprForGRO: -1 + UseSubtileImpl: true + Valid: true + VectorStore: -1 + VectorWidthA: 1 + 
VectorWidthB: 1 + VectorWidthMXSA: 1 + VectorWidthMXSB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMXSA: 0 + WaveSeparateGlobalReadMXSB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 16 + WorkGroupMappingXCC: 2 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 256 + _DepthUA: 256 + _DepthUB: 256 + _DepthUMXSA: 8 + _DepthUMXSB: 8 + _DepthUMetadata: 256 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + enableLDSTrMXSA: false + enableLDSTrMXSB: false + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + tailLoopOptMXSA: false + tailLoopOptMXSB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AdaptiveGemm: 0 + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1DivByMT1LowbitGT1: 0 + AssertFree1ElementMultiple: 1 + AssertKRingShiftTailWrapOnly: 0 + AssertSummationElementMultiple: 256 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BAddrInterleave: false + BaseName: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT64ghBzLJyXgx9oTwgzYUUty2EUDAM9Q1qjBKNRjSDH1Qg= + BufferLoad: true + BufferStore: 1 + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 256 + DirectToLds: 1 + DirectToLdsA: true + DirectToLdsB: true + DirectToLdsMXSA: true + DirectToLdsMXSB: true + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprMXSA: false + DirectToVgprMXSB: false + DirectToVgprSparseMetadata: false + 
DtlPlusLdsBuf: 0 + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: false + ExpertSchedulingMode: 0 + ExtraLatencyForLR: 0 + ExtraMiLatencyLeft: -1 + ForceDisableShadowInit: false + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 32 + GlobalReadVectorWidthB: 32 + GlobalReadVectorWidthMXSA: 2 + GlobalReadVectorWidthMXSB: 16 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + KRingShift: false + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT64x512x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT2_16_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 256 + LSCB: 256 + LSCMXSA: 8 + LSCMXSB: 8 + LSPA: 32 + LSPB: 32 + LSPMXSA: 64 + LSPMXSB: 512 + LVCA: 8 + LVCB: 8 + LVCMXSA: 4 + LVCMXSB: 1 + LVPA: 1 + LVPB: 1 + LVPMXSA: 32 + LVPMXSB: 32 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsBlockSizePerPadMXSA: 0 + LdsBlockSizePerPadMXSB: 0 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 156672 + LdsInitCVgprs: false + LdsNumBytes: 156672 + 
LdsNumElementsAlignedA: 8192 + LdsNumElementsAlignedB: 65536 + LdsNumElementsAlignedMXSA: 512 + LdsNumElementsAlignedMXSB: 4096 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 78336 + LdsOffsetB: 12800 + LdsOffsetB_Blk: 91136 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMXSA: 8192 + LdsOffsetMXSA_Blk: 86528 + LdsOffsetMXSB: 8704 + LdsOffsetMXSB_Blk: 87040 + LdsOffsetMetadata: 12800 + LdsOffsetMetadata_Blk: 91136 + LdsPadA: 0 + LdsPadB: 0 + LdsPadMXSA: 0 + LdsPadMXSB: 0 + LdsPadMetadata: 0 + LocalReadVectorWidth: -1 + LocalReadVectorWidthA: 32 + LocalReadVectorWidthB: 32 + LocalReadVectorWidthMXSA: 1 + LocalReadVectorWidthMXSB: 1 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LocalWriteUseSgprMXSA: true + LocalWriteUseSgprMXSB: true + LoopIters: 2 + LoopUnroll: 256 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMXSA: 1 + MIInputPerThreadMXSB: 1 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [2, 16] + MIWaveTileA: 2 + MIWaveTileB: 16 + MIWaveTileMXSA: 2 + MIWaveTileMXSB: 16 + MIWaveTileMetadata: 0 + MacroTile0: 64 + MacroTile1: 512 + MacroTileA: 64 + MacroTileB: 512 + MacroTileMXSA: 64 + MacroTileMXSB: 512 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + MinGRIncPerMfma: 3 + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonDTLTailLoopMXSA: true + NonDTLTailLoopMXSB: true + NonTemporal: -1 + NonTemporalA: 4 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 4 + NonTemporalE: 0 + 
NonTemporalMXSA: 0 + NonTemporalMXSB: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 128 + NumLdsBlk: 2 + NumLoadsA: 2 + NumLoadsB: 16 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsCoalescedMXSA: 1 + NumLoadsCoalescedMXSB: 1 + NumLoadsMXSA: 1 + NumLoadsMXSB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 16 + NumLoadsPerpendicularMXSA: 1 + NumLoadsPerpendicularMXSB: 1 + NumThreads: 256 + NumTotalPackedLoadsA: -1 + NumTotalPackedLoadsB: -1 + NumTotalPackedLoadsMXSA: -1 + NumTotalPackedLoadsMXSB: -1 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGROverBarrier: 0 + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 20 + SolutionNameMin: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT64x512x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT2_16_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM16_WGMXCC2_WGMXCCGn1 + SourceSwap: false + SpaceFillingAlgo: [] + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: true + StoreSyncOpt: 0 + StoreVectorWidth: 4 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + 
SubGroupMXSA: 8 + SubGroupMXSB: 32 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: 0 + TailloopInNll: false + ThreadTile: [1, 1] + ThreadTile0: 8 + ThreadTile1: 16 + ThreadTileA: 8 + ThreadTileB: 16 + ThreadTileMXSA: 8 + ThreadTileMXSB: 16 + TransposeLDS: 1 + TransposeLDSMetadata: -1 + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMXSA: true + UnrollMajorLDSMXSB: true + Use64bShadowLimit: false + Use64bShadowLimitMX: false + UseCustomMainLoopSchedule: 0 + UseDirect32XEmulation: false + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: false + UseGeneralizedNLCOneA: false + UseGeneralizedNLCOneB: false + UseGeneralizedNLCOneMXSA: false + UseGeneralizedNLCOneMXSB: false + UseGeneralizedNLCOneMetadata: false + UseInstOffsetForGRO: 0 + UseMFMAF32XEmulation: false + UsePLRPack: 0 + UseSgprForGRO: -1 + UseSubtileImpl: true + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 1 + VectorWidthMXSA: 1 + VectorWidthMXSB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMXSA: 0 + WaveSeparateGlobalReadMXSB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 16 + WorkGroupMappingXCC: 2 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 256 + _DepthUA: 256 + _DepthUB: 256 + _DepthUMXSA: 8 + _DepthUMXSB: 8 + _DepthUMetadata: 256 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + enableLDSTrMXSA: false + enableLDSTrMXSB: false + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + tailLoopOptMXSA: false + tailLoopOptMXSB: false + - 1LDSBuffer: 0 + 
ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AdaptiveGemm: 0 + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1DivByMT1LowbitGT1: 0 + AssertFree1ElementMultiple: 1 + AssertKRingShiftTailWrapOnly: 0 + AssertSummationElementMultiple: 256 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BAddrInterleave: false + BaseName: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT51N7xP8nenZXqX1PoWvPtuxa4rTpP69DwZBgRtHN8axrQ= + BufferLoad: true + BufferStore: 1 + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 256 + DirectToLds: 1 + DirectToLdsA: true + DirectToLdsB: true + DirectToLdsMXSA: true + DirectToLdsMXSB: true + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprMXSA: false + DirectToVgprMXSB: false + DirectToVgprSparseMetadata: false + DtlPlusLdsBuf: 0 + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: false + ExpertSchedulingMode: 0 + ExtraLatencyForLR: 0 + ExtraMiLatencyLeft: -1 + ForceDisableShadowInit: false + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 32 + GlobalReadVectorWidthB: 32 + GlobalReadVectorWidthMXSA: 16 + GlobalReadVectorWidthMXSB: 2 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + KRingShift: false + Kernel: true + KernelLanguage: Assembly + KernelNameMin: 
Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT512x64x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT16_2_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 256 + LSCB: 256 + LSCMXSA: 8 + LSCMXSB: 8 + LSPA: 32 + LSPB: 32 + LSPMXSA: 512 + LSPMXSB: 64 + LVCA: 8 + LVCB: 8 + LVCMXSA: 1 + LVCMXSB: 4 + LVPA: 1 + LVPB: 1 + LVPMXSA: 32 + LVPMXSB: 32 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsBlockSizePerPadMXSA: 0 + LdsBlockSizePerPadMXSB: 0 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 156672 + LdsInitCVgprs: false + LdsNumBytes: 156672 + LdsNumElementsAlignedA: 65536 + LdsNumElementsAlignedB: 8192 + LdsNumElementsAlignedMXSA: 4096 + LdsNumElementsAlignedMXSB: 512 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 78336 + LdsOffsetB: 70144 + LdsOffsetB_Blk: 148480 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMXSA: 65536 + LdsOffsetMXSA_Blk: 143872 + LdsOffsetMXSB: 69632 + LdsOffsetMXSB_Blk: 147968 + LdsOffsetMetadata: 70144 + LdsOffsetMetadata_Blk: 148480 + LdsPadA: 0 + LdsPadB: 0 + LdsPadMXSA: 0 + LdsPadMXSB: 0 + LdsPadMetadata: 0 + LocalReadVectorWidth: -1 + LocalReadVectorWidthA: 32 + LocalReadVectorWidthB: 32 + LocalReadVectorWidthMXSA: 1 + LocalReadVectorWidthMXSB: 1 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LocalWriteUseSgprMXSA: true + LocalWriteUseSgprMXSB: true + LoopIters: 2 + LoopUnroll: 256 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + 
MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMXSA: 1 + MIInputPerThreadMXSB: 1 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [16, 2] + MIWaveTileA: 16 + MIWaveTileB: 2 + MIWaveTileMXSA: 16 + MIWaveTileMXSB: 2 + MIWaveTileMetadata: 0 + MacroTile0: 512 + MacroTile1: 64 + MacroTileA: 512 + MacroTileB: 64 + MacroTileMXSA: 512 + MacroTileMXSB: 64 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + MinGRIncPerMfma: 3 + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonDTLTailLoopMXSA: true + NonDTLTailLoopMXSB: true + NonTemporal: -1 + NonTemporalA: 4 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 4 + NonTemporalE: 0 + NonTemporalMXSA: 0 + NonTemporalMXSB: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 128 + NumLdsBlk: 2 + NumLoadsA: 16 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsCoalescedMXSA: 1 + NumLoadsCoalescedMXSB: 1 + NumLoadsMXSA: 1 + NumLoadsMXSB: 1 + NumLoadsPerpendicularA: 16 + NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularMXSA: 1 + NumLoadsPerpendicularMXSB: 1 + NumThreads: 256 + NumTotalPackedLoadsA: -1 + NumTotalPackedLoadsB: -1 + NumTotalPackedLoadsMXSA: -1 + NumTotalPackedLoadsMXSB: -1 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGROverBarrier: 0 + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 21 + 
SolutionNameMin: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT512x64x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT16_2_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM16_WGMXCC2_WGMXCCGn1 + SourceSwap: false + SpaceFillingAlgo: [] + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: true + StoreSyncOpt: 0 + StoreVectorWidth: 4 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SubGroupMXSA: 8 + SubGroupMXSB: 32 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: 0 + TailloopInNll: false + ThreadTile: [1, 1] + ThreadTile0: 64 + ThreadTile1: 2 + ThreadTileA: 64 + ThreadTileB: 2 + ThreadTileMXSA: 64 + ThreadTileMXSB: 2 + TransposeLDS: 1 + TransposeLDSMetadata: -1 + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMXSA: true + UnrollMajorLDSMXSB: true + Use64bShadowLimit: false + Use64bShadowLimitMX: false + UseCustomMainLoopSchedule: 0 + UseDirect32XEmulation: false + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: false + UseGeneralizedNLCOneA: false + UseGeneralizedNLCOneB: false + UseGeneralizedNLCOneMXSA: false + UseGeneralizedNLCOneMXSB: false + UseGeneralizedNLCOneMetadata: false + UseInstOffsetForGRO: 0 + UseMFMAF32XEmulation: false + UsePLRPack: 0 + UseSgprForGRO: -1 + UseSubtileImpl: true + Valid: true + VectorStore: -1 + VectorWidthA: 1 + 
VectorWidthB: 1 + VectorWidthMXSA: 1 + VectorWidthMXSB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMXSA: 0 + WaveSeparateGlobalReadMXSB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 16 + WorkGroupMappingXCC: 2 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 256 + _DepthUA: 256 + _DepthUB: 256 + _DepthUMXSA: 8 + _DepthUMXSB: 8 + _DepthUMetadata: 256 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + enableLDSTrMXSA: false + enableLDSTrMXSB: false + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + tailLoopOptMXSA: false + tailLoopOptMXSB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AdaptiveGemm: 0 + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1DivByMT1LowbitGT1: 0 + AssertFree1ElementMultiple: 1 + AssertKRingShiftTailWrapOnly: 0 + AssertSummationElementMultiple: 256 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BAddrInterleave: false + BaseName: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT12Tzx41lFmgkHO5_GDlXlBjYPnuHfWnU6iLujoAD27WHo= + BufferLoad: true + BufferStore: 1 + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 256 + DirectToLds: 1 + DirectToLdsA: true + DirectToLdsB: true + DirectToLdsMXSA: true + DirectToLdsMXSB: true + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprMXSA: false + DirectToVgprMXSB: false + DirectToVgprSparseMetadata: false + 
DtlPlusLdsBuf: 0 + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: false + ExpertSchedulingMode: 0 + ExtraLatencyForLR: 0 + ExtraMiLatencyLeft: -1 + ForceDisableShadowInit: false + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 32 + GlobalReadVectorWidthB: 32 + GlobalReadVectorWidthMXSA: 4 + GlobalReadVectorWidthMXSB: 10 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + KRingShift: false + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT128x320x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT4_10_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 256 + LSCB: 256 + LSCMXSA: 8 + LSCMXSB: 8 + LSPA: 32 + LSPB: 32 + LSPMXSA: 128 + LSPMXSB: 320 + LVCA: 8 + LVCB: 8 + LVCMXSA: 2 + LVCMXSB: 1 + LVPA: 1 + LVPB: 1 + LVPMXSA: 32 + LVPMXSB: 32 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsBlockSizePerPadMXSA: 0 + LdsBlockSizePerPadMXSB: 0 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 126464 + LdsInitCVgprs: false + LdsNumBytes: 126464 + 
LdsNumElementsAlignedA: 16384 + LdsNumElementsAlignedB: 40960 + LdsNumElementsAlignedMXSA: 1024 + LdsNumElementsAlignedMXSB: 2560 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 19968 + LdsOffsetB_Blk: 85504 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMXSA: 16384 + LdsOffsetMXSA_Blk: 81920 + LdsOffsetMXSB: 17408 + LdsOffsetMXSB_Blk: 82944 + LdsOffsetMetadata: 19968 + LdsOffsetMetadata_Blk: 85504 + LdsPadA: 0 + LdsPadB: 0 + LdsPadMXSA: 0 + LdsPadMXSB: 0 + LdsPadMetadata: 0 + LocalReadVectorWidth: -1 + LocalReadVectorWidthA: 32 + LocalReadVectorWidthB: 32 + LocalReadVectorWidthMXSA: 1 + LocalReadVectorWidthMXSB: 1 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LocalWriteUseSgprMXSA: true + LocalWriteUseSgprMXSB: true + LoopIters: 2 + LoopUnroll: 256 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMXSA: 1 + MIInputPerThreadMXSB: 1 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 10] + MIWaveTileA: 4 + MIWaveTileB: 10 + MIWaveTileMXSA: 4 + MIWaveTileMXSB: 10 + MIWaveTileMetadata: 0 + MacroTile0: 128 + MacroTile1: 320 + MacroTileA: 128 + MacroTileB: 320 + MacroTileMXSA: 128 + MacroTileMXSB: 320 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + MinGRIncPerMfma: 3 + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonDTLTailLoopMXSA: true + NonDTLTailLoopMXSB: true + NonTemporal: -1 + NonTemporalA: 4 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 4 + 
NonTemporalE: 0 + NonTemporalMXSA: 0 + NonTemporalMXSB: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 160 + NumGlobalWriteVectorsPerThread: 160 + NumLdsBlk: 2 + NumLoadsA: 4 + NumLoadsB: 10 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsCoalescedMXSA: 1 + NumLoadsCoalescedMXSB: 1 + NumLoadsMXSA: 1 + NumLoadsMXSB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 10 + NumLoadsPerpendicularMXSA: 1 + NumLoadsPerpendicularMXSB: 1 + NumThreads: 256 + NumTotalPackedLoadsA: -1 + NumTotalPackedLoadsB: -1 + NumTotalPackedLoadsMXSA: -1 + NumTotalPackedLoadsMXSB: -1 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGROverBarrier: 0 + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 22 + SolutionNameMin: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT128x320x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT4_10_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM16_WGMXCC2_WGMXCCGn1 + SourceSwap: false + SpaceFillingAlgo: [] + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 4 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + 
SubGroupB: 32 + SubGroupMXSA: 8 + SubGroupMXSB: 32 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: 0 + TailloopInNll: false + ThreadTile: [1, 1] + ThreadTile0: 16 + ThreadTile1: 10 + ThreadTileA: 16 + ThreadTileB: 10 + ThreadTileMXSA: 16 + ThreadTileMXSB: 10 + TransposeLDS: 1 + TransposeLDSMetadata: -1 + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMXSA: true + UnrollMajorLDSMXSB: true + Use64bShadowLimit: false + Use64bShadowLimitMX: false + UseCustomMainLoopSchedule: 0 + UseDirect32XEmulation: false + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: false + UseGeneralizedNLCOneA: false + UseGeneralizedNLCOneB: false + UseGeneralizedNLCOneMXSA: false + UseGeneralizedNLCOneMXSB: false + UseGeneralizedNLCOneMetadata: false + UseInstOffsetForGRO: 0 + UseMFMAF32XEmulation: false + UsePLRPack: 0 + UseSgprForGRO: -1 + UseSubtileImpl: true + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 1 + VectorWidthMXSA: 1 + VectorWidthMXSB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMXSA: 0 + WaveSeparateGlobalReadMXSB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 16 + WorkGroupMappingXCC: 2 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 256 + _DepthUA: 256 + _DepthUB: 256 + _DepthUMXSA: 8 + _DepthUMXSB: 8 + _DepthUMetadata: 256 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + enableLDSTrMXSA: false + enableLDSTrMXSB: false + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + tailLoopOptMXSA: false + tailLoopOptMXSB: false + - 
1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AdaptiveGemm: 0 + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1DivByMT1LowbitGT1: 0 + AssertFree1ElementMultiple: 1 + AssertKRingShiftTailWrapOnly: 0 + AssertSummationElementMultiple: 256 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BAddrInterleave: false + BaseName: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT329TK1O9xqAG2cUdkJ_6Jkg-kbMKNtFWKGnhC0gVHbf3c= + BufferLoad: true + BufferStore: 1 + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 256 + DirectToLds: 1 + DirectToLdsA: true + DirectToLdsB: true + DirectToLdsMXSA: true + DirectToLdsMXSB: true + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprMXSA: false + DirectToVgprMXSB: false + DirectToVgprSparseMetadata: false + DtlPlusLdsBuf: 0 + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: false + ExpertSchedulingMode: 0 + ExtraLatencyForLR: 0 + ExtraMiLatencyLeft: -1 + ForceDisableShadowInit: false + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 32 + GlobalReadVectorWidthB: 32 + GlobalReadVectorWidthMXSA: 10 + GlobalReadVectorWidthMXSB: 4 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + KRingShift: false + Kernel: true + KernelLanguage: Assembly + KernelNameMin: 
Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT320x128x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT10_4_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 256 + LSCB: 256 + LSCMXSA: 8 + LSCMXSB: 8 + LSPA: 32 + LSPB: 32 + LSPMXSA: 320 + LSPMXSB: 128 + LVCA: 8 + LVCB: 8 + LVCMXSA: 1 + LVCMXSB: 2 + LVPA: 1 + LVPB: 1 + LVPMXSA: 32 + LVPMXSB: 32 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsBlockSizePerPadMXSA: 0 + LdsBlockSizePerPadMXSB: 0 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 126464 + LdsInitCVgprs: false + LdsNumBytes: 126464 + LdsNumElementsAlignedA: 40960 + LdsNumElementsAlignedB: 16384 + LdsNumElementsAlignedMXSA: 2560 + LdsNumElementsAlignedMXSB: 1024 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 44544 + LdsOffsetB_Blk: 110080 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMXSA: 40960 + LdsOffsetMXSA_Blk: 106496 + LdsOffsetMXSB: 43520 + LdsOffsetMXSB_Blk: 109056 + LdsOffsetMetadata: 44544 + LdsOffsetMetadata_Blk: 110080 + LdsPadA: 0 + LdsPadB: 0 + LdsPadMXSA: 0 + LdsPadMXSB: 0 + LdsPadMetadata: 0 + LocalReadVectorWidth: -1 + LocalReadVectorWidthA: 32 + LocalReadVectorWidthB: 32 + LocalReadVectorWidthMXSA: 1 + LocalReadVectorWidthMXSB: 1 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LocalWriteUseSgprMXSA: true + LocalWriteUseSgprMXSB: true + LoopIters: 2 + LoopUnroll: 256 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + 
MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMXSA: 1 + MIInputPerThreadMXSB: 1 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [10, 4] + MIWaveTileA: 10 + MIWaveTileB: 4 + MIWaveTileMXSA: 10 + MIWaveTileMXSB: 4 + MIWaveTileMetadata: 0 + MacroTile0: 320 + MacroTile1: 128 + MacroTileA: 320 + MacroTileB: 128 + MacroTileMXSA: 320 + MacroTileMXSB: 128 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + MinGRIncPerMfma: 3 + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonDTLTailLoopMXSA: true + NonDTLTailLoopMXSB: true + NonTemporal: -1 + NonTemporalA: 4 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 4 + NonTemporalE: 0 + NonTemporalMXSA: 0 + NonTemporalMXSB: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 160 + NumGlobalWriteVectorsPerThread: 160 + NumLdsBlk: 2 + NumLoadsA: 10 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsCoalescedMXSA: 1 + NumLoadsCoalescedMXSB: 1 + NumLoadsMXSA: 1 + NumLoadsMXSB: 1 + NumLoadsPerpendicularA: 10 + NumLoadsPerpendicularB: 4 + NumLoadsPerpendicularMXSA: 1 + NumLoadsPerpendicularMXSB: 1 + NumThreads: 256 + NumTotalPackedLoadsA: -1 + NumTotalPackedLoadsB: -1 + NumTotalPackedLoadsMXSA: -1 + NumTotalPackedLoadsMXSB: -1 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGROverBarrier: 0 + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 23 + 
SolutionNameMin: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT320x128x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT10_4_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM16_WGMXCC2_WGMXCCGn1 + SourceSwap: false + SpaceFillingAlgo: [] + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 4 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SubGroupMXSA: 8 + SubGroupMXSB: 32 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: 0 + TailloopInNll: false + ThreadTile: [1, 1] + ThreadTile0: 40 + ThreadTile1: 4 + ThreadTileA: 40 + ThreadTileB: 4 + ThreadTileMXSA: 40 + ThreadTileMXSB: 4 + TransposeLDS: 1 + TransposeLDSMetadata: -1 + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMXSA: true + UnrollMajorLDSMXSB: true + Use64bShadowLimit: false + Use64bShadowLimitMX: false + UseCustomMainLoopSchedule: 0 + UseDirect32XEmulation: false + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: false + UseGeneralizedNLCOneA: false + UseGeneralizedNLCOneB: false + UseGeneralizedNLCOneMXSA: false + UseGeneralizedNLCOneMXSB: false + UseGeneralizedNLCOneMetadata: false + UseInstOffsetForGRO: 0 + UseMFMAF32XEmulation: false + UsePLRPack: 0 + UseSgprForGRO: -1 + UseSubtileImpl: true + Valid: true + VectorStore: -1 + VectorWidthA: 1 + 
VectorWidthB: 1 + VectorWidthMXSA: 1 + VectorWidthMXSB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMXSA: 0 + WaveSeparateGlobalReadMXSB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 16 + WorkGroupMappingXCC: 2 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 256 + _DepthUA: 256 + _DepthUB: 256 + _DepthUMXSA: 8 + _DepthUMXSB: 8 + _DepthUMetadata: 256 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + enableLDSTrMXSA: false + enableLDSTrMXSB: false + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + tailLoopOptMXSA: false + tailLoopOptMXSB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AdaptiveGemm: 0 + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1DivByMT1LowbitGT1: 0 + AssertFree1ElementMultiple: 1 + AssertKRingShiftTailWrapOnly: 0 + AssertSummationElementMultiple: 256 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BAddrInterleave: false + BaseName: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT12Ko14x9Bm17skxIK2J-9eSMhxZf1WQTws5W6lZ4ij_RU= + BufferLoad: true + BufferStore: 1 + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 256 + DirectToLds: 1 + DirectToLdsA: true + DirectToLdsB: true + DirectToLdsMXSA: true + DirectToLdsMXSB: true + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprMXSA: false + DirectToVgprMXSB: false + DirectToVgprSparseMetadata: false + 
DtlPlusLdsBuf: 0 + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: false + ExpertSchedulingMode: 0 + ExtraLatencyForLR: 0 + ExtraMiLatencyLeft: -1 + ForceDisableShadowInit: false + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 32 + GlobalReadVectorWidthB: 32 + GlobalReadVectorWidthMXSA: 4 + GlobalReadVectorWidthMXSB: 12 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + KRingShift: false + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT128x384x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT4_12_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 256 + LSCB: 256 + LSCMXSA: 8 + LSCMXSB: 8 + LSPA: 32 + LSPB: 32 + LSPMXSA: 128 + LSPMXSB: 384 + LVCA: 8 + LVCB: 8 + LVCMXSA: 2 + LVCMXSB: 1 + LVPA: 1 + LVPB: 1 + LVPMXSA: 32 + LVPMXSB: 32 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsBlockSizePerPadMXSA: 0 + LdsBlockSizePerPadMXSB: 0 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 139264 + LdsInitCVgprs: false + LdsNumBytes: 139264 + 
LdsNumElementsAlignedA: 16384 + LdsNumElementsAlignedB: 49152 + LdsNumElementsAlignedMXSA: 1024 + LdsNumElementsAlignedMXSB: 3072 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 69632 + LdsOffsetB: 20480 + LdsOffsetB_Blk: 90112 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMXSA: 16384 + LdsOffsetMXSA_Blk: 86016 + LdsOffsetMXSB: 17408 + LdsOffsetMXSB_Blk: 87040 + LdsOffsetMetadata: 20480 + LdsOffsetMetadata_Blk: 90112 + LdsPadA: 0 + LdsPadB: 0 + LdsPadMXSA: 0 + LdsPadMXSB: 0 + LdsPadMetadata: 0 + LocalReadVectorWidth: -1 + LocalReadVectorWidthA: 32 + LocalReadVectorWidthB: 32 + LocalReadVectorWidthMXSA: 1 + LocalReadVectorWidthMXSB: 1 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LocalWriteUseSgprMXSA: true + LocalWriteUseSgprMXSB: true + LoopIters: 2 + LoopUnroll: 256 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMXSA: 1 + MIInputPerThreadMXSB: 1 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 12] + MIWaveTileA: 4 + MIWaveTileB: 12 + MIWaveTileMXSA: 4 + MIWaveTileMXSB: 12 + MIWaveTileMetadata: 0 + MacroTile0: 128 + MacroTile1: 384 + MacroTileA: 128 + MacroTileB: 384 + MacroTileMXSA: 128 + MacroTileMXSB: 384 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + MinGRIncPerMfma: 3 + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonDTLTailLoopMXSA: true + NonDTLTailLoopMXSB: true + NonTemporal: -1 + NonTemporalA: 4 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 4 + 
NonTemporalE: 0 + NonTemporalMXSA: 0 + NonTemporalMXSB: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 192 + NumGlobalWriteVectorsPerThread: 192 + NumLdsBlk: 2 + NumLoadsA: 4 + NumLoadsB: 12 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsCoalescedMXSA: 1 + NumLoadsCoalescedMXSB: 1 + NumLoadsMXSA: 1 + NumLoadsMXSB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 12 + NumLoadsPerpendicularMXSA: 1 + NumLoadsPerpendicularMXSB: 1 + NumThreads: 256 + NumTotalPackedLoadsA: -1 + NumTotalPackedLoadsB: -1 + NumTotalPackedLoadsMXSA: -1 + NumTotalPackedLoadsMXSB: -1 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGROverBarrier: 0 + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 24 + SolutionNameMin: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT128x384x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT4_12_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM16_WGMXCC2_WGMXCCGn1 + SourceSwap: false + SpaceFillingAlgo: [] + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: true + StoreSyncOpt: 0 + StoreVectorWidth: 4 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + 
SubGroupB: 32 + SubGroupMXSA: 8 + SubGroupMXSB: 32 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: 0 + TailloopInNll: false + ThreadTile: [1, 1] + ThreadTile0: 16 + ThreadTile1: 12 + ThreadTileA: 16 + ThreadTileB: 12 + ThreadTileMXSA: 16 + ThreadTileMXSB: 12 + TransposeLDS: 1 + TransposeLDSMetadata: -1 + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMXSA: true + UnrollMajorLDSMXSB: true + Use64bShadowLimit: false + Use64bShadowLimitMX: false + UseCustomMainLoopSchedule: 0 + UseDirect32XEmulation: false + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: false + UseGeneralizedNLCOneA: false + UseGeneralizedNLCOneB: false + UseGeneralizedNLCOneMXSA: false + UseGeneralizedNLCOneMXSB: false + UseGeneralizedNLCOneMetadata: false + UseInstOffsetForGRO: 0 + UseMFMAF32XEmulation: false + UsePLRPack: 0 + UseSgprForGRO: -1 + UseSubtileImpl: true + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 1 + VectorWidthMXSA: 1 + VectorWidthMXSB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMXSA: 0 + WaveSeparateGlobalReadMXSB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 16 + WorkGroupMappingXCC: 2 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 256 + _DepthUA: 256 + _DepthUB: 256 + _DepthUMXSA: 8 + _DepthUMXSB: 8 + _DepthUMetadata: 256 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + enableLDSTrMXSA: false + enableLDSTrMXSB: false + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + tailLoopOptMXSA: false + tailLoopOptMXSB: false + - 
1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AdaptiveGemm: 0 + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1DivByMT1LowbitGT1: 0 + AssertFree1ElementMultiple: 1 + AssertKRingShiftTailWrapOnly: 0 + AssertSummationElementMultiple: 256 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BAddrInterleave: false + BaseName: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT38PfCBP4cNoa67MDHKbRKQKP0baWWAor2QL1_1-ioSnxM= + BufferLoad: true + BufferStore: 1 + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 256 + DirectToLds: 1 + DirectToLdsA: true + DirectToLdsB: true + DirectToLdsMXSA: true + DirectToLdsMXSB: true + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprMXSA: false + DirectToVgprMXSB: false + DirectToVgprSparseMetadata: false + DtlPlusLdsBuf: 0 + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: false + ExpertSchedulingMode: 0 + ExtraLatencyForLR: 0 + ExtraMiLatencyLeft: -1 + ForceDisableShadowInit: false + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 32 + GlobalReadVectorWidthB: 32 + GlobalReadVectorWidthMXSA: 12 + GlobalReadVectorWidthMXSB: 4 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + KRingShift: false + Kernel: true + KernelLanguage: Assembly + KernelNameMin: 
Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT384x128x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT12_4_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 256 + LSCB: 256 + LSCMXSA: 8 + LSCMXSB: 8 + LSPA: 32 + LSPB: 32 + LSPMXSA: 384 + LSPMXSB: 128 + LVCA: 8 + LVCB: 8 + LVCMXSA: 1 + LVCMXSB: 2 + LVPA: 1 + LVPB: 1 + LVPMXSA: 32 + LVPMXSB: 32 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsBlockSizePerPadMXSA: 0 + LdsBlockSizePerPadMXSB: 0 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 139264 + LdsInitCVgprs: false + LdsNumBytes: 139264 + LdsNumElementsAlignedA: 49152 + LdsNumElementsAlignedB: 16384 + LdsNumElementsAlignedMXSA: 3072 + LdsNumElementsAlignedMXSB: 1024 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 69632 + LdsOffsetB: 53248 + LdsOffsetB_Blk: 122880 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMXSA: 49152 + LdsOffsetMXSA_Blk: 118784 + LdsOffsetMXSB: 52224 + LdsOffsetMXSB_Blk: 121856 + LdsOffsetMetadata: 53248 + LdsOffsetMetadata_Blk: 122880 + LdsPadA: 0 + LdsPadB: 0 + LdsPadMXSA: 0 + LdsPadMXSB: 0 + LdsPadMetadata: 0 + LocalReadVectorWidth: -1 + LocalReadVectorWidthA: 32 + LocalReadVectorWidthB: 32 + LocalReadVectorWidthMXSA: 1 + LocalReadVectorWidthMXSB: 1 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LocalWriteUseSgprMXSA: true + LocalWriteUseSgprMXSB: true + LoopIters: 2 + LoopUnroll: 256 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + 
MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMXSA: 1 + MIInputPerThreadMXSB: 1 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [12, 4] + MIWaveTileA: 12 + MIWaveTileB: 4 + MIWaveTileMXSA: 12 + MIWaveTileMXSB: 4 + MIWaveTileMetadata: 0 + MacroTile0: 384 + MacroTile1: 128 + MacroTileA: 384 + MacroTileB: 128 + MacroTileMXSA: 384 + MacroTileMXSB: 128 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + MinGRIncPerMfma: 3 + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonDTLTailLoopMXSA: true + NonDTLTailLoopMXSB: true + NonTemporal: -1 + NonTemporalA: 4 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 4 + NonTemporalE: 0 + NonTemporalMXSA: 0 + NonTemporalMXSB: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 192 + NumGlobalWriteVectorsPerThread: 192 + NumLdsBlk: 2 + NumLoadsA: 12 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsCoalescedMXSA: 1 + NumLoadsCoalescedMXSB: 1 + NumLoadsMXSA: 1 + NumLoadsMXSB: 1 + NumLoadsPerpendicularA: 12 + NumLoadsPerpendicularB: 4 + NumLoadsPerpendicularMXSA: 1 + NumLoadsPerpendicularMXSB: 1 + NumThreads: 256 + NumTotalPackedLoadsA: -1 + NumTotalPackedLoadsB: -1 + NumTotalPackedLoadsMXSA: -1 + NumTotalPackedLoadsMXSB: -1 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGROverBarrier: 0 + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 25 + 
SolutionNameMin: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT384x128x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT12_4_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM16_WGMXCC2_WGMXCCGn1 + SourceSwap: false + SpaceFillingAlgo: [] + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: true + StoreSyncOpt: 0 + StoreVectorWidth: 4 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SubGroupMXSA: 8 + SubGroupMXSB: 32 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: 0 + TailloopInNll: false + ThreadTile: [1, 1] + ThreadTile0: 48 + ThreadTile1: 4 + ThreadTileA: 48 + ThreadTileB: 4 + ThreadTileMXSA: 48 + ThreadTileMXSB: 4 + TransposeLDS: 1 + TransposeLDSMetadata: -1 + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMXSA: true + UnrollMajorLDSMXSB: true + Use64bShadowLimit: false + Use64bShadowLimitMX: false + UseCustomMainLoopSchedule: 0 + UseDirect32XEmulation: false + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: false + UseGeneralizedNLCOneA: false + UseGeneralizedNLCOneB: false + UseGeneralizedNLCOneMXSA: false + UseGeneralizedNLCOneMXSB: false + UseGeneralizedNLCOneMetadata: false + UseInstOffsetForGRO: 0 + UseMFMAF32XEmulation: false + UsePLRPack: 0 + UseSgprForGRO: -1 + UseSubtileImpl: true + Valid: true + VectorStore: -1 + VectorWidthA: 1 + 
VectorWidthB: 1 + VectorWidthMXSA: 1 + VectorWidthMXSB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMXSA: 0 + WaveSeparateGlobalReadMXSB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 16 + WorkGroupMappingXCC: 2 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 256 + _DepthUA: 256 + _DepthUB: 256 + _DepthUMXSA: 8 + _DepthUMXSB: 8 + _DepthUMetadata: 256 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + enableLDSTrMXSA: false + enableLDSTrMXSB: false + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + tailLoopOptMXSA: false + tailLoopOptMXSB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AdaptiveGemm: 0 + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1DivByMT1LowbitGT1: 0 + AssertFree1ElementMultiple: 1 + AssertKRingShiftTailWrapOnly: 0 + AssertSummationElementMultiple: 256 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BAddrInterleave: false + BaseName: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT32GHMtx7cclUEZq8HXp3EV83TsDigTCOy4HpQzcwTdNNQ= + BufferLoad: true + BufferStore: 1 + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 256 + DirectToLds: 1 + DirectToLdsA: true + DirectToLdsB: true + DirectToLdsMXSA: true + DirectToLdsMXSB: true + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprMXSA: false + DirectToVgprMXSB: false + DirectToVgprSparseMetadata: false + 
DtlPlusLdsBuf: 0 + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: false + ExpertSchedulingMode: 0 + ExtraLatencyForLR: 0 + ExtraMiLatencyLeft: -1 + ForceDisableShadowInit: false + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 32 + GlobalReadVectorWidthB: 32 + GlobalReadVectorWidthMXSA: 10 + GlobalReadVectorWidthMXSB: 4 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + KRingShift: false + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT320x256x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT10_8_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 256 + LSCB: 256 + LSCMXSA: 8 + LSCMXSB: 8 + LSPA: 32 + LSPB: 32 + LSPMXSA: 320 + LSPMXSB: 128 + LVCA: 8 + LVCB: 8 + LVCMXSA: 1 + LVCMXSB: 2 + LVPA: 1 + LVPB: 1 + LVPMXSA: 32 + LVPMXSB: 32 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsBlockSizePerPadMXSA: 0 + LdsBlockSizePerPadMXSB: 0 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 156672 + LdsInitCVgprs: false + LdsNumBytes: 156672 + 
LdsNumElementsAlignedA: 40960 + LdsNumElementsAlignedB: 32768 + LdsNumElementsAlignedMXSA: 2560 + LdsNumElementsAlignedMXSB: 2048 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 78336 + LdsOffsetB: 45568 + LdsOffsetB_Blk: 123904 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMXSA: 40960 + LdsOffsetMXSA_Blk: 119296 + LdsOffsetMXSB: 43520 + LdsOffsetMXSB_Blk: 121856 + LdsOffsetMetadata: 45568 + LdsOffsetMetadata_Blk: 123904 + LdsPadA: 0 + LdsPadB: 0 + LdsPadMXSA: 0 + LdsPadMXSB: 0 + LdsPadMetadata: 0 + LocalReadVectorWidth: -1 + LocalReadVectorWidthA: 32 + LocalReadVectorWidthB: 32 + LocalReadVectorWidthMXSA: 1 + LocalReadVectorWidthMXSB: 1 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LocalWriteUseSgprMXSA: true + LocalWriteUseSgprMXSB: true + LoopIters: 2 + LoopUnroll: 256 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMXSA: 1 + MIInputPerThreadMXSB: 1 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [10, 8] + MIWaveTileA: 10 + MIWaveTileB: 8 + MIWaveTileMXSA: 10 + MIWaveTileMXSB: 8 + MIWaveTileMetadata: 0 + MacroTile0: 320 + MacroTile1: 256 + MacroTileA: 320 + MacroTileB: 256 + MacroTileMXSA: 320 + MacroTileMXSB: 256 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + MinGRIncPerMfma: 3 + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonDTLTailLoopMXSA: true + NonDTLTailLoopMXSB: true + NonTemporal: -1 + NonTemporalA: 4 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 4 + 
NonTemporalE: 0 + NonTemporalMXSA: 0 + NonTemporalMXSB: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 320 + NumGlobalWriteVectorsPerThread: 320 + NumLdsBlk: 2 + NumLoadsA: 10 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsCoalescedMXSA: 1 + NumLoadsCoalescedMXSB: 1 + NumLoadsMXSA: 1 + NumLoadsMXSB: 2 + NumLoadsPerpendicularA: 10 + NumLoadsPerpendicularB: 8 + NumLoadsPerpendicularMXSA: 1 + NumLoadsPerpendicularMXSB: 2 + NumThreads: 256 + NumTotalPackedLoadsA: -1 + NumTotalPackedLoadsB: -1 + NumTotalPackedLoadsMXSA: -1 + NumTotalPackedLoadsMXSB: -1 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGROverBarrier: 0 + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 26 + SolutionNameMin: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT320x256x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT10_8_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM16_WGMXCC2_WGMXCCGn1 + SourceSwap: false + SpaceFillingAlgo: [] + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: true + StoreSyncOpt: 0 + StoreVectorWidth: 4 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + 
SubGroupB: 32 + SubGroupMXSA: 8 + SubGroupMXSB: 32 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: 0 + TailloopInNll: false + ThreadTile: [1, 1] + ThreadTile0: 40 + ThreadTile1: 8 + ThreadTileA: 40 + ThreadTileB: 8 + ThreadTileMXSA: 40 + ThreadTileMXSB: 8 + TransposeLDS: 1 + TransposeLDSMetadata: -1 + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMXSA: true + UnrollMajorLDSMXSB: true + Use64bShadowLimit: false + Use64bShadowLimitMX: false + UseCustomMainLoopSchedule: 0 + UseDirect32XEmulation: false + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: false + UseGeneralizedNLCOneA: false + UseGeneralizedNLCOneB: false + UseGeneralizedNLCOneMXSA: false + UseGeneralizedNLCOneMXSB: false + UseGeneralizedNLCOneMetadata: false + UseInstOffsetForGRO: 0 + UseMFMAF32XEmulation: false + UsePLRPack: 0 + UseSgprForGRO: -1 + UseSubtileImpl: true + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 1 + VectorWidthMXSA: 1 + VectorWidthMXSB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMXSA: 0 + WaveSeparateGlobalReadMXSB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 16 + WorkGroupMappingXCC: 2 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 256 + _DepthUA: 256 + _DepthUB: 256 + _DepthUMXSA: 8 + _DepthUMXSB: 8 + _DepthUMetadata: 256 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + enableLDSTrMXSA: false + enableLDSTrMXSB: false + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + tailLoopOptMXSA: false + tailLoopOptMXSB: false + - 1LDSBuffer: 
0 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AdaptiveGemm: 0 + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1DivByMT1LowbitGT1: 0 + AssertFree1ElementMultiple: 1 + AssertKRingShiftTailWrapOnly: 0 + AssertSummationElementMultiple: 256 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BAddrInterleave: false + BaseName: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT25yENsNJL4B9Vb5GlLD7ofBX9eYUJtt3JL_VzKdaPOs5g= + BufferLoad: true + BufferStore: 1 + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 256 + DirectToLds: 1 + DirectToLdsA: true + DirectToLdsB: true + DirectToLdsMXSA: true + DirectToLdsMXSB: true + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprMXSA: false + DirectToVgprMXSB: false + DirectToVgprSparseMetadata: false + DtlPlusLdsBuf: 0 + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: false + ExpertSchedulingMode: 0 + ExtraLatencyForLR: 0 + ExtraMiLatencyLeft: -1 + ForceDisableShadowInit: false + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 32 + GlobalReadVectorWidthB: 32 + GlobalReadVectorWidthMXSA: 4 + GlobalReadVectorWidthMXSB: 10 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + KRingShift: false + Kernel: true + KernelLanguage: Assembly + KernelNameMin: 
Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT256x320x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT8_10_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 256 + LSCB: 256 + LSCMXSA: 8 + LSCMXSB: 8 + LSPA: 32 + LSPB: 32 + LSPMXSA: 128 + LSPMXSB: 320 + LVCA: 8 + LVCB: 8 + LVCMXSA: 2 + LVCMXSB: 1 + LVPA: 1 + LVPB: 1 + LVPMXSA: 32 + LVPMXSB: 32 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsBlockSizePerPadMXSA: 0 + LdsBlockSizePerPadMXSB: 0 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 156672 + LdsInitCVgprs: false + LdsNumBytes: 156672 + LdsNumElementsAlignedA: 32768 + LdsNumElementsAlignedB: 40960 + LdsNumElementsAlignedMXSA: 2048 + LdsNumElementsAlignedMXSB: 2560 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 78336 + LdsOffsetB: 37376 + LdsOffsetB_Blk: 115712 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMXSA: 32768 + LdsOffsetMXSA_Blk: 111104 + LdsOffsetMXSB: 34816 + LdsOffsetMXSB_Blk: 113152 + LdsOffsetMetadata: 37376 + LdsOffsetMetadata_Blk: 115712 + LdsPadA: 0 + LdsPadB: 0 + LdsPadMXSA: 0 + LdsPadMXSB: 0 + LdsPadMetadata: 0 + LocalReadVectorWidth: -1 + LocalReadVectorWidthA: 32 + LocalReadVectorWidthB: 32 + LocalReadVectorWidthMXSA: 1 + LocalReadVectorWidthMXSB: 1 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LocalWriteUseSgprMXSA: true + LocalWriteUseSgprMXSB: true + LoopIters: 2 + LoopUnroll: 256 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + 
MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMXSA: 1 + MIInputPerThreadMXSB: 1 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [8, 10] + MIWaveTileA: 8 + MIWaveTileB: 10 + MIWaveTileMXSA: 8 + MIWaveTileMXSB: 10 + MIWaveTileMetadata: 0 + MacroTile0: 256 + MacroTile1: 320 + MacroTileA: 256 + MacroTileB: 320 + MacroTileMXSA: 256 + MacroTileMXSB: 320 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + MinGRIncPerMfma: 3 + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonDTLTailLoopMXSA: true + NonDTLTailLoopMXSB: true + NonTemporal: -1 + NonTemporalA: 4 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 4 + NonTemporalE: 0 + NonTemporalMXSA: 0 + NonTemporalMXSB: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 320 + NumGlobalWriteVectorsPerThread: 320 + NumLdsBlk: 2 + NumLoadsA: 8 + NumLoadsB: 10 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsCoalescedMXSA: 1 + NumLoadsCoalescedMXSB: 1 + NumLoadsMXSA: 2 + NumLoadsMXSB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 10 + NumLoadsPerpendicularMXSA: 2 + NumLoadsPerpendicularMXSB: 1 + NumThreads: 256 + NumTotalPackedLoadsA: -1 + NumTotalPackedLoadsB: -1 + NumTotalPackedLoadsMXSA: -1 + NumTotalPackedLoadsMXSB: -1 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGROverBarrier: 0 + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 27 + 
SolutionNameMin: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT256x320x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT8_10_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM16_WGMXCC2_WGMXCCGn1 + SourceSwap: false + SpaceFillingAlgo: [] + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: true + StoreSyncOpt: 0 + StoreVectorWidth: 4 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SubGroupMXSA: 8 + SubGroupMXSB: 32 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: 0 + TailloopInNll: false + ThreadTile: [1, 1] + ThreadTile0: 32 + ThreadTile1: 10 + ThreadTileA: 32 + ThreadTileB: 10 + ThreadTileMXSA: 32 + ThreadTileMXSB: 10 + TransposeLDS: 1 + TransposeLDSMetadata: -1 + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMXSA: true + UnrollMajorLDSMXSB: true + Use64bShadowLimit: false + Use64bShadowLimitMX: false + UseCustomMainLoopSchedule: 0 + UseDirect32XEmulation: false + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: false + UseGeneralizedNLCOneA: false + UseGeneralizedNLCOneB: false + UseGeneralizedNLCOneMXSA: false + UseGeneralizedNLCOneMXSB: false + UseGeneralizedNLCOneMetadata: false + UseInstOffsetForGRO: 0 + UseMFMAF32XEmulation: false + UsePLRPack: 0 + UseSgprForGRO: -1 + UseSubtileImpl: true + Valid: true + VectorStore: -1 + VectorWidthA: 1 
+ VectorWidthB: 1 + VectorWidthMXSA: 1 + VectorWidthMXSB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMXSA: 0 + WaveSeparateGlobalReadMXSB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 16 + WorkGroupMappingXCC: 2 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 256 + _DepthUA: 256 + _DepthUB: 256 + _DepthUMXSA: 8 + _DepthUMXSB: 8 + _DepthUMetadata: 256 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + enableLDSTrMXSA: false + enableLDSTrMXSB: false + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + tailLoopOptMXSA: false + tailLoopOptMXSB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AdaptiveGemm: 0 + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1DivByMT1LowbitGT1: 0 + AssertFree1ElementMultiple: 1 + AssertKRingShiftTailWrapOnly: 0 + AssertSummationElementMultiple: 256 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BAddrInterleave: false + BaseName: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT64IqbWxZkPcZWy3vA8g9HsDHfQ7n72vJptvmQ79eUQ7kk= + BufferLoad: true + BufferStore: 1 + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 256 + DirectToLds: 1 + DirectToLdsA: true + DirectToLdsB: true + DirectToLdsMXSA: true + DirectToLdsMXSB: true + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprMXSA: false + DirectToVgprMXSB: false + DirectToVgprSparseMetadata: false 
+ DtlPlusLdsBuf: 0 + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: false + ExpertSchedulingMode: 0 + ExtraLatencyForLR: 0 + ExtraMiLatencyLeft: -1 + ForceDisableShadowInit: false + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 32 + GlobalReadVectorWidthB: 32 + GlobalReadVectorWidthMXSA: 2 + GlobalReadVectorWidthMXSB: 14 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + KRingShift: false + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT64x448x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT2_14_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 256 + LSCB: 256 + LSCMXSA: 8 + LSCMXSB: 8 + LSPA: 32 + LSPB: 32 + LSPMXSA: 64 + LSPMXSB: 448 + LVCA: 8 + LVCB: 8 + LVCMXSA: 4 + LVCMXSB: 1 + LVPA: 1 + LVPB: 1 + LVPMXSA: 32 + LVPMXSB: 32 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsBlockSizePerPadMXSA: 0 + LdsBlockSizePerPadMXSB: 0 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 139264 + LdsInitCVgprs: false + LdsNumBytes: 139264 + 
LdsNumElementsAlignedA: 8192 + LdsNumElementsAlignedB: 57344 + LdsNumElementsAlignedMXSA: 512 + LdsNumElementsAlignedMXSB: 3584 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 69632 + LdsOffsetB: 12288 + LdsOffsetB_Blk: 81920 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMXSA: 8192 + LdsOffsetMXSA_Blk: 77824 + LdsOffsetMXSB: 8704 + LdsOffsetMXSB_Blk: 78336 + LdsOffsetMetadata: 12288 + LdsOffsetMetadata_Blk: 81920 + LdsPadA: 0 + LdsPadB: 0 + LdsPadMXSA: 0 + LdsPadMXSB: 0 + LdsPadMetadata: 0 + LocalReadVectorWidth: -1 + LocalReadVectorWidthA: 32 + LocalReadVectorWidthB: 32 + LocalReadVectorWidthMXSA: 1 + LocalReadVectorWidthMXSB: 1 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LocalWriteUseSgprMXSA: true + LocalWriteUseSgprMXSB: true + LoopIters: 2 + LoopUnroll: 256 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMXSA: 1 + MIInputPerThreadMXSB: 1 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [2, 14] + MIWaveTileA: 2 + MIWaveTileB: 14 + MIWaveTileMXSA: 2 + MIWaveTileMXSB: 14 + MIWaveTileMetadata: 0 + MacroTile0: 64 + MacroTile1: 448 + MacroTileA: 64 + MacroTileB: 448 + MacroTileMXSA: 64 + MacroTileMXSB: 448 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + MinGRIncPerMfma: 3 + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonDTLTailLoopMXSA: true + NonDTLTailLoopMXSB: true + NonTemporal: -1 + NonTemporalA: 4 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 4 + NonTemporalE: 0 + 
NonTemporalMXSA: 0 + NonTemporalMXSB: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 112 + NumGlobalWriteVectorsPerThread: 112 + NumLdsBlk: 2 + NumLoadsA: 2 + NumLoadsB: 14 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsCoalescedMXSA: 1 + NumLoadsCoalescedMXSB: 1 + NumLoadsMXSA: 1 + NumLoadsMXSB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 14 + NumLoadsPerpendicularMXSA: 1 + NumLoadsPerpendicularMXSB: 1 + NumThreads: 256 + NumTotalPackedLoadsA: -1 + NumTotalPackedLoadsB: -1 + NumTotalPackedLoadsMXSA: -1 + NumTotalPackedLoadsMXSB: -1 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGROverBarrier: 0 + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 28 + SolutionNameMin: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT64x448x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT2_14_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM16_WGMXCC2_WGMXCCGn1 + SourceSwap: false + SpaceFillingAlgo: [] + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: true + StoreSyncOpt: 0 + StoreVectorWidth: 4 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + 
SubGroupMXSA: 8 + SubGroupMXSB: 32 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: 0 + TailloopInNll: false + ThreadTile: [1, 1] + ThreadTile0: 8 + ThreadTile1: 14 + ThreadTileA: 8 + ThreadTileB: 14 + ThreadTileMXSA: 8 + ThreadTileMXSB: 14 + TransposeLDS: 1 + TransposeLDSMetadata: -1 + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMXSA: true + UnrollMajorLDSMXSB: true + Use64bShadowLimit: false + Use64bShadowLimitMX: false + UseCustomMainLoopSchedule: 0 + UseDirect32XEmulation: false + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: false + UseGeneralizedNLCOneA: false + UseGeneralizedNLCOneB: false + UseGeneralizedNLCOneMXSA: false + UseGeneralizedNLCOneMXSB: false + UseGeneralizedNLCOneMetadata: false + UseInstOffsetForGRO: 0 + UseMFMAF32XEmulation: false + UsePLRPack: 0 + UseSgprForGRO: -1 + UseSubtileImpl: true + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 1 + VectorWidthMXSA: 1 + VectorWidthMXSB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMXSA: 0 + WaveSeparateGlobalReadMXSB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 16 + WorkGroupMappingXCC: 2 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 256 + _DepthUA: 256 + _DepthUB: 256 + _DepthUMXSA: 8 + _DepthUMXSB: 8 + _DepthUMetadata: 256 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + enableLDSTrMXSA: false + enableLDSTrMXSB: false + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + tailLoopOptMXSA: false + tailLoopOptMXSB: false + - 1LDSBuffer: 0 + 
ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AdaptiveGemm: 0 + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1DivByMT1LowbitGT1: 0 + AssertFree1ElementMultiple: 1 + AssertKRingShiftTailWrapOnly: 0 + AssertSummationElementMultiple: 256 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BAddrInterleave: false + BaseName: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT44JIOkXLbpjhS22SFIEupmgY9bhtzcXu69kfhwBJBO5wo= + BufferLoad: true + BufferStore: 1 + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 256 + DirectToLds: 1 + DirectToLdsA: true + DirectToLdsB: true + DirectToLdsMXSA: true + DirectToLdsMXSB: true + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprMXSA: false + DirectToVgprMXSB: false + DirectToVgprSparseMetadata: false + DtlPlusLdsBuf: 0 + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: false + ExpertSchedulingMode: 0 + ExtraLatencyForLR: 0 + ExtraMiLatencyLeft: -1 + ForceDisableShadowInit: false + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 32 + GlobalReadVectorWidthB: 32 + GlobalReadVectorWidthMXSA: 14 + GlobalReadVectorWidthMXSB: 2 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + KRingShift: false + Kernel: true + KernelLanguage: Assembly + KernelNameMin: 
Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT448x64x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT14_2_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 256 + LSCB: 256 + LSCMXSA: 8 + LSCMXSB: 8 + LSPA: 32 + LSPB: 32 + LSPMXSA: 448 + LSPMXSB: 64 + LVCA: 8 + LVCB: 8 + LVCMXSA: 1 + LVCMXSB: 4 + LVPA: 1 + LVPB: 1 + LVPMXSA: 32 + LVPMXSB: 32 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsBlockSizePerPadMXSA: 0 + LdsBlockSizePerPadMXSB: 0 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 139264 + LdsInitCVgprs: false + LdsNumBytes: 139264 + LdsNumElementsAlignedA: 57344 + LdsNumElementsAlignedB: 8192 + LdsNumElementsAlignedMXSA: 3584 + LdsNumElementsAlignedMXSB: 512 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 69632 + LdsOffsetB: 61440 + LdsOffsetB_Blk: 131072 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMXSA: 57344 + LdsOffsetMXSA_Blk: 126976 + LdsOffsetMXSB: 60928 + LdsOffsetMXSB_Blk: 130560 + LdsOffsetMetadata: 61440 + LdsOffsetMetadata_Blk: 131072 + LdsPadA: 0 + LdsPadB: 0 + LdsPadMXSA: 0 + LdsPadMXSB: 0 + LdsPadMetadata: 0 + LocalReadVectorWidth: -1 + LocalReadVectorWidthA: 32 + LocalReadVectorWidthB: 32 + LocalReadVectorWidthMXSA: 1 + LocalReadVectorWidthMXSB: 1 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LocalWriteUseSgprMXSA: true + LocalWriteUseSgprMXSB: true + LoopIters: 2 + LoopUnroll: 256 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + 
MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMXSA: 1 + MIInputPerThreadMXSB: 1 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [14, 2] + MIWaveTileA: 14 + MIWaveTileB: 2 + MIWaveTileMXSA: 14 + MIWaveTileMXSB: 2 + MIWaveTileMetadata: 0 + MacroTile0: 448 + MacroTile1: 64 + MacroTileA: 448 + MacroTileB: 64 + MacroTileMXSA: 448 + MacroTileMXSB: 64 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + MinGRIncPerMfma: 3 + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonDTLTailLoopMXSA: true + NonDTLTailLoopMXSB: true + NonTemporal: -1 + NonTemporalA: 4 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 4 + NonTemporalE: 0 + NonTemporalMXSA: 0 + NonTemporalMXSB: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 112 + NumGlobalWriteVectorsPerThread: 112 + NumLdsBlk: 2 + NumLoadsA: 14 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsCoalescedMXSA: 1 + NumLoadsCoalescedMXSB: 1 + NumLoadsMXSA: 1 + NumLoadsMXSB: 1 + NumLoadsPerpendicularA: 14 + NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularMXSA: 1 + NumLoadsPerpendicularMXSB: 1 + NumThreads: 256 + NumTotalPackedLoadsA: -1 + NumTotalPackedLoadsB: -1 + NumTotalPackedLoadsMXSA: -1 + NumTotalPackedLoadsMXSB: -1 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGROverBarrier: 0 + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 29 + 
SolutionNameMin: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT448x64x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT14_2_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM16_WGMXCC2_WGMXCCGn1 + SourceSwap: false + SpaceFillingAlgo: [] + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: true + StoreSyncOpt: 0 + StoreVectorWidth: 4 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SubGroupMXSA: 8 + SubGroupMXSB: 32 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: 0 + TailloopInNll: false + ThreadTile: [1, 1] + ThreadTile0: 56 + ThreadTile1: 2 + ThreadTileA: 56 + ThreadTileB: 2 + ThreadTileMXSA: 56 + ThreadTileMXSB: 2 + TransposeLDS: 1 + TransposeLDSMetadata: -1 + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMXSA: true + UnrollMajorLDSMXSB: true + Use64bShadowLimit: false + Use64bShadowLimitMX: false + UseCustomMainLoopSchedule: 0 + UseDirect32XEmulation: false + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: false + UseGeneralizedNLCOneA: false + UseGeneralizedNLCOneB: false + UseGeneralizedNLCOneMXSA: false + UseGeneralizedNLCOneMXSB: false + UseGeneralizedNLCOneMetadata: false + UseInstOffsetForGRO: 0 + UseMFMAF32XEmulation: false + UsePLRPack: 0 + UseSgprForGRO: -1 + UseSubtileImpl: true + Valid: true + VectorStore: -1 + VectorWidthA: 1 + 
VectorWidthB: 1 + VectorWidthMXSA: 1 + VectorWidthMXSB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMXSA: 0 + WaveSeparateGlobalReadMXSB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 16 + WorkGroupMappingXCC: 2 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 256 + _DepthUA: 256 + _DepthUB: 256 + _DepthUMXSA: 8 + _DepthUMXSB: 8 + _DepthUMetadata: 256 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + enableLDSTrMXSA: false + enableLDSTrMXSB: false + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + tailLoopOptMXSA: false + tailLoopOptMXSB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AdaptiveGemm: 0 + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1DivByMT1LowbitGT1: 0 + AssertFree1ElementMultiple: 1 + AssertKRingShiftTailWrapOnly: 0 + AssertSummationElementMultiple: 256 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BAddrInterleave: false + BaseName: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT12NCUb78G7QzN6ng2b3-8rphTRtJXo20S1KvmwH6k1FLQ= + BufferLoad: true + BufferStore: 1 + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 256 + DirectToLds: 1 + DirectToLdsA: true + DirectToLdsB: true + DirectToLdsMXSA: true + DirectToLdsMXSB: true + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprMXSA: false + DirectToVgprMXSB: false + DirectToVgprSparseMetadata: false + 
DtlPlusLdsBuf: 0 + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: false + ExpertSchedulingMode: 0 + ExtraLatencyForLR: 0 + ExtraMiLatencyLeft: -1 + ForceDisableShadowInit: false + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 32 + GlobalReadVectorWidthB: 32 + GlobalReadVectorWidthMXSA: 4 + GlobalReadVectorWidthMXSB: 14 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + KRingShift: false + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT128x448x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT4_14_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 256 + LSCB: 256 + LSCMXSA: 8 + LSCMXSB: 8 + LSPA: 32 + LSPB: 32 + LSPMXSA: 128 + LSPMXSB: 448 + LVCA: 8 + LVCB: 8 + LVCMXSA: 2 + LVCMXSB: 1 + LVPA: 1 + LVPB: 1 + LVPMXSA: 32 + LVPMXSB: 32 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsBlockSizePerPadMXSA: 0 + LdsBlockSizePerPadMXSB: 0 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 156672 + LdsInitCVgprs: false + LdsNumBytes: 156672 + 
LdsNumElementsAlignedA: 16384 + LdsNumElementsAlignedB: 57344 + LdsNumElementsAlignedMXSA: 1024 + LdsNumElementsAlignedMXSB: 3584 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 78336 + LdsOffsetB: 20992 + LdsOffsetB_Blk: 99328 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMXSA: 16384 + LdsOffsetMXSA_Blk: 94720 + LdsOffsetMXSB: 17408 + LdsOffsetMXSB_Blk: 95744 + LdsOffsetMetadata: 20992 + LdsOffsetMetadata_Blk: 99328 + LdsPadA: 0 + LdsPadB: 0 + LdsPadMXSA: 0 + LdsPadMXSB: 0 + LdsPadMetadata: 0 + LocalReadVectorWidth: -1 + LocalReadVectorWidthA: 32 + LocalReadVectorWidthB: 32 + LocalReadVectorWidthMXSA: 1 + LocalReadVectorWidthMXSB: 1 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LocalWriteUseSgprMXSA: true + LocalWriteUseSgprMXSB: true + LoopIters: 2 + LoopUnroll: 256 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMXSA: 1 + MIInputPerThreadMXSB: 1 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 14] + MIWaveTileA: 4 + MIWaveTileB: 14 + MIWaveTileMXSA: 4 + MIWaveTileMXSB: 14 + MIWaveTileMetadata: 0 + MacroTile0: 128 + MacroTile1: 448 + MacroTileA: 128 + MacroTileB: 448 + MacroTileMXSA: 128 + MacroTileMXSB: 448 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + MinGRIncPerMfma: 3 + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonDTLTailLoopMXSA: true + NonDTLTailLoopMXSB: true + NonTemporal: -1 + NonTemporalA: 4 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 4 + 
NonTemporalE: 0 + NonTemporalMXSA: 0 + NonTemporalMXSB: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 224 + NumGlobalWriteVectorsPerThread: 224 + NumLdsBlk: 2 + NumLoadsA: 4 + NumLoadsB: 14 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsCoalescedMXSA: 1 + NumLoadsCoalescedMXSB: 1 + NumLoadsMXSA: 1 + NumLoadsMXSB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 14 + NumLoadsPerpendicularMXSA: 1 + NumLoadsPerpendicularMXSB: 1 + NumThreads: 256 + NumTotalPackedLoadsA: -1 + NumTotalPackedLoadsB: -1 + NumTotalPackedLoadsMXSA: -1 + NumTotalPackedLoadsMXSB: -1 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGROverBarrier: 0 + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 30 + SolutionNameMin: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT128x448x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT4_14_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM16_WGMXCC2_WGMXCCGn1 + SourceSwap: false + SpaceFillingAlgo: [] + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: true + StoreSyncOpt: 0 + StoreVectorWidth: 4 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + 
SubGroupB: 32 + SubGroupMXSA: 8 + SubGroupMXSB: 32 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: 0 + TailloopInNll: false + ThreadTile: [1, 1] + ThreadTile0: 16 + ThreadTile1: 14 + ThreadTileA: 16 + ThreadTileB: 14 + ThreadTileMXSA: 16 + ThreadTileMXSB: 14 + TransposeLDS: 1 + TransposeLDSMetadata: -1 + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMXSA: true + UnrollMajorLDSMXSB: true + Use64bShadowLimit: false + Use64bShadowLimitMX: false + UseCustomMainLoopSchedule: 0 + UseDirect32XEmulation: false + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: false + UseGeneralizedNLCOneA: false + UseGeneralizedNLCOneB: false + UseGeneralizedNLCOneMXSA: false + UseGeneralizedNLCOneMXSB: false + UseGeneralizedNLCOneMetadata: false + UseInstOffsetForGRO: 0 + UseMFMAF32XEmulation: false + UsePLRPack: 0 + UseSgprForGRO: -1 + UseSubtileImpl: true + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 1 + VectorWidthMXSA: 1 + VectorWidthMXSB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMXSA: 0 + WaveSeparateGlobalReadMXSB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 16 + WorkGroupMappingXCC: 2 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 256 + _DepthUA: 256 + _DepthUB: 256 + _DepthUMXSA: 8 + _DepthUMXSB: 8 + _DepthUMetadata: 256 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + enableLDSTrMXSA: false + enableLDSTrMXSB: false + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + tailLoopOptMXSA: false + tailLoopOptMXSB: false + - 
1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AdaptiveGemm: 0 + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1DivByMT1LowbitGT1: 0 + AssertFree1ElementMultiple: 1 + AssertKRingShiftTailWrapOnly: 0 + AssertSummationElementMultiple: 256 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BAddrInterleave: false + BaseName: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT44K2rcrzW2gkNghqcoHIxk3lU0xSXGNO6q0t-lzqdm54M= + BufferLoad: true + BufferStore: 1 + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 256 + DirectToLds: 1 + DirectToLdsA: true + DirectToLdsB: true + DirectToLdsMXSA: true + DirectToLdsMXSB: true + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprMXSA: false + DirectToVgprMXSB: false + DirectToVgprSparseMetadata: false + DtlPlusLdsBuf: 0 + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: false + ExpertSchedulingMode: 0 + ExtraLatencyForLR: 0 + ExtraMiLatencyLeft: -1 + ForceDisableShadowInit: false + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 32 + GlobalReadVectorWidthB: 32 + GlobalReadVectorWidthMXSA: 14 + GlobalReadVectorWidthMXSB: 4 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + KRingShift: false + Kernel: true + KernelLanguage: Assembly + KernelNameMin: 
Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT448x128x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT14_4_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 256 + LSCB: 256 + LSCMXSA: 8 + LSCMXSB: 8 + LSPA: 32 + LSPB: 32 + LSPMXSA: 448 + LSPMXSB: 128 + LVCA: 8 + LVCB: 8 + LVCMXSA: 1 + LVCMXSB: 2 + LVPA: 1 + LVPB: 1 + LVPMXSA: 32 + LVPMXSB: 32 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsBlockSizePerPadMXSA: 0 + LdsBlockSizePerPadMXSB: 0 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 156672 + LdsInitCVgprs: false + LdsNumBytes: 156672 + LdsNumElementsAlignedA: 57344 + LdsNumElementsAlignedB: 16384 + LdsNumElementsAlignedMXSA: 3584 + LdsNumElementsAlignedMXSB: 1024 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 78336 + LdsOffsetB: 61952 + LdsOffsetB_Blk: 140288 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMXSA: 57344 + LdsOffsetMXSA_Blk: 135680 + LdsOffsetMXSB: 60928 + LdsOffsetMXSB_Blk: 139264 + LdsOffsetMetadata: 61952 + LdsOffsetMetadata_Blk: 140288 + LdsPadA: 0 + LdsPadB: 0 + LdsPadMXSA: 0 + LdsPadMXSB: 0 + LdsPadMetadata: 0 + LocalReadVectorWidth: -1 + LocalReadVectorWidthA: 32 + LocalReadVectorWidthB: 32 + LocalReadVectorWidthMXSA: 1 + LocalReadVectorWidthMXSB: 1 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LocalWriteUseSgprMXSA: true + LocalWriteUseSgprMXSB: true + LoopIters: 2 + LoopUnroll: 256 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + 
MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMXSA: 1 + MIInputPerThreadMXSB: 1 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [14, 4] + MIWaveTileA: 14 + MIWaveTileB: 4 + MIWaveTileMXSA: 14 + MIWaveTileMXSB: 4 + MIWaveTileMetadata: 0 + MacroTile0: 448 + MacroTile1: 128 + MacroTileA: 448 + MacroTileB: 128 + MacroTileMXSA: 448 + MacroTileMXSB: 128 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + MinGRIncPerMfma: 3 + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonDTLTailLoopMXSA: true + NonDTLTailLoopMXSB: true + NonTemporal: -1 + NonTemporalA: 4 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 4 + NonTemporalE: 0 + NonTemporalMXSA: 0 + NonTemporalMXSB: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 224 + NumGlobalWriteVectorsPerThread: 224 + NumLdsBlk: 2 + NumLoadsA: 14 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsCoalescedMXSA: 1 + NumLoadsCoalescedMXSB: 1 + NumLoadsMXSA: 1 + NumLoadsMXSB: 1 + NumLoadsPerpendicularA: 14 + NumLoadsPerpendicularB: 4 + NumLoadsPerpendicularMXSA: 1 + NumLoadsPerpendicularMXSB: 1 + NumThreads: 256 + NumTotalPackedLoadsA: -1 + NumTotalPackedLoadsB: -1 + NumTotalPackedLoadsMXSA: -1 + NumTotalPackedLoadsMXSB: -1 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGROverBarrier: 0 + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 31 + 
SolutionNameMin: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT448x128x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT14_4_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM16_WGMXCC2_WGMXCCGn1 + SourceSwap: false + SpaceFillingAlgo: [] + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: true + StoreSyncOpt: 0 + StoreVectorWidth: 4 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SubGroupMXSA: 8 + SubGroupMXSB: 32 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: 0 + TailloopInNll: false + ThreadTile: [1, 1] + ThreadTile0: 56 + ThreadTile1: 4 + ThreadTileA: 56 + ThreadTileB: 4 + ThreadTileMXSA: 56 + ThreadTileMXSB: 4 + TransposeLDS: 1 + TransposeLDSMetadata: -1 + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMXSA: true + UnrollMajorLDSMXSB: true + Use64bShadowLimit: false + Use64bShadowLimitMX: false + UseCustomMainLoopSchedule: 0 + UseDirect32XEmulation: false + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: false + UseGeneralizedNLCOneA: false + UseGeneralizedNLCOneB: false + UseGeneralizedNLCOneMXSA: false + UseGeneralizedNLCOneMXSB: false + UseGeneralizedNLCOneMetadata: false + UseInstOffsetForGRO: 0 + UseMFMAF32XEmulation: false + UsePLRPack: 0 + UseSgprForGRO: -1 + UseSubtileImpl: true + Valid: true + VectorStore: -1 + VectorWidthA: 1 + 
VectorWidthB: 1 + VectorWidthMXSA: 1 + VectorWidthMXSB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMXSA: 0 + WaveSeparateGlobalReadMXSB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 16 + WorkGroupMappingXCC: 2 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 256 + _DepthUA: 256 + _DepthUB: 256 + _DepthUMXSA: 8 + _DepthUMXSB: 8 + _DepthUMetadata: 256 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + enableLDSTrMXSA: false + enableLDSTrMXSB: false + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + tailLoopOptMXSA: false + tailLoopOptMXSB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AdaptiveGemm: 0 + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1DivByMT1LowbitGT1: 0 + AssertFree1ElementMultiple: 1 + AssertKRingShiftTailWrapOnly: 0 + AssertSummationElementMultiple: 256 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BAddrInterleave: false + BaseName: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT197rqXtfRTrgckooWIHV9QRZkAUvBYDajyR7fyXPSaOfI= + BufferLoad: true + BufferStore: 1 + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 256 + DirectToLds: 1 + DirectToLdsA: true + DirectToLdsB: true + DirectToLdsMXSA: true + DirectToLdsMXSB: true + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprMXSA: false + DirectToVgprMXSB: false + DirectToVgprSparseMetadata: false + 
DtlPlusLdsBuf: 0 + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: false + ExpertSchedulingMode: 0 + ExtraLatencyForLR: 0 + ExtraMiLatencyLeft: -1 + ForceDisableShadowInit: false + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 32 + GlobalReadVectorWidthB: 32 + GlobalReadVectorWidthMXSA: 6 + GlobalReadVectorWidthMXSB: 12 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + KRingShift: false + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT192x384x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT6_12_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 256 + LSCB: 256 + LSCMXSA: 8 + LSCMXSB: 8 + LSPA: 32 + LSPB: 32 + LSPMXSA: 192 + LSPMXSB: 384 + LVCA: 8 + LVCB: 8 + LVCMXSA: 2 + LVCMXSB: 1 + LVPA: 1 + LVPB: 1 + LVPMXSA: 32 + LVPMXSB: 32 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsBlockSizePerPadMXSA: 0 + LdsBlockSizePerPadMXSB: 0 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 156672 + LdsInitCVgprs: false + LdsNumBytes: 156672 + 
LdsNumElementsAlignedA: 24576 + LdsNumElementsAlignedB: 49152 + LdsNumElementsAlignedMXSA: 1536 + LdsNumElementsAlignedMXSB: 3072 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 78336 + LdsOffsetB: 29184 + LdsOffsetB_Blk: 107520 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMXSA: 24576 + LdsOffsetMXSA_Blk: 102912 + LdsOffsetMXSB: 26112 + LdsOffsetMXSB_Blk: 104448 + LdsOffsetMetadata: 29184 + LdsOffsetMetadata_Blk: 107520 + LdsPadA: 0 + LdsPadB: 0 + LdsPadMXSA: 0 + LdsPadMXSB: 0 + LdsPadMetadata: 0 + LocalReadVectorWidth: -1 + LocalReadVectorWidthA: 32 + LocalReadVectorWidthB: 32 + LocalReadVectorWidthMXSA: 1 + LocalReadVectorWidthMXSB: 1 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LocalWriteUseSgprMXSA: true + LocalWriteUseSgprMXSB: true + LoopIters: 2 + LoopUnroll: 256 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMXSA: 1 + MIInputPerThreadMXSB: 1 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [6, 12] + MIWaveTileA: 6 + MIWaveTileB: 12 + MIWaveTileMXSA: 6 + MIWaveTileMXSB: 12 + MIWaveTileMetadata: 0 + MacroTile0: 192 + MacroTile1: 384 + MacroTileA: 192 + MacroTileB: 384 + MacroTileMXSA: 192 + MacroTileMXSB: 384 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + MinGRIncPerMfma: 3 + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonDTLTailLoopMXSA: true + NonDTLTailLoopMXSB: true + NonTemporal: -1 + NonTemporalA: 4 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 4 + 
NonTemporalE: 0 + NonTemporalMXSA: 0 + NonTemporalMXSB: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 288 + NumGlobalWriteVectorsPerThread: 288 + NumLdsBlk: 2 + NumLoadsA: 6 + NumLoadsB: 12 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsCoalescedMXSA: 1 + NumLoadsCoalescedMXSB: 1 + NumLoadsMXSA: 1 + NumLoadsMXSB: 1 + NumLoadsPerpendicularA: 6 + NumLoadsPerpendicularB: 12 + NumLoadsPerpendicularMXSA: 1 + NumLoadsPerpendicularMXSB: 1 + NumThreads: 256 + NumTotalPackedLoadsA: -1 + NumTotalPackedLoadsB: -1 + NumTotalPackedLoadsMXSA: -1 + NumTotalPackedLoadsMXSB: -1 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGROverBarrier: 0 + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 32 + SolutionNameMin: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT192x384x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT6_12_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM16_WGMXCC2_WGMXCCGn1 + SourceSwap: false + SpaceFillingAlgo: [] + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: true + StoreSyncOpt: 0 + StoreVectorWidth: 4 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + 
SubGroupB: 32 + SubGroupMXSA: 8 + SubGroupMXSB: 32 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: 0 + TailloopInNll: false + ThreadTile: [1, 1] + ThreadTile0: 24 + ThreadTile1: 12 + ThreadTileA: 24 + ThreadTileB: 12 + ThreadTileMXSA: 24 + ThreadTileMXSB: 12 + TransposeLDS: 1 + TransposeLDSMetadata: -1 + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMXSA: true + UnrollMajorLDSMXSB: true + Use64bShadowLimit: false + Use64bShadowLimitMX: false + UseCustomMainLoopSchedule: 0 + UseDirect32XEmulation: false + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: false + UseGeneralizedNLCOneA: false + UseGeneralizedNLCOneB: false + UseGeneralizedNLCOneMXSA: false + UseGeneralizedNLCOneMXSB: false + UseGeneralizedNLCOneMetadata: false + UseInstOffsetForGRO: 0 + UseMFMAF32XEmulation: false + UsePLRPack: 0 + UseSgprForGRO: -1 + UseSubtileImpl: true + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 1 + VectorWidthMXSA: 1 + VectorWidthMXSB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMXSA: 0 + WaveSeparateGlobalReadMXSB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 16 + WorkGroupMappingXCC: 2 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 256 + _DepthUA: 256 + _DepthUB: 256 + _DepthUMXSA: 8 + _DepthUMXSB: 8 + _DepthUMetadata: 256 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + enableLDSTrMXSA: false + enableLDSTrMXSB: false + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + tailLoopOptMXSA: false + tailLoopOptMXSB: false + - 
1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AdaptiveGemm: 0 + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1DivByMT1LowbitGT1: 0 + AssertFree1ElementMultiple: 1 + AssertKRingShiftTailWrapOnly: 0 + AssertSummationElementMultiple: 256 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BAddrInterleave: false + BaseName: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT38M4Hg58ewW76BBGjsIKCCfd9HK3CXsd22WqnddQrMdIo= + BufferLoad: true + BufferStore: 1 + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 256 + DirectToLds: 1 + DirectToLdsA: true + DirectToLdsB: true + DirectToLdsMXSA: true + DirectToLdsMXSB: true + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprMXSA: false + DirectToVgprMXSB: false + DirectToVgprSparseMetadata: false + DtlPlusLdsBuf: 0 + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: false + ExpertSchedulingMode: 0 + ExtraLatencyForLR: 0 + ExtraMiLatencyLeft: -1 + ForceDisableShadowInit: false + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 32 + GlobalReadVectorWidthB: 32 + GlobalReadVectorWidthMXSA: 12 + GlobalReadVectorWidthMXSB: 6 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + KRingShift: false + Kernel: true + KernelLanguage: Assembly + KernelNameMin: 
Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT384x192x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT12_6_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 256 + LSCB: 256 + LSCMXSA: 8 + LSCMXSB: 8 + LSPA: 32 + LSPB: 32 + LSPMXSA: 384 + LSPMXSB: 192 + LVCA: 8 + LVCB: 8 + LVCMXSA: 1 + LVCMXSB: 2 + LVPA: 1 + LVPB: 1 + LVPMXSA: 32 + LVPMXSB: 32 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsBlockSizePerPadMXSA: 0 + LdsBlockSizePerPadMXSB: 0 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 156672 + LdsInitCVgprs: false + LdsNumBytes: 156672 + LdsNumElementsAlignedA: 49152 + LdsNumElementsAlignedB: 24576 + LdsNumElementsAlignedMXSA: 3072 + LdsNumElementsAlignedMXSB: 1536 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 78336 + LdsOffsetB: 53760 + LdsOffsetB_Blk: 132096 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMXSA: 49152 + LdsOffsetMXSA_Blk: 127488 + LdsOffsetMXSB: 52224 + LdsOffsetMXSB_Blk: 130560 + LdsOffsetMetadata: 53760 + LdsOffsetMetadata_Blk: 132096 + LdsPadA: 0 + LdsPadB: 0 + LdsPadMXSA: 0 + LdsPadMXSB: 0 + LdsPadMetadata: 0 + LocalReadVectorWidth: -1 + LocalReadVectorWidthA: 32 + LocalReadVectorWidthB: 32 + LocalReadVectorWidthMXSA: 1 + LocalReadVectorWidthMXSB: 1 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LocalWriteUseSgprMXSA: true + LocalWriteUseSgprMXSB: true + LoopIters: 2 + LoopUnroll: 256 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + 
MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMXSA: 1 + MIInputPerThreadMXSB: 1 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [12, 6] + MIWaveTileA: 12 + MIWaveTileB: 6 + MIWaveTileMXSA: 12 + MIWaveTileMXSB: 6 + MIWaveTileMetadata: 0 + MacroTile0: 384 + MacroTile1: 192 + MacroTileA: 384 + MacroTileB: 192 + MacroTileMXSA: 384 + MacroTileMXSB: 192 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + MinGRIncPerMfma: 3 + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonDTLTailLoopMXSA: true + NonDTLTailLoopMXSB: true + NonTemporal: -1 + NonTemporalA: 4 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 4 + NonTemporalE: 0 + NonTemporalMXSA: 0 + NonTemporalMXSB: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 288 + NumGlobalWriteVectorsPerThread: 288 + NumLdsBlk: 2 + NumLoadsA: 12 + NumLoadsB: 6 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsCoalescedMXSA: 1 + NumLoadsCoalescedMXSB: 1 + NumLoadsMXSA: 1 + NumLoadsMXSB: 1 + NumLoadsPerpendicularA: 12 + NumLoadsPerpendicularB: 6 + NumLoadsPerpendicularMXSA: 1 + NumLoadsPerpendicularMXSB: 1 + NumThreads: 256 + NumTotalPackedLoadsA: -1 + NumTotalPackedLoadsB: -1 + NumTotalPackedLoadsMXSA: -1 + NumTotalPackedLoadsMXSB: -1 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGROverBarrier: 0 + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 33 + 
SolutionNameMin: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT384x192x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT12_6_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM16_WGMXCC2_WGMXCCGn1 + SourceSwap: false + SpaceFillingAlgo: [] + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: true + StoreSyncOpt: 0 + StoreVectorWidth: 4 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SubGroupMXSA: 8 + SubGroupMXSB: 32 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: 0 + TailloopInNll: false + ThreadTile: [1, 1] + ThreadTile0: 48 + ThreadTile1: 6 + ThreadTileA: 48 + ThreadTileB: 6 + ThreadTileMXSA: 48 + ThreadTileMXSB: 6 + TransposeLDS: 1 + TransposeLDSMetadata: -1 + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMXSA: true + UnrollMajorLDSMXSB: true + Use64bShadowLimit: false + Use64bShadowLimitMX: false + UseCustomMainLoopSchedule: 0 + UseDirect32XEmulation: false + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: false + UseGeneralizedNLCOneA: false + UseGeneralizedNLCOneB: false + UseGeneralizedNLCOneMXSA: false + UseGeneralizedNLCOneMXSB: false + UseGeneralizedNLCOneMetadata: false + UseInstOffsetForGRO: 0 + UseMFMAF32XEmulation: false + UsePLRPack: 0 + UseSgprForGRO: -1 + UseSubtileImpl: true + Valid: true + VectorStore: -1 + VectorWidthA: 1 + 
VectorWidthB: 1 + VectorWidthMXSA: 1 + VectorWidthMXSB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMXSA: 0 + WaveSeparateGlobalReadMXSB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 16 + WorkGroupMappingXCC: 2 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 256 + _DepthUA: 256 + _DepthUB: 256 + _DepthUMXSA: 8 + _DepthUMXSB: 8 + _DepthUMetadata: 256 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + enableLDSTrMXSA: false + enableLDSTrMXSB: false + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + tailLoopOptMXSA: false + tailLoopOptMXSB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AdaptiveGemm: 0 + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1DivByMT1LowbitGT1: 0 + AssertFree1ElementMultiple: 1 + AssertKRingShiftTailWrapOnly: 0 + AssertSummationElementMultiple: 256 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BAddrInterleave: false + BaseName: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT12pSBqfd5dzj78eX1t5EQJzYZ5aHFVtQ9uJHS361sw3qY= + BufferLoad: true + BufferStore: 1 + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 256 + DirectToLds: 1 + DirectToLdsA: true + DirectToLdsB: true + DirectToLdsMXSA: true + DirectToLdsMXSB: true + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprMXSA: false + DirectToVgprMXSB: false + DirectToVgprSparseMetadata: false + 
DtlPlusLdsBuf: 0 + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: false + ExpertSchedulingMode: 0 + ExtraLatencyForLR: 0 + ExtraMiLatencyLeft: -1 + ForceDisableShadowInit: false + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 32 + GlobalReadVectorWidthB: 32 + GlobalReadVectorWidthMXSA: 4 + GlobalReadVectorWidthMXSB: 1 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + KRingShift: false + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT128x32x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT2_2_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1 + LDSTrInst: false + LSCA: 256 + LSCB: 256 + LSCMXSA: 8 + LSCMXSB: 8 + LSPA: 32 + LSPB: 32 + LSPMXSA: 128 + LSPMXSB: 32 + LVCA: 8 + LVCB: 8 + LVCMXSA: 2 + LVCMXSB: 8 + LVPA: 1 + LVPB: 1 + LVPMXSA: 32 + LVPMXSB: 32 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsBlockSizePerPadMXSA: 0 + LdsBlockSizePerPadMXSB: 0 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 54528 + LdsInitCVgprs: false + LdsNumBytes: 54528 + 
LdsNumElementsAlignedA: 16384 + LdsNumElementsAlignedB: 4096 + LdsNumElementsAlignedMXSA: 1024 + LdsNumElementsAlignedMXSB: 256 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 32768 + LdsOffsetB: 17664 + LdsOffsetB_Blk: 50432 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMXSA: 16384 + LdsOffsetMXSA_Blk: 49152 + LdsOffsetMXSB: 17408 + LdsOffsetMXSB_Blk: 50176 + LdsOffsetMetadata: 17664 + LdsOffsetMetadata_Blk: 50432 + LdsPadA: 0 + LdsPadB: 0 + LdsPadMXSA: 0 + LdsPadMXSB: 0 + LdsPadMetadata: 0 + LocalReadVectorWidth: -1 + LocalReadVectorWidthA: 32 + LocalReadVectorWidthB: 32 + LocalReadVectorWidthMXSA: 1 + LocalReadVectorWidthMXSB: 1 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LocalWriteUseSgprMXSA: true + LocalWriteUseSgprMXSB: true + LoopIters: 2 + LoopUnroll: 256 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMXSA: 1 + MIInputPerThreadMXSB: 1 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [4, 1] + MIWaveTile: [2, 2] + MIWaveTileA: 2 + MIWaveTileB: 2 + MIWaveTileMXSA: 2 + MIWaveTileMXSB: 2 + MIWaveTileMetadata: 0 + MacroTile0: 128 + MacroTile1: 32 + MacroTileA: 128 + MacroTileB: 32 + MacroTileMXSA: 128 + MacroTileMXSB: 32 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + MinGRIncPerMfma: 3 + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonDTLTailLoopMXSA: true + NonDTLTailLoopMXSB: true + NonTemporal: -1 + NonTemporalA: 4 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 4 + NonTemporalE: 0 + 
NonTemporalMXSA: 0 + NonTemporalMXSB: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 16 + NumLdsBlk: 2 + NumLoadsA: 4 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsCoalescedMXSA: 1 + NumLoadsCoalescedMXSB: 1 + NumLoadsMXSA: 1 + NumLoadsMXSB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularMXSA: 1 + NumLoadsPerpendicularMXSB: 1 + NumThreads: 256 + NumTotalPackedLoadsA: -1 + NumTotalPackedLoadsB: -1 + NumTotalPackedLoadsMXSA: -1 + NumTotalPackedLoadsMXSB: -1 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGROverBarrier: 0 + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 34 + SolutionNameMin: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT128x32x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT2_2_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM16_WGMXCC2_WGMXCCGn1 + SourceSwap: false + SpaceFillingAlgo: [] + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 4 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SubGroupMXSA: 
16 + SubGroupMXSB: 16 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: 0 + TailloopInNll: false + ThreadTile: [1, 1] + ThreadTile0: 8 + ThreadTile1: 2 + ThreadTileA: 8 + ThreadTileB: 2 + ThreadTileMXSA: 8 + ThreadTileMXSB: 2 + TransposeLDS: 1 + TransposeLDSMetadata: -1 + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMXSA: true + UnrollMajorLDSMXSB: true + Use64bShadowLimit: false + Use64bShadowLimitMX: false + UseCustomMainLoopSchedule: 0 + UseDirect32XEmulation: false + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: false + UseGeneralizedNLCOneA: false + UseGeneralizedNLCOneB: false + UseGeneralizedNLCOneMXSA: false + UseGeneralizedNLCOneMXSB: false + UseGeneralizedNLCOneMetadata: false + UseInstOffsetForGRO: 0 + UseMFMAF32XEmulation: false + UsePLRPack: 0 + UseSgprForGRO: -1 + UseSubtileImpl: true + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 1 + VectorWidthMXSA: 1 + VectorWidthMXSB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMXSA: 0 + WaveSeparateGlobalReadMXSB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [64, 4, 1] + WorkGroupMapping: 16 + WorkGroupMappingXCC: 2 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 256 + _DepthUA: 256 + _DepthUB: 256 + _DepthUMXSA: 8 + _DepthUMXSB: 8 + _DepthUMetadata: 256 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + enableLDSTrMXSA: false + enableLDSTrMXSB: false + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + tailLoopOptMXSA: false + tailLoopOptMXSB: false + - 1LDSBuffer: 0 + ActivationAlt: false + 
ActivationFuncCall: false + ActivationFused: true + AdaptiveGemm: 0 + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1DivByMT1LowbitGT1: 0 + AssertFree1ElementMultiple: 1 + AssertKRingShiftTailWrapOnly: 0 + AssertSummationElementMultiple: 256 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BAddrInterleave: false + BaseName: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT12lYJO4P9-MC57Obi2kvcZAOwe_1ysdtWgX7-VIBNgQpw= + BufferLoad: true + BufferStore: 1 + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 256 + DirectToLds: 1 + DirectToLdsA: true + DirectToLdsB: true + DirectToLdsMXSA: true + DirectToLdsMXSB: true + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprMXSA: false + DirectToVgprMXSB: false + DirectToVgprSparseMetadata: false + DtlPlusLdsBuf: 0 + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: false + ExpertSchedulingMode: 0 + ExtraLatencyForLR: 0 + ExtraMiLatencyLeft: -1 + ForceDisableShadowInit: false + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 32 + GlobalReadVectorWidthB: 32 + GlobalReadVectorWidthMXSA: 4 + GlobalReadVectorWidthMXSB: 3 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + KRingShift: false + Kernel: true + KernelLanguage: Assembly + KernelNameMin: 
Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT128x96x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT2_6_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1 + LDSTrInst: false + LSCA: 256 + LSCB: 256 + LSCMXSA: 8 + LSCMXSB: 8 + LSPA: 32 + LSPB: 32 + LSPMXSA: 128 + LSPMXSB: 96 + LVCA: 8 + LVCB: 8 + LVCMXSA: 2 + LVCMXSB: 3 + LVPA: 1 + LVPB: 1 + LVPMXSA: 32 + LVPMXSB: 32 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsBlockSizePerPadMXSA: 0 + LdsBlockSizePerPadMXSB: 0 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 63232 + LdsInitCVgprs: false + LdsNumBytes: 63232 + LdsNumElementsAlignedA: 16384 + LdsNumElementsAlignedB: 12288 + LdsNumElementsAlignedMXSA: 1024 + LdsNumElementsAlignedMXSB: 768 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 32768 + LdsOffsetB: 18176 + LdsOffsetB_Blk: 50944 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMXSA: 16384 + LdsOffsetMXSA_Blk: 49152 + LdsOffsetMXSB: 17408 + LdsOffsetMXSB_Blk: 50176 + LdsOffsetMetadata: 18176 + LdsOffsetMetadata_Blk: 50944 + LdsPadA: 0 + LdsPadB: 0 + LdsPadMXSA: 0 + LdsPadMXSB: 0 + LdsPadMetadata: 0 + LocalReadVectorWidth: -1 + LocalReadVectorWidthA: 32 + LocalReadVectorWidthB: 32 + LocalReadVectorWidthMXSA: 1 + LocalReadVectorWidthMXSB: 1 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LocalWriteUseSgprMXSA: true + LocalWriteUseSgprMXSB: true + LoopIters: 2 + LoopUnroll: 256 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + 
MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMXSA: 1 + MIInputPerThreadMXSB: 1 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [4, 1] + MIWaveTile: [2, 6] + MIWaveTileA: 2 + MIWaveTileB: 6 + MIWaveTileMXSA: 2 + MIWaveTileMXSB: 6 + MIWaveTileMetadata: 0 + MacroTile0: 128 + MacroTile1: 96 + MacroTileA: 128 + MacroTileB: 96 + MacroTileMXSA: 128 + MacroTileMXSB: 96 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + MinGRIncPerMfma: 3 + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonDTLTailLoopMXSA: true + NonDTLTailLoopMXSB: true + NonTemporal: -1 + NonTemporalA: 4 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 4 + NonTemporalE: 0 + NonTemporalMXSA: 0 + NonTemporalMXSB: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 48 + NumGlobalWriteVectorsPerThread: 48 + NumLdsBlk: 2 + NumLoadsA: 4 + NumLoadsB: 3 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsCoalescedMXSA: 1 + NumLoadsCoalescedMXSB: 1 + NumLoadsMXSA: 1 + NumLoadsMXSB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 3 + NumLoadsPerpendicularMXSA: 1 + NumLoadsPerpendicularMXSB: 1 + NumThreads: 256 + NumTotalPackedLoadsA: -1 + NumTotalPackedLoadsB: -1 + NumTotalPackedLoadsMXSA: -1 + NumTotalPackedLoadsMXSB: -1 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGROverBarrier: 0 + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 35 + SolutionNameMin: 
Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT128x96x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT2_6_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM16_WGMXCC2_WGMXCCGn1 + SourceSwap: false + SpaceFillingAlgo: [] + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 4 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SubGroupMXSA: 16 + SubGroupMXSB: 16 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: 0 + TailloopInNll: false + ThreadTile: [1, 1] + ThreadTile0: 8 + ThreadTile1: 6 + ThreadTileA: 8 + ThreadTileB: 6 + ThreadTileMXSA: 8 + ThreadTileMXSB: 6 + TransposeLDS: 1 + TransposeLDSMetadata: -1 + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMXSA: true + UnrollMajorLDSMXSB: true + Use64bShadowLimit: false + Use64bShadowLimitMX: false + UseCustomMainLoopSchedule: 0 + UseDirect32XEmulation: false + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: false + UseGeneralizedNLCOneA: false + UseGeneralizedNLCOneB: false + UseGeneralizedNLCOneMXSA: false + UseGeneralizedNLCOneMXSB: false + UseGeneralizedNLCOneMetadata: false + UseInstOffsetForGRO: 0 + UseMFMAF32XEmulation: false + UsePLRPack: 0 + UseSgprForGRO: -1 + UseSubtileImpl: true + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 1 + 
VectorWidthMXSA: 1 + VectorWidthMXSB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMXSA: 0 + WaveSeparateGlobalReadMXSB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [64, 4, 1] + WorkGroupMapping: 16 + WorkGroupMappingXCC: 2 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 256 + _DepthUA: 256 + _DepthUB: 256 + _DepthUMXSA: 8 + _DepthUMXSB: 8 + _DepthUMetadata: 256 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + enableLDSTrMXSA: false + enableLDSTrMXSB: false + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + tailLoopOptMXSA: false + tailLoopOptMXSB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AdaptiveGemm: 0 + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1DivByMT1LowbitGT1: 0 + AssertFree1ElementMultiple: 1 + AssertKRingShiftTailWrapOnly: 0 + AssertSummationElementMultiple: 256 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BAddrInterleave: false + BaseName: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT25ccxCqQ2dvGclrJYn0QqExpGJSq56GiHUJPL0jzB0qVw= + BufferLoad: true + BufferStore: 1 + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 256 + DirectToLds: 1 + DirectToLdsA: true + DirectToLdsB: true + DirectToLdsMXSA: true + DirectToLdsMXSB: true + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprMXSA: false + DirectToVgprMXSB: false + DirectToVgprSparseMetadata: false + DtlPlusLdsBuf: 0 
+ EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: false + ExpertSchedulingMode: 0 + ExtraLatencyForLR: 0 + ExtraMiLatencyLeft: -1 + ForceDisableShadowInit: false + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 32 + GlobalReadVectorWidthB: 32 + GlobalReadVectorWidthMXSA: 4 + GlobalReadVectorWidthMXSB: 1 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + KRingShift: false + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT256x32x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT4_2_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1 + LDSTrInst: false + LSCA: 256 + LSCB: 256 + LSCMXSA: 8 + LSCMXSB: 8 + LSPA: 32 + LSPB: 32 + LSPMXSA: 128 + LSPMXSB: 32 + LVCA: 8 + LVCB: 8 + LVCMXSA: 2 + LVCMXSB: 8 + LVPA: 1 + LVPB: 1 + LVPMXSA: 32 + LVPMXSB: 32 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsBlockSizePerPadMXSA: 0 + LdsBlockSizePerPadMXSB: 0 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 104704 + LdsInitCVgprs: false + LdsNumBytes: 104704 + LdsNumElementsAlignedA: 32768 + 
LdsNumElementsAlignedB: 4096 + LdsNumElementsAlignedMXSA: 2048 + LdsNumElementsAlignedMXSB: 256 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 35072 + LdsOffsetB_Blk: 100608 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMXSA: 32768 + LdsOffsetMXSA_Blk: 98304 + LdsOffsetMXSB: 34816 + LdsOffsetMXSB_Blk: 100352 + LdsOffsetMetadata: 35072 + LdsOffsetMetadata_Blk: 100608 + LdsPadA: 0 + LdsPadB: 0 + LdsPadMXSA: 0 + LdsPadMXSB: 0 + LdsPadMetadata: 0 + LocalReadVectorWidth: -1 + LocalReadVectorWidthA: 32 + LocalReadVectorWidthB: 32 + LocalReadVectorWidthMXSA: 1 + LocalReadVectorWidthMXSB: 1 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LocalWriteUseSgprMXSA: true + LocalWriteUseSgprMXSB: true + LoopIters: 2 + LoopUnroll: 256 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMXSA: 1 + MIInputPerThreadMXSB: 1 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [4, 1] + MIWaveTile: [4, 2] + MIWaveTileA: 4 + MIWaveTileB: 2 + MIWaveTileMXSA: 4 + MIWaveTileMXSB: 2 + MIWaveTileMetadata: 0 + MacroTile0: 256 + MacroTile1: 32 + MacroTileA: 256 + MacroTileB: 32 + MacroTileMXSA: 256 + MacroTileMXSB: 32 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + MinGRIncPerMfma: 3 + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonDTLTailLoopMXSA: true + NonDTLTailLoopMXSB: true + NonTemporal: -1 + NonTemporalA: 4 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 4 + NonTemporalE: 0 + NonTemporalMXSA: 0 + 
NonTemporalMXSB: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 32 + NumLdsBlk: 2 + NumLoadsA: 8 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsCoalescedMXSA: 1 + NumLoadsCoalescedMXSB: 1 + NumLoadsMXSA: 2 + NumLoadsMXSB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularMXSA: 2 + NumLoadsPerpendicularMXSB: 1 + NumThreads: 256 + NumTotalPackedLoadsA: -1 + NumTotalPackedLoadsB: -1 + NumTotalPackedLoadsMXSA: -1 + NumTotalPackedLoadsMXSB: -1 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGROverBarrier: 0 + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 36 + SolutionNameMin: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT256x32x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT4_2_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM16_WGMXCC2_WGMXCCGn1 + SourceSwap: false + SpaceFillingAlgo: [] + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 4 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SubGroupMXSA: 16 + SubGroupMXSB: 16 
+ SuppressNoLoadLoop: false + SwapGlobalReadOrder: 0 + TailloopInNll: false + ThreadTile: [1, 1] + ThreadTile0: 16 + ThreadTile1: 2 + ThreadTileA: 16 + ThreadTileB: 2 + ThreadTileMXSA: 16 + ThreadTileMXSB: 2 + TransposeLDS: 1 + TransposeLDSMetadata: -1 + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMXSA: true + UnrollMajorLDSMXSB: true + Use64bShadowLimit: false + Use64bShadowLimitMX: false + UseCustomMainLoopSchedule: 0 + UseDirect32XEmulation: false + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: false + UseGeneralizedNLCOneA: false + UseGeneralizedNLCOneB: false + UseGeneralizedNLCOneMXSA: false + UseGeneralizedNLCOneMXSB: false + UseGeneralizedNLCOneMetadata: false + UseInstOffsetForGRO: 0 + UseMFMAF32XEmulation: false + UsePLRPack: 0 + UseSgprForGRO: -1 + UseSubtileImpl: true + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 1 + VectorWidthMXSA: 1 + VectorWidthMXSB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMXSA: 0 + WaveSeparateGlobalReadMXSB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [64, 4, 1] + WorkGroupMapping: 16 + WorkGroupMappingXCC: 2 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 256 + _DepthUA: 256 + _DepthUB: 256 + _DepthUMXSA: 8 + _DepthUMXSB: 8 + _DepthUMetadata: 256 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + enableLDSTrMXSA: false + enableLDSTrMXSB: false + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + tailLoopOptMXSA: false + tailLoopOptMXSB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: 
false + ActivationFused: true + AdaptiveGemm: 0 + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1DivByMT1LowbitGT1: 0 + AssertFree1ElementMultiple: 1 + AssertKRingShiftTailWrapOnly: 0 + AssertSummationElementMultiple: 256 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BAddrInterleave: false + BaseName: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT25LD47Rd1FCRkyTzzAeZkhMOPqncN77I3VtqjftUOo3W0= + BufferLoad: true + BufferStore: 1 + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 256 + DirectToLds: 1 + DirectToLdsA: true + DirectToLdsB: true + DirectToLdsMXSA: true + DirectToLdsMXSB: true + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprMXSA: false + DirectToVgprMXSB: false + DirectToVgprSparseMetadata: false + DtlPlusLdsBuf: 0 + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: false + ExpertSchedulingMode: 0 + ExtraLatencyForLR: 0 + ExtraMiLatencyLeft: -1 + ForceDisableShadowInit: false + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 32 + GlobalReadVectorWidthB: 32 + GlobalReadVectorWidthMXSA: 4 + GlobalReadVectorWidthMXSB: 3 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + KRingShift: false + Kernel: true + KernelLanguage: Assembly + KernelNameMin: 
Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT256x96x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT4_6_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1 + LDSTrInst: false + LSCA: 256 + LSCB: 256 + LSCMXSA: 8 + LSCMXSB: 8 + LSPA: 32 + LSPB: 32 + LSPMXSA: 128 + LSPMXSB: 96 + LVCA: 8 + LVCB: 8 + LVCMXSA: 2 + LVCMXSB: 3 + LVPA: 1 + LVPB: 1 + LVPMXSA: 32 + LVPMXSB: 32 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsBlockSizePerPadMXSA: 0 + LdsBlockSizePerPadMXSB: 0 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 113408 + LdsInitCVgprs: false + LdsNumBytes: 113408 + LdsNumElementsAlignedA: 32768 + LdsNumElementsAlignedB: 12288 + LdsNumElementsAlignedMXSA: 2048 + LdsNumElementsAlignedMXSB: 768 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 35584 + LdsOffsetB_Blk: 101120 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMXSA: 32768 + LdsOffsetMXSA_Blk: 98304 + LdsOffsetMXSB: 34816 + LdsOffsetMXSB_Blk: 100352 + LdsOffsetMetadata: 35584 + LdsOffsetMetadata_Blk: 101120 + LdsPadA: 0 + LdsPadB: 0 + LdsPadMXSA: 0 + LdsPadMXSB: 0 + LdsPadMetadata: 0 + LocalReadVectorWidth: -1 + LocalReadVectorWidthA: 32 + LocalReadVectorWidthB: 32 + LocalReadVectorWidthMXSA: 1 + LocalReadVectorWidthMXSB: 1 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LocalWriteUseSgprMXSA: true + LocalWriteUseSgprMXSB: true + LoopIters: 2 + LoopUnroll: 256 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + 
MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMXSA: 1 + MIInputPerThreadMXSB: 1 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [4, 1] + MIWaveTile: [4, 6] + MIWaveTileA: 4 + MIWaveTileB: 6 + MIWaveTileMXSA: 4 + MIWaveTileMXSB: 6 + MIWaveTileMetadata: 0 + MacroTile0: 256 + MacroTile1: 96 + MacroTileA: 256 + MacroTileB: 96 + MacroTileMXSA: 256 + MacroTileMXSB: 96 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + MinGRIncPerMfma: 3 + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonDTLTailLoopMXSA: true + NonDTLTailLoopMXSB: true + NonTemporal: -1 + NonTemporalA: 4 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 4 + NonTemporalE: 0 + NonTemporalMXSA: 0 + NonTemporalMXSB: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 96 + NumGlobalWriteVectorsPerThread: 96 + NumLdsBlk: 2 + NumLoadsA: 8 + NumLoadsB: 3 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsCoalescedMXSA: 1 + NumLoadsCoalescedMXSB: 1 + NumLoadsMXSA: 2 + NumLoadsMXSB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 3 + NumLoadsPerpendicularMXSA: 2 + NumLoadsPerpendicularMXSB: 1 + NumThreads: 256 + NumTotalPackedLoadsA: -1 + NumTotalPackedLoadsB: -1 + NumTotalPackedLoadsMXSA: -1 + NumTotalPackedLoadsMXSB: -1 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGROverBarrier: 0 + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 37 + SolutionNameMin: 
Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT256x96x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT4_6_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM16_WGMXCC2_WGMXCCGn1 + SourceSwap: false + SpaceFillingAlgo: [] + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 4 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SubGroupMXSA: 16 + SubGroupMXSB: 16 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: 0 + TailloopInNll: false + ThreadTile: [1, 1] + ThreadTile0: 16 + ThreadTile1: 6 + ThreadTileA: 16 + ThreadTileB: 6 + ThreadTileMXSA: 16 + ThreadTileMXSB: 6 + TransposeLDS: 1 + TransposeLDSMetadata: -1 + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMXSA: true + UnrollMajorLDSMXSB: true + Use64bShadowLimit: false + Use64bShadowLimitMX: false + UseCustomMainLoopSchedule: 0 + UseDirect32XEmulation: false + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: false + UseGeneralizedNLCOneA: false + UseGeneralizedNLCOneB: false + UseGeneralizedNLCOneMXSA: false + UseGeneralizedNLCOneMXSB: false + UseGeneralizedNLCOneMetadata: false + UseInstOffsetForGRO: 0 + UseMFMAF32XEmulation: false + UsePLRPack: 0 + UseSgprForGRO: -1 + UseSubtileImpl: true + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 1 
+ VectorWidthMXSA: 1 + VectorWidthMXSB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMXSA: 0 + WaveSeparateGlobalReadMXSB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [64, 4, 1] + WorkGroupMapping: 16 + WorkGroupMappingXCC: 2 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 256 + _DepthUA: 256 + _DepthUB: 256 + _DepthUMXSA: 8 + _DepthUMXSB: 8 + _DepthUMetadata: 256 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + enableLDSTrMXSA: false + enableLDSTrMXSB: false + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + tailLoopOptMXSA: false + tailLoopOptMXSB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AdaptiveGemm: 0 + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1DivByMT1LowbitGT1: 0 + AssertFree1ElementMultiple: 1 + AssertKRingShiftTailWrapOnly: 0 + AssertSummationElementMultiple: 256 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BAddrInterleave: false + BaseName: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT252kruXsNK48kysY3byfZ8nkfbVQgYyySEfltsnGOOr7I= + BufferLoad: true + BufferStore: 1 + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 256 + DirectToLds: 1 + DirectToLdsA: true + DirectToLdsB: true + DirectToLdsMXSA: true + DirectToLdsMXSB: true + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprMXSA: false + DirectToVgprMXSB: false + DirectToVgprSparseMetadata: false + DtlPlusLdsBuf: 
0 + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: false + ExpertSchedulingMode: 0 + ExtraLatencyForLR: 0 + ExtraMiLatencyLeft: -1 + ForceDisableShadowInit: false + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 32 + GlobalReadVectorWidthB: 32 + GlobalReadVectorWidthMXSA: 4 + GlobalReadVectorWidthMXSB: 5 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + KRingShift: false + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT256x160x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT4_10_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1 + LDSTrInst: false + LSCA: 256 + LSCB: 256 + LSCMXSA: 8 + LSCMXSB: 8 + LSPA: 32 + LSPB: 32 + LSPMXSA: 128 + LSPMXSB: 160 + LVCA: 8 + LVCB: 8 + LVCMXSA: 2 + LVCMXSB: 2 + LVPA: 1 + LVPB: 1 + LVPMXSA: 32 + LVPMXSB: 32 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsBlockSizePerPadMXSA: 0 + LdsBlockSizePerPadMXSB: 0 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 122112 + LdsInitCVgprs: false + LdsNumBytes: 122112 + LdsNumElementsAlignedA: 
32768 + LdsNumElementsAlignedB: 20480 + LdsNumElementsAlignedMXSA: 2048 + LdsNumElementsAlignedMXSB: 1280 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 36096 + LdsOffsetB_Blk: 101632 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMXSA: 32768 + LdsOffsetMXSA_Blk: 98304 + LdsOffsetMXSB: 34816 + LdsOffsetMXSB_Blk: 100352 + LdsOffsetMetadata: 36096 + LdsOffsetMetadata_Blk: 101632 + LdsPadA: 0 + LdsPadB: 0 + LdsPadMXSA: 0 + LdsPadMXSB: 0 + LdsPadMetadata: 0 + LocalReadVectorWidth: -1 + LocalReadVectorWidthA: 32 + LocalReadVectorWidthB: 32 + LocalReadVectorWidthMXSA: 1 + LocalReadVectorWidthMXSB: 1 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LocalWriteUseSgprMXSA: true + LocalWriteUseSgprMXSB: true + LoopIters: 2 + LoopUnroll: 256 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMXSA: 1 + MIInputPerThreadMXSB: 1 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [4, 1] + MIWaveTile: [4, 10] + MIWaveTileA: 4 + MIWaveTileB: 10 + MIWaveTileMXSA: 4 + MIWaveTileMXSB: 10 + MIWaveTileMetadata: 0 + MacroTile0: 256 + MacroTile1: 160 + MacroTileA: 256 + MacroTileB: 160 + MacroTileMXSA: 256 + MacroTileMXSB: 160 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + MinGRIncPerMfma: 3 + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonDTLTailLoopMXSA: true + NonDTLTailLoopMXSB: true + NonTemporal: -1 + NonTemporalA: 4 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 4 + NonTemporalE: 0 + 
NonTemporalMXSA: 0 + NonTemporalMXSB: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 160 + NumGlobalWriteVectorsPerThread: 160 + NumLdsBlk: 2 + NumLoadsA: 8 + NumLoadsB: 5 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsCoalescedMXSA: 1 + NumLoadsCoalescedMXSB: 1 + NumLoadsMXSA: 2 + NumLoadsMXSB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 5 + NumLoadsPerpendicularMXSA: 2 + NumLoadsPerpendicularMXSB: 1 + NumThreads: 256 + NumTotalPackedLoadsA: -1 + NumTotalPackedLoadsB: -1 + NumTotalPackedLoadsMXSA: -1 + NumTotalPackedLoadsMXSB: -1 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGROverBarrier: 0 + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 38 + SolutionNameMin: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT256x160x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT4_10_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM16_WGMXCC2_WGMXCCGn1 + SourceSwap: false + SpaceFillingAlgo: [] + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 4 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + 
SubGroupMXSA: 16 + SubGroupMXSB: 16 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: 0 + TailloopInNll: false + ThreadTile: [1, 1] + ThreadTile0: 16 + ThreadTile1: 10 + ThreadTileA: 16 + ThreadTileB: 10 + ThreadTileMXSA: 16 + ThreadTileMXSB: 10 + TransposeLDS: 1 + TransposeLDSMetadata: -1 + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMXSA: true + UnrollMajorLDSMXSB: true + Use64bShadowLimit: false + Use64bShadowLimitMX: false + UseCustomMainLoopSchedule: 0 + UseDirect32XEmulation: false + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: false + UseGeneralizedNLCOneA: false + UseGeneralizedNLCOneB: false + UseGeneralizedNLCOneMXSA: false + UseGeneralizedNLCOneMXSB: false + UseGeneralizedNLCOneMetadata: false + UseInstOffsetForGRO: 0 + UseMFMAF32XEmulation: false + UsePLRPack: 0 + UseSgprForGRO: -1 + UseSubtileImpl: true + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 1 + VectorWidthMXSA: 1 + VectorWidthMXSB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMXSA: 0 + WaveSeparateGlobalReadMXSB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [64, 4, 1] + WorkGroupMapping: 16 + WorkGroupMappingXCC: 2 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 256 + _DepthUA: 256 + _DepthUB: 256 + _DepthUMXSA: 8 + _DepthUMXSB: 8 + _DepthUMetadata: 256 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + enableLDSTrMXSA: false + enableLDSTrMXSB: false + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + tailLoopOptMXSA: false + tailLoopOptMXSB: false + - 1LDSBuffer: 0 + 
ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AdaptiveGemm: 0 + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1DivByMT1LowbitGT1: 0 + AssertFree1ElementMultiple: 1 + AssertKRingShiftTailWrapOnly: 0 + AssertSummationElementMultiple: 256 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BAddrInterleave: false + BaseName: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT25K1mXD-Z64cKGl1n64cEhRIvEdarIIYY6qzM6iXFNutU= + BufferLoad: true + BufferStore: 1 + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 256 + DirectToLds: 1 + DirectToLdsA: true + DirectToLdsB: true + DirectToLdsMXSA: true + DirectToLdsMXSB: true + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprMXSA: false + DirectToVgprMXSB: false + DirectToVgprSparseMetadata: false + DtlPlusLdsBuf: 0 + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: false + ExpertSchedulingMode: 0 + ExtraLatencyForLR: 0 + ExtraMiLatencyLeft: -1 + ForceDisableShadowInit: false + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 32 + GlobalReadVectorWidthB: 32 + GlobalReadVectorWidthMXSA: 4 + GlobalReadVectorWidthMXSB: 9 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + KRingShift: false + Kernel: true + KernelLanguage: Assembly + KernelNameMin: 
Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT256x288x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT4_18_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1 + LDSTrInst: false + LSCA: 256 + LSCB: 256 + LSCMXSA: 8 + LSCMXSB: 8 + LSPA: 32 + LSPB: 32 + LSPMXSA: 128 + LSPMXSB: 288 + LVCA: 8 + LVCB: 8 + LVCMXSA: 2 + LVCMXSB: 1 + LVPA: 1 + LVPB: 1 + LVPMXSA: 32 + LVPMXSB: 32 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsBlockSizePerPadMXSA: 0 + LdsBlockSizePerPadMXSB: 0 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 147968 + LdsInitCVgprs: false + LdsNumBytes: 147968 + LdsNumElementsAlignedA: 32768 + LdsNumElementsAlignedB: 36864 + LdsNumElementsAlignedMXSA: 2048 + LdsNumElementsAlignedMXSB: 2304 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 73984 + LdsOffsetB: 37120 + LdsOffsetB_Blk: 111104 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMXSA: 32768 + LdsOffsetMXSA_Blk: 106752 + LdsOffsetMXSB: 34816 + LdsOffsetMXSB_Blk: 108800 + LdsOffsetMetadata: 37120 + LdsOffsetMetadata_Blk: 111104 + LdsPadA: 0 + LdsPadB: 0 + LdsPadMXSA: 0 + LdsPadMXSB: 0 + LdsPadMetadata: 0 + LocalReadVectorWidth: -1 + LocalReadVectorWidthA: 32 + LocalReadVectorWidthB: 32 + LocalReadVectorWidthMXSA: 1 + LocalReadVectorWidthMXSB: 1 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LocalWriteUseSgprMXSA: true + LocalWriteUseSgprMXSB: true + LoopIters: 2 + LoopUnroll: 256 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + 
MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMXSA: 1 + MIInputPerThreadMXSB: 1 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [4, 1] + MIWaveTile: [4, 18] + MIWaveTileA: 4 + MIWaveTileB: 18 + MIWaveTileMXSA: 4 + MIWaveTileMXSB: 18 + MIWaveTileMetadata: 0 + MacroTile0: 256 + MacroTile1: 288 + MacroTileA: 256 + MacroTileB: 288 + MacroTileMXSA: 256 + MacroTileMXSB: 288 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + MinGRIncPerMfma: 3 + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonDTLTailLoopMXSA: true + NonDTLTailLoopMXSB: true + NonTemporal: -1 + NonTemporalA: 4 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 4 + NonTemporalE: 0 + NonTemporalMXSA: 0 + NonTemporalMXSB: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 288 + NumGlobalWriteVectorsPerThread: 288 + NumLdsBlk: 2 + NumLoadsA: 8 + NumLoadsB: 9 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsCoalescedMXSA: 1 + NumLoadsCoalescedMXSB: 1 + NumLoadsMXSA: 2 + NumLoadsMXSB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 9 + NumLoadsPerpendicularMXSA: 2 + NumLoadsPerpendicularMXSB: 1 + NumThreads: 256 + NumTotalPackedLoadsA: -1 + NumTotalPackedLoadsB: -1 + NumTotalPackedLoadsMXSA: -1 + NumTotalPackedLoadsMXSB: -1 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGROverBarrier: 0 + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 39 + 
SolutionNameMin: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT256x288x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT4_18_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM16_WGMXCC2_WGMXCCGn1 + SourceSwap: false + SpaceFillingAlgo: [] + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: true + StoreSyncOpt: 0 + StoreVectorWidth: 4 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SubGroupMXSA: 16 + SubGroupMXSB: 16 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: 0 + TailloopInNll: false + ThreadTile: [1, 1] + ThreadTile0: 16 + ThreadTile1: 18 + ThreadTileA: 16 + ThreadTileB: 18 + ThreadTileMXSA: 16 + ThreadTileMXSB: 18 + TransposeLDS: 1 + TransposeLDSMetadata: -1 + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMXSA: true + UnrollMajorLDSMXSB: true + Use64bShadowLimit: false + Use64bShadowLimitMX: false + UseCustomMainLoopSchedule: 0 + UseDirect32XEmulation: false + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: false + UseGeneralizedNLCOneA: false + UseGeneralizedNLCOneB: false + UseGeneralizedNLCOneMXSA: false + UseGeneralizedNLCOneMXSB: false + UseGeneralizedNLCOneMetadata: false + UseInstOffsetForGRO: 0 + UseMFMAF32XEmulation: false + UsePLRPack: 0 + UseSgprForGRO: -1 + UseSubtileImpl: true + Valid: true + VectorStore: -1 + 
VectorWidthA: 1 + VectorWidthB: 1 + VectorWidthMXSA: 1 + VectorWidthMXSB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMXSA: 0 + WaveSeparateGlobalReadMXSB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [64, 4, 1] + WorkGroupMapping: 16 + WorkGroupMappingXCC: 2 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 256 + _DepthUA: 256 + _DepthUB: 256 + _DepthUMXSA: 8 + _DepthUMXSB: 8 + _DepthUMetadata: 256 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + enableLDSTrMXSA: false + enableLDSTrMXSB: false + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + tailLoopOptMXSA: false + tailLoopOptMXSB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AdaptiveGemm: 0 + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1DivByMT1LowbitGT1: 0 + AssertFree1ElementMultiple: 1 + AssertKRingShiftTailWrapOnly: 0 + AssertSummationElementMultiple: 256 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BAddrInterleave: false + BaseName: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT382wXw8HnHr0540PBKyCMkiG1GnHC8Qq6QTA2UY90u0Us= + BufferLoad: true + BufferStore: 1 + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 256 + DirectToLds: 1 + DirectToLdsA: true + DirectToLdsB: true + DirectToLdsMXSA: true + DirectToLdsMXSB: true + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprMXSA: false + DirectToVgprMXSB: false + 
DirectToVgprSparseMetadata: false + DtlPlusLdsBuf: 0 + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: false + ExpertSchedulingMode: 0 + ExtraLatencyForLR: 0 + ExtraMiLatencyLeft: -1 + ForceDisableShadowInit: false + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 32 + GlobalReadVectorWidthB: 32 + GlobalReadVectorWidthMXSA: 12 + GlobalReadVectorWidthMXSB: 1 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + KRingShift: false + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT384x32x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT6_2_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1 + LDSTrInst: false + LSCA: 256 + LSCB: 256 + LSCMXSA: 8 + LSCMXSB: 8 + LSPA: 32 + LSPB: 32 + LSPMXSA: 384 + LSPMXSB: 32 + LVCA: 8 + LVCB: 8 + LVCMXSA: 1 + LVCMXSB: 8 + LVPA: 1 + LVPB: 1 + LVPMXSA: 32 + LVPMXSB: 32 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsBlockSizePerPadMXSA: 0 + LdsBlockSizePerPadMXSB: 0 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 122112 + LdsInitCVgprs: false + 
LdsNumBytes: 122112 + LdsNumElementsAlignedA: 49152 + LdsNumElementsAlignedB: 4096 + LdsNumElementsAlignedMXSA: 3072 + LdsNumElementsAlignedMXSB: 256 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 52480 + LdsOffsetB_Blk: 118016 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMXSA: 49152 + LdsOffsetMXSA_Blk: 114688 + LdsOffsetMXSB: 52224 + LdsOffsetMXSB_Blk: 117760 + LdsOffsetMetadata: 52480 + LdsOffsetMetadata_Blk: 118016 + LdsPadA: 0 + LdsPadB: 0 + LdsPadMXSA: 0 + LdsPadMXSB: 0 + LdsPadMetadata: 0 + LocalReadVectorWidth: -1 + LocalReadVectorWidthA: 32 + LocalReadVectorWidthB: 32 + LocalReadVectorWidthMXSA: 1 + LocalReadVectorWidthMXSB: 1 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LocalWriteUseSgprMXSA: true + LocalWriteUseSgprMXSB: true + LoopIters: 2 + LoopUnroll: 256 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMXSA: 1 + MIInputPerThreadMXSB: 1 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [4, 1] + MIWaveTile: [6, 2] + MIWaveTileA: 6 + MIWaveTileB: 2 + MIWaveTileMXSA: 6 + MIWaveTileMXSB: 2 + MIWaveTileMetadata: 0 + MacroTile0: 384 + MacroTile1: 32 + MacroTileA: 384 + MacroTileB: 32 + MacroTileMXSA: 384 + MacroTileMXSB: 32 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + MinGRIncPerMfma: 3 + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonDTLTailLoopMXSA: true + NonDTLTailLoopMXSB: true + NonTemporal: -1 + NonTemporalA: 4 + NonTemporalB: 0 + NonTemporalC: 0 + 
NonTemporalD: 4 + NonTemporalE: 0 + NonTemporalMXSA: 0 + NonTemporalMXSB: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 48 + NumGlobalWriteVectorsPerThread: 48 + NumLdsBlk: 2 + NumLoadsA: 12 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsCoalescedMXSA: 1 + NumLoadsCoalescedMXSB: 1 + NumLoadsMXSA: 1 + NumLoadsMXSB: 1 + NumLoadsPerpendicularA: 12 + NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularMXSA: 1 + NumLoadsPerpendicularMXSB: 1 + NumThreads: 256 + NumTotalPackedLoadsA: -1 + NumTotalPackedLoadsB: -1 + NumTotalPackedLoadsMXSA: -1 + NumTotalPackedLoadsMXSB: -1 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGROverBarrier: 0 + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 40 + SolutionNameMin: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT384x32x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT6_2_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM16_WGMXCC2_WGMXCCGn1 + SourceSwap: false + SpaceFillingAlgo: [] + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 4 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 16 + SubGroup1: 16 + 
SubGroupA: 16 + SubGroupB: 16 + SubGroupMXSA: 16 + SubGroupMXSB: 16 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: 0 + TailloopInNll: false + ThreadTile: [1, 1] + ThreadTile0: 24 + ThreadTile1: 2 + ThreadTileA: 24 + ThreadTileB: 2 + ThreadTileMXSA: 24 + ThreadTileMXSB: 2 + TransposeLDS: 1 + TransposeLDSMetadata: -1 + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMXSA: true + UnrollMajorLDSMXSB: true + Use64bShadowLimit: false + Use64bShadowLimitMX: false + UseCustomMainLoopSchedule: 0 + UseDirect32XEmulation: false + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: false + UseGeneralizedNLCOneA: false + UseGeneralizedNLCOneB: false + UseGeneralizedNLCOneMXSA: false + UseGeneralizedNLCOneMXSB: false + UseGeneralizedNLCOneMetadata: false + UseInstOffsetForGRO: 0 + UseMFMAF32XEmulation: false + UsePLRPack: 0 + UseSgprForGRO: -1 + UseSubtileImpl: true + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 1 + VectorWidthMXSA: 1 + VectorWidthMXSB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMXSA: 0 + WaveSeparateGlobalReadMXSB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [64, 4, 1] + WorkGroupMapping: 16 + WorkGroupMappingXCC: 2 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 256 + _DepthUA: 256 + _DepthUB: 256 + _DepthUMXSA: 8 + _DepthUMXSB: 8 + _DepthUMetadata: 256 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + enableLDSTrMXSA: false + enableLDSTrMXSB: false + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + tailLoopOptMXSA: false + tailLoopOptMXSB: false 
+ - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AdaptiveGemm: 0 + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1DivByMT1LowbitGT1: 0 + AssertFree1ElementMultiple: 1 + AssertKRingShiftTailWrapOnly: 0 + AssertSummationElementMultiple: 256 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BAddrInterleave: false + BaseName: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT38iWtaR8v4gfASTnshP-7Zt37fJ-3bIKngSBlzEMd0qgE= + BufferLoad: true + BufferStore: 1 + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 256 + DirectToLds: 1 + DirectToLdsA: true + DirectToLdsB: true + DirectToLdsMXSA: true + DirectToLdsMXSB: true + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprMXSA: false + DirectToVgprMXSB: false + DirectToVgprSparseMetadata: false + DtlPlusLdsBuf: 0 + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: false + ExpertSchedulingMode: 0 + ExtraLatencyForLR: 0 + ExtraMiLatencyLeft: -1 + ForceDisableShadowInit: false + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 32 + GlobalReadVectorWidthB: 32 + GlobalReadVectorWidthMXSA: 12 + GlobalReadVectorWidthMXSB: 3 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + KRingShift: false + Kernel: true + KernelLanguage: Assembly + KernelNameMin: 
Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT384x96x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT6_6_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1 + LDSTrInst: false + LSCA: 256 + LSCB: 256 + LSCMXSA: 8 + LSCMXSB: 8 + LSPA: 32 + LSPB: 32 + LSPMXSA: 384 + LSPMXSB: 96 + LVCA: 8 + LVCB: 8 + LVCMXSA: 1 + LVCMXSB: 3 + LVPA: 1 + LVPB: 1 + LVPMXSA: 32 + LVPMXSB: 32 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsBlockSizePerPadMXSA: 0 + LdsBlockSizePerPadMXSB: 0 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 130816 + LdsInitCVgprs: false + LdsNumBytes: 130816 + LdsNumElementsAlignedA: 49152 + LdsNumElementsAlignedB: 12288 + LdsNumElementsAlignedMXSA: 3072 + LdsNumElementsAlignedMXSB: 768 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 52992 + LdsOffsetB_Blk: 118528 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMXSA: 49152 + LdsOffsetMXSA_Blk: 114688 + LdsOffsetMXSB: 52224 + LdsOffsetMXSB_Blk: 117760 + LdsOffsetMetadata: 52992 + LdsOffsetMetadata_Blk: 118528 + LdsPadA: 0 + LdsPadB: 0 + LdsPadMXSA: 0 + LdsPadMXSB: 0 + LdsPadMetadata: 0 + LocalReadVectorWidth: -1 + LocalReadVectorWidthA: 32 + LocalReadVectorWidthB: 32 + LocalReadVectorWidthMXSA: 1 + LocalReadVectorWidthMXSB: 1 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LocalWriteUseSgprMXSA: true + LocalWriteUseSgprMXSB: true + LoopIters: 2 + LoopUnroll: 256 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + 
MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMXSA: 1 + MIInputPerThreadMXSB: 1 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [4, 1] + MIWaveTile: [6, 6] + MIWaveTileA: 6 + MIWaveTileB: 6 + MIWaveTileMXSA: 6 + MIWaveTileMXSB: 6 + MIWaveTileMetadata: 0 + MacroTile0: 384 + MacroTile1: 96 + MacroTileA: 384 + MacroTileB: 96 + MacroTileMXSA: 384 + MacroTileMXSB: 96 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + MinGRIncPerMfma: 3 + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonDTLTailLoopMXSA: true + NonDTLTailLoopMXSB: true + NonTemporal: -1 + NonTemporalA: 4 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 4 + NonTemporalE: 0 + NonTemporalMXSA: 0 + NonTemporalMXSB: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 144 + NumGlobalWriteVectorsPerThread: 144 + NumLdsBlk: 2 + NumLoadsA: 12 + NumLoadsB: 3 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsCoalescedMXSA: 1 + NumLoadsCoalescedMXSB: 1 + NumLoadsMXSA: 1 + NumLoadsMXSB: 1 + NumLoadsPerpendicularA: 12 + NumLoadsPerpendicularB: 3 + NumLoadsPerpendicularMXSA: 1 + NumLoadsPerpendicularMXSB: 1 + NumThreads: 256 + NumTotalPackedLoadsA: -1 + NumTotalPackedLoadsB: -1 + NumTotalPackedLoadsMXSA: -1 + NumTotalPackedLoadsMXSB: -1 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGROverBarrier: 0 + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 41 + 
SolutionNameMin: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT384x96x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT6_6_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM16_WGMXCC2_WGMXCCGn1 + SourceSwap: false + SpaceFillingAlgo: [] + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 4 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SubGroupMXSA: 16 + SubGroupMXSB: 16 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: 0 + TailloopInNll: false + ThreadTile: [1, 1] + ThreadTile0: 24 + ThreadTile1: 6 + ThreadTileA: 24 + ThreadTileB: 6 + ThreadTileMXSA: 24 + ThreadTileMXSB: 6 + TransposeLDS: 1 + TransposeLDSMetadata: -1 + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMXSA: true + UnrollMajorLDSMXSB: true + Use64bShadowLimit: false + Use64bShadowLimitMX: false + UseCustomMainLoopSchedule: 0 + UseDirect32XEmulation: false + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: false + UseGeneralizedNLCOneA: false + UseGeneralizedNLCOneB: false + UseGeneralizedNLCOneMXSA: false + UseGeneralizedNLCOneMXSB: false + UseGeneralizedNLCOneMetadata: false + UseInstOffsetForGRO: 0 + UseMFMAF32XEmulation: false + UsePLRPack: 0 + UseSgprForGRO: -1 + UseSubtileImpl: true + Valid: true + VectorStore: -1 + VectorWidthA: 1 
+ VectorWidthB: 1 + VectorWidthMXSA: 1 + VectorWidthMXSB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMXSA: 0 + WaveSeparateGlobalReadMXSB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [64, 4, 1] + WorkGroupMapping: 16 + WorkGroupMappingXCC: 2 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 256 + _DepthUA: 256 + _DepthUB: 256 + _DepthUMXSA: 8 + _DepthUMXSB: 8 + _DepthUMetadata: 256 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + enableLDSTrMXSA: false + enableLDSTrMXSB: false + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + tailLoopOptMXSA: false + tailLoopOptMXSB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AdaptiveGemm: 0 + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1DivByMT1LowbitGT1: 0 + AssertFree1ElementMultiple: 1 + AssertKRingShiftTailWrapOnly: 0 + AssertSummationElementMultiple: 256 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BAddrInterleave: false + BaseName: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT51My4B9dUWEKuVinUMAqyTn-9nVKbOgw4LZNuajpvbgno= + BufferLoad: true + BufferStore: 1 + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 256 + DirectToLds: 1 + DirectToLdsA: true + DirectToLdsB: true + DirectToLdsMXSA: true + DirectToLdsMXSB: true + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprMXSA: false + DirectToVgprMXSB: false + DirectToVgprSparseMetadata: false 
+ DtlPlusLdsBuf: 0 + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: false + ExpertSchedulingMode: 0 + ExtraLatencyForLR: 0 + ExtraMiLatencyLeft: -1 + ForceDisableShadowInit: false + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 32 + GlobalReadVectorWidthB: 32 + GlobalReadVectorWidthMXSA: 16 + GlobalReadVectorWidthMXSB: 1 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + KRingShift: false + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT512x32x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT8_2_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1 + LDSTrInst: false + LSCA: 256 + LSCB: 256 + LSCMXSA: 8 + LSCMXSB: 8 + LSPA: 32 + LSPB: 32 + LSPMXSA: 512 + LSPMXSB: 32 + LVCA: 8 + LVCB: 8 + LVCMXSA: 1 + LVCMXSB: 8 + LVPA: 1 + LVPB: 1 + LVPMXSA: 32 + LVPMXSB: 32 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsBlockSizePerPadMXSA: 0 + LdsBlockSizePerPadMXSB: 0 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 147968 + LdsInitCVgprs: false + LdsNumBytes: 147968 + 
LdsNumElementsAlignedA: 65536 + LdsNumElementsAlignedB: 4096 + LdsNumElementsAlignedMXSA: 4096 + LdsNumElementsAlignedMXSB: 256 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 73984 + LdsOffsetB: 69888 + LdsOffsetB_Blk: 143872 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMXSA: 65536 + LdsOffsetMXSA_Blk: 139520 + LdsOffsetMXSB: 69632 + LdsOffsetMXSB_Blk: 143616 + LdsOffsetMetadata: 69888 + LdsOffsetMetadata_Blk: 143872 + LdsPadA: 0 + LdsPadB: 0 + LdsPadMXSA: 0 + LdsPadMXSB: 0 + LdsPadMetadata: 0 + LocalReadVectorWidth: -1 + LocalReadVectorWidthA: 32 + LocalReadVectorWidthB: 32 + LocalReadVectorWidthMXSA: 1 + LocalReadVectorWidthMXSB: 1 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LocalWriteUseSgprMXSA: true + LocalWriteUseSgprMXSB: true + LoopIters: 2 + LoopUnroll: 256 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMXSA: 1 + MIInputPerThreadMXSB: 1 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [4, 1] + MIWaveTile: [8, 2] + MIWaveTileA: 8 + MIWaveTileB: 2 + MIWaveTileMXSA: 8 + MIWaveTileMXSB: 2 + MIWaveTileMetadata: 0 + MacroTile0: 512 + MacroTile1: 32 + MacroTileA: 512 + MacroTileB: 32 + MacroTileMXSA: 512 + MacroTileMXSB: 32 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + MinGRIncPerMfma: 3 + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonDTLTailLoopMXSA: true + NonDTLTailLoopMXSB: true + NonTemporal: -1 + NonTemporalA: 4 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 4 + NonTemporalE: 0 
+ NonTemporalMXSA: 0 + NonTemporalMXSB: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 64 + NumLdsBlk: 2 + NumLoadsA: 16 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsCoalescedMXSA: 1 + NumLoadsCoalescedMXSB: 1 + NumLoadsMXSA: 1 + NumLoadsMXSB: 1 + NumLoadsPerpendicularA: 16 + NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularMXSA: 1 + NumLoadsPerpendicularMXSB: 1 + NumThreads: 256 + NumTotalPackedLoadsA: -1 + NumTotalPackedLoadsB: -1 + NumTotalPackedLoadsMXSA: -1 + NumTotalPackedLoadsMXSB: -1 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGROverBarrier: 0 + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 42 + SolutionNameMin: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT512x32x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT8_2_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM16_WGMXCC2_WGMXCCGn1 + SourceSwap: false + SpaceFillingAlgo: [] + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: true + StoreSyncOpt: 0 + StoreVectorWidth: 4 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + 
SubGroupMXSA: 16 + SubGroupMXSB: 16 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: 0 + TailloopInNll: false + ThreadTile: [1, 1] + ThreadTile0: 32 + ThreadTile1: 2 + ThreadTileA: 32 + ThreadTileB: 2 + ThreadTileMXSA: 32 + ThreadTileMXSB: 2 + TransposeLDS: 1 + TransposeLDSMetadata: -1 + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMXSA: true + UnrollMajorLDSMXSB: true + Use64bShadowLimit: false + Use64bShadowLimitMX: false + UseCustomMainLoopSchedule: 0 + UseDirect32XEmulation: false + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: false + UseGeneralizedNLCOneA: false + UseGeneralizedNLCOneB: false + UseGeneralizedNLCOneMXSA: false + UseGeneralizedNLCOneMXSB: false + UseGeneralizedNLCOneMetadata: false + UseInstOffsetForGRO: 0 + UseMFMAF32XEmulation: false + UsePLRPack: 0 + UseSgprForGRO: -1 + UseSubtileImpl: true + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 1 + VectorWidthMXSA: 1 + VectorWidthMXSB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMXSA: 0 + WaveSeparateGlobalReadMXSB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [64, 4, 1] + WorkGroupMapping: 16 + WorkGroupMappingXCC: 2 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 256 + _DepthUA: 256 + _DepthUB: 256 + _DepthUMXSA: 8 + _DepthUMXSB: 8 + _DepthUMetadata: 256 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + enableLDSTrMXSA: false + enableLDSTrMXSB: false + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + tailLoopOptMXSA: false + tailLoopOptMXSB: false + - 1LDSBuffer: 0 + 
ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AdaptiveGemm: 0 + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1DivByMT1LowbitGT1: 0 + AssertFree1ElementMultiple: 1 + AssertKRingShiftTailWrapOnly: 0 + AssertSummationElementMultiple: 256 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BAddrInterleave: false + BaseName: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT32dv3nH_ug3zWfzXFJEJHJUvkpjXQjAQVml90QMZvwYvs= + BufferLoad: true + BufferStore: 1 + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 256 + DirectToLds: 1 + DirectToLdsA: true + DirectToLdsB: true + DirectToLdsMXSA: true + DirectToLdsMXSB: true + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprMXSA: false + DirectToVgprMXSB: false + DirectToVgprSparseMetadata: false + DtlPlusLdsBuf: 0 + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: false + ExpertSchedulingMode: 0 + ExtraLatencyForLR: 0 + ExtraMiLatencyLeft: -1 + ForceDisableShadowInit: false + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 32 + GlobalReadVectorWidthB: 32 + GlobalReadVectorWidthMXSA: 1 + GlobalReadVectorWidthMXSB: 4 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + KRingShift: false + Kernel: true + KernelLanguage: Assembly + KernelNameMin: 
Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT32x128x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT2_2_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_16_1 + LDSTrInst: false + LSCA: 256 + LSCB: 256 + LSCMXSA: 8 + LSCMXSB: 8 + LSPA: 32 + LSPB: 32 + LSPMXSA: 32 + LSPMXSB: 128 + LVCA: 8 + LVCB: 8 + LVCMXSA: 8 + LVCMXSB: 2 + LVPA: 1 + LVPB: 1 + LVPMXSA: 32 + LVPMXSB: 32 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsBlockSizePerPadMXSA: 0 + LdsBlockSizePerPadMXSB: 0 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 54528 + LdsInitCVgprs: false + LdsNumBytes: 54528 + LdsNumElementsAlignedA: 4096 + LdsNumElementsAlignedB: 16384 + LdsNumElementsAlignedMXSA: 256 + LdsNumElementsAlignedMXSB: 1024 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 32768 + LdsOffsetB: 5376 + LdsOffsetB_Blk: 38144 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMXSA: 4096 + LdsOffsetMXSA_Blk: 36864 + LdsOffsetMXSB: 4352 + LdsOffsetMXSB_Blk: 37120 + LdsOffsetMetadata: 5376 + LdsOffsetMetadata_Blk: 38144 + LdsPadA: 0 + LdsPadB: 0 + LdsPadMXSA: 0 + LdsPadMXSB: 0 + LdsPadMetadata: 0 + LocalReadVectorWidth: -1 + LocalReadVectorWidthA: 32 + LocalReadVectorWidthB: 32 + LocalReadVectorWidthMXSA: 1 + LocalReadVectorWidthMXSB: 1 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LocalWriteUseSgprMXSA: true + LocalWriteUseSgprMXSB: true + LoopIters: 2 + LoopUnroll: 256 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 
32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMXSA: 1 + MIInputPerThreadMXSB: 1 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [1, 4] + MIWaveTile: [2, 2] + MIWaveTileA: 2 + MIWaveTileB: 2 + MIWaveTileMXSA: 2 + MIWaveTileMXSB: 2 + MIWaveTileMetadata: 0 + MacroTile0: 32 + MacroTile1: 128 + MacroTileA: 32 + MacroTileB: 128 + MacroTileMXSA: 32 + MacroTileMXSB: 128 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + MinGRIncPerMfma: 3 + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonDTLTailLoopMXSA: true + NonDTLTailLoopMXSB: true + NonTemporal: -1 + NonTemporalA: 4 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 4 + NonTemporalE: 0 + NonTemporalMXSA: 0 + NonTemporalMXSB: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 16 + NumLdsBlk: 2 + NumLoadsA: 1 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsCoalescedMXSA: 1 + NumLoadsCoalescedMXSB: 1 + NumLoadsMXSA: 1 + NumLoadsMXSB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 4 + NumLoadsPerpendicularMXSA: 1 + NumLoadsPerpendicularMXSB: 1 + NumThreads: 256 + NumTotalPackedLoadsA: -1 + NumTotalPackedLoadsB: -1 + NumTotalPackedLoadsMXSA: -1 + NumTotalPackedLoadsMXSB: -1 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGROverBarrier: 0 + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 43 + SolutionNameMin: 
Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT32x128x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT2_2_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM16_WGMXCC2_WGMXCCGn1 + SourceSwap: false + SpaceFillingAlgo: [] + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 4 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 4 + SubGroup1: 64 + SubGroupA: 4 + SubGroupB: 64 + SubGroupMXSA: 4 + SubGroupMXSB: 64 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: 0 + TailloopInNll: false + ThreadTile: [1, 1] + ThreadTile0: 8 + ThreadTile1: 2 + ThreadTileA: 8 + ThreadTileB: 2 + ThreadTileMXSA: 8 + ThreadTileMXSB: 2 + TransposeLDS: 1 + TransposeLDSMetadata: -1 + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMXSA: true + UnrollMajorLDSMXSB: true + Use64bShadowLimit: false + Use64bShadowLimitMX: false + UseCustomMainLoopSchedule: 0 + UseDirect32XEmulation: false + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: false + UseGeneralizedNLCOneA: false + UseGeneralizedNLCOneB: false + UseGeneralizedNLCOneMXSA: false + UseGeneralizedNLCOneMXSB: false + UseGeneralizedNLCOneMetadata: false + UseInstOffsetForGRO: 0 + UseMFMAF32XEmulation: false + UsePLRPack: 0 + UseSgprForGRO: -1 + UseSubtileImpl: true + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 1 + 
VectorWidthMXSA: 1 + VectorWidthMXSB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMXSA: 0 + WaveSeparateGlobalReadMXSB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 16 + WorkGroupMappingXCC: 2 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 256 + _DepthUA: 256 + _DepthUB: 256 + _DepthUMXSA: 8 + _DepthUMXSB: 8 + _DepthUMetadata: 256 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + enableLDSTrMXSA: false + enableLDSTrMXSB: false + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + tailLoopOptMXSA: false + tailLoopOptMXSB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AdaptiveGemm: 0 + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1DivByMT1LowbitGT1: 0 + AssertFree1ElementMultiple: 1 + AssertKRingShiftTailWrapOnly: 0 + AssertSummationElementMultiple: 256 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BAddrInterleave: false + BaseName: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT96VdJ139l5ExTu5vrbSyr9YIDnqGsFY6rZMfGBgYGwyok= + BufferLoad: true + BufferStore: 1 + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 256 + DirectToLds: 1 + DirectToLdsA: true + DirectToLdsB: true + DirectToLdsMXSA: true + DirectToLdsMXSB: true + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprMXSA: false + DirectToVgprMXSB: false + DirectToVgprSparseMetadata: false + DtlPlusLdsBuf: 0 
+ EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: false + ExpertSchedulingMode: 0 + ExtraLatencyForLR: 0 + ExtraMiLatencyLeft: -1 + ForceDisableShadowInit: false + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 32 + GlobalReadVectorWidthB: 32 + GlobalReadVectorWidthMXSA: 3 + GlobalReadVectorWidthMXSB: 4 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + KRingShift: false + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT96x128x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT6_2_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_16_1 + LDSTrInst: false + LSCA: 256 + LSCB: 256 + LSCMXSA: 8 + LSCMXSB: 8 + LSPA: 32 + LSPB: 32 + LSPMXSA: 96 + LSPMXSB: 128 + LVCA: 8 + LVCB: 8 + LVCMXSA: 3 + LVCMXSB: 2 + LVPA: 1 + LVPB: 1 + LVPMXSA: 32 + LVPMXSB: 32 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsBlockSizePerPadMXSA: 0 + LdsBlockSizePerPadMXSB: 0 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 63232 + LdsInitCVgprs: false + LdsNumBytes: 63232 + LdsNumElementsAlignedA: 12288 + 
LdsNumElementsAlignedB: 16384 + LdsNumElementsAlignedMXSA: 768 + LdsNumElementsAlignedMXSB: 1024 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 32768 + LdsOffsetB: 14080 + LdsOffsetB_Blk: 46848 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMXSA: 12288 + LdsOffsetMXSA_Blk: 45056 + LdsOffsetMXSB: 13056 + LdsOffsetMXSB_Blk: 45824 + LdsOffsetMetadata: 14080 + LdsOffsetMetadata_Blk: 46848 + LdsPadA: 0 + LdsPadB: 0 + LdsPadMXSA: 0 + LdsPadMXSB: 0 + LdsPadMetadata: 0 + LocalReadVectorWidth: -1 + LocalReadVectorWidthA: 32 + LocalReadVectorWidthB: 32 + LocalReadVectorWidthMXSA: 1 + LocalReadVectorWidthMXSB: 1 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LocalWriteUseSgprMXSA: true + LocalWriteUseSgprMXSB: true + LoopIters: 2 + LoopUnroll: 256 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMXSA: 1 + MIInputPerThreadMXSB: 1 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [1, 4] + MIWaveTile: [6, 2] + MIWaveTileA: 6 + MIWaveTileB: 2 + MIWaveTileMXSA: 6 + MIWaveTileMXSB: 2 + MIWaveTileMetadata: 0 + MacroTile0: 96 + MacroTile1: 128 + MacroTileA: 96 + MacroTileB: 128 + MacroTileMXSA: 96 + MacroTileMXSB: 128 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + MinGRIncPerMfma: 3 + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonDTLTailLoopMXSA: true + NonDTLTailLoopMXSB: true + NonTemporal: -1 + NonTemporalA: 4 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 4 + NonTemporalE: 0 + NonTemporalMXSA: 0 + 
NonTemporalMXSB: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 48 + NumGlobalWriteVectorsPerThread: 48 + NumLdsBlk: 2 + NumLoadsA: 3 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsCoalescedMXSA: 1 + NumLoadsCoalescedMXSB: 1 + NumLoadsMXSA: 1 + NumLoadsMXSB: 1 + NumLoadsPerpendicularA: 3 + NumLoadsPerpendicularB: 4 + NumLoadsPerpendicularMXSA: 1 + NumLoadsPerpendicularMXSB: 1 + NumThreads: 256 + NumTotalPackedLoadsA: -1 + NumTotalPackedLoadsB: -1 + NumTotalPackedLoadsMXSA: -1 + NumTotalPackedLoadsMXSB: -1 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGROverBarrier: 0 + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 44 + SolutionNameMin: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT96x128x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT6_2_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM16_WGMXCC2_WGMXCCGn1 + SourceSwap: false + SpaceFillingAlgo: [] + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 4 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 4 + SubGroup1: 64 + SubGroupA: 4 + SubGroupB: 64 + SubGroupMXSA: 4 + SubGroupMXSB: 64 + 
SuppressNoLoadLoop: false + SwapGlobalReadOrder: 0 + TailloopInNll: false + ThreadTile: [1, 1] + ThreadTile0: 24 + ThreadTile1: 2 + ThreadTileA: 24 + ThreadTileB: 2 + ThreadTileMXSA: 24 + ThreadTileMXSB: 2 + TransposeLDS: 1 + TransposeLDSMetadata: -1 + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMXSA: true + UnrollMajorLDSMXSB: true + Use64bShadowLimit: false + Use64bShadowLimitMX: false + UseCustomMainLoopSchedule: 0 + UseDirect32XEmulation: false + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: false + UseGeneralizedNLCOneA: false + UseGeneralizedNLCOneB: false + UseGeneralizedNLCOneMXSA: false + UseGeneralizedNLCOneMXSB: false + UseGeneralizedNLCOneMetadata: false + UseInstOffsetForGRO: 0 + UseMFMAF32XEmulation: false + UsePLRPack: 0 + UseSgprForGRO: -1 + UseSubtileImpl: true + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 1 + VectorWidthMXSA: 1 + VectorWidthMXSB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMXSA: 0 + WaveSeparateGlobalReadMXSB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 16 + WorkGroupMappingXCC: 2 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 256 + _DepthUA: 256 + _DepthUB: 256 + _DepthUMXSA: 8 + _DepthUMXSB: 8 + _DepthUMetadata: 256 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + enableLDSTrMXSA: false + enableLDSTrMXSB: false + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + tailLoopOptMXSA: false + tailLoopOptMXSB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: false 
+ ActivationFused: true + AdaptiveGemm: 0 + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1DivByMT1LowbitGT1: 0 + AssertFree1ElementMultiple: 1 + AssertKRingShiftTailWrapOnly: 0 + AssertSummationElementMultiple: 256 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BAddrInterleave: false + BaseName: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT32pDTSO9Ei8iprYUZVwZ8MVK8XTu8S8ugrlkQJj7C1FJQ= + BufferLoad: true + BufferStore: 1 + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 256 + DirectToLds: 1 + DirectToLdsA: true + DirectToLdsB: true + DirectToLdsMXSA: true + DirectToLdsMXSB: true + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprMXSA: false + DirectToVgprMXSB: false + DirectToVgprSparseMetadata: false + DtlPlusLdsBuf: 0 + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: false + ExpertSchedulingMode: 0 + ExtraLatencyForLR: 0 + ExtraMiLatencyLeft: -1 + ForceDisableShadowInit: false + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 32 + GlobalReadVectorWidthB: 32 + GlobalReadVectorWidthMXSA: 1 + GlobalReadVectorWidthMXSB: 4 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + KRingShift: false + Kernel: true + KernelLanguage: Assembly + KernelNameMin: 
Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT32x256x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT2_4_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_16_1 + LDSTrInst: false + LSCA: 256 + LSCB: 256 + LSCMXSA: 8 + LSCMXSB: 8 + LSPA: 32 + LSPB: 32 + LSPMXSA: 32 + LSPMXSB: 128 + LVCA: 8 + LVCB: 8 + LVCMXSA: 8 + LVCMXSB: 2 + LVPA: 1 + LVPB: 1 + LVPMXSA: 32 + LVPMXSB: 32 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsBlockSizePerPadMXSA: 0 + LdsBlockSizePerPadMXSB: 0 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 104704 + LdsInitCVgprs: false + LdsNumBytes: 104704 + LdsNumElementsAlignedA: 4096 + LdsNumElementsAlignedB: 32768 + LdsNumElementsAlignedMXSA: 256 + LdsNumElementsAlignedMXSB: 2048 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 6400 + LdsOffsetB_Blk: 71936 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMXSA: 4096 + LdsOffsetMXSA_Blk: 69632 + LdsOffsetMXSB: 4352 + LdsOffsetMXSB_Blk: 69888 + LdsOffsetMetadata: 6400 + LdsOffsetMetadata_Blk: 71936 + LdsPadA: 0 + LdsPadB: 0 + LdsPadMXSA: 0 + LdsPadMXSB: 0 + LdsPadMetadata: 0 + LocalReadVectorWidth: -1 + LocalReadVectorWidthA: 32 + LocalReadVectorWidthB: 32 + LocalReadVectorWidthMXSA: 1 + LocalReadVectorWidthMXSB: 1 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LocalWriteUseSgprMXSA: true + LocalWriteUseSgprMXSB: true + LoopIters: 2 + LoopUnroll: 256 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + 
MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMXSA: 1 + MIInputPerThreadMXSB: 1 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [1, 4] + MIWaveTile: [2, 4] + MIWaveTileA: 2 + MIWaveTileB: 4 + MIWaveTileMXSA: 2 + MIWaveTileMXSB: 4 + MIWaveTileMetadata: 0 + MacroTile0: 32 + MacroTile1: 256 + MacroTileA: 32 + MacroTileB: 256 + MacroTileMXSA: 32 + MacroTileMXSB: 256 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + MinGRIncPerMfma: 3 + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonDTLTailLoopMXSA: true + NonDTLTailLoopMXSB: true + NonTemporal: -1 + NonTemporalA: 4 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 4 + NonTemporalE: 0 + NonTemporalMXSA: 0 + NonTemporalMXSB: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 32 + NumLdsBlk: 2 + NumLoadsA: 1 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsCoalescedMXSA: 1 + NumLoadsCoalescedMXSB: 1 + NumLoadsMXSA: 1 + NumLoadsMXSB: 2 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 8 + NumLoadsPerpendicularMXSA: 1 + NumLoadsPerpendicularMXSB: 2 + NumThreads: 256 + NumTotalPackedLoadsA: -1 + NumTotalPackedLoadsB: -1 + NumTotalPackedLoadsMXSA: -1 + NumTotalPackedLoadsMXSB: -1 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGROverBarrier: 0 + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 45 + SolutionNameMin: 
Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT32x256x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT2_4_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM16_WGMXCC2_WGMXCCGn1 + SourceSwap: false + SpaceFillingAlgo: [] + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 4 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 4 + SubGroup1: 64 + SubGroupA: 4 + SubGroupB: 64 + SubGroupMXSA: 4 + SubGroupMXSB: 64 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: 0 + TailloopInNll: false + ThreadTile: [1, 1] + ThreadTile0: 8 + ThreadTile1: 4 + ThreadTileA: 8 + ThreadTileB: 4 + ThreadTileMXSA: 8 + ThreadTileMXSB: 4 + TransposeLDS: 1 + TransposeLDSMetadata: -1 + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMXSA: true + UnrollMajorLDSMXSB: true + Use64bShadowLimit: false + Use64bShadowLimitMX: false + UseCustomMainLoopSchedule: 0 + UseDirect32XEmulation: false + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: false + UseGeneralizedNLCOneA: false + UseGeneralizedNLCOneB: false + UseGeneralizedNLCOneMXSA: false + UseGeneralizedNLCOneMXSB: false + UseGeneralizedNLCOneMetadata: false + UseInstOffsetForGRO: 0 + UseMFMAF32XEmulation: false + UsePLRPack: 0 + UseSgprForGRO: -1 + UseSubtileImpl: true + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 1 + 
VectorWidthMXSA: 1 + VectorWidthMXSB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMXSA: 0 + WaveSeparateGlobalReadMXSB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 16 + WorkGroupMappingXCC: 2 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 256 + _DepthUA: 256 + _DepthUB: 256 + _DepthUMXSA: 8 + _DepthUMXSB: 8 + _DepthUMetadata: 256 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + enableLDSTrMXSA: false + enableLDSTrMXSB: false + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + tailLoopOptMXSA: false + tailLoopOptMXSB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AdaptiveGemm: 0 + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1DivByMT1LowbitGT1: 0 + AssertFree1ElementMultiple: 1 + AssertKRingShiftTailWrapOnly: 0 + AssertSummationElementMultiple: 256 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BAddrInterleave: false + BaseName: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT96Cj2Z80w1PkWDv8Q4NtOmErGoRYnlYoT1xjNw40Qn6y8= + BufferLoad: true + BufferStore: 1 + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 256 + DirectToLds: 1 + DirectToLdsA: true + DirectToLdsB: true + DirectToLdsMXSA: true + DirectToLdsMXSB: true + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprMXSA: false + DirectToVgprMXSB: false + DirectToVgprSparseMetadata: false + DtlPlusLdsBuf: 0 
+ EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: false + ExpertSchedulingMode: 0 + ExtraLatencyForLR: 0 + ExtraMiLatencyLeft: -1 + ForceDisableShadowInit: false + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 32 + GlobalReadVectorWidthB: 32 + GlobalReadVectorWidthMXSA: 3 + GlobalReadVectorWidthMXSB: 4 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + KRingShift: false + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT96x256x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT6_4_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_16_1 + LDSTrInst: false + LSCA: 256 + LSCB: 256 + LSCMXSA: 8 + LSCMXSB: 8 + LSPA: 32 + LSPB: 32 + LSPMXSA: 96 + LSPMXSB: 128 + LVCA: 8 + LVCB: 8 + LVCMXSA: 3 + LVCMXSB: 2 + LVPA: 1 + LVPB: 1 + LVPMXSA: 32 + LVPMXSB: 32 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsBlockSizePerPadMXSA: 0 + LdsBlockSizePerPadMXSB: 0 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 113408 + LdsInitCVgprs: false + LdsNumBytes: 113408 + LdsNumElementsAlignedA: 12288 + 
LdsNumElementsAlignedB: 32768 + LdsNumElementsAlignedMXSA: 768 + LdsNumElementsAlignedMXSB: 2048 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 15104 + LdsOffsetB_Blk: 80640 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMXSA: 12288 + LdsOffsetMXSA_Blk: 77824 + LdsOffsetMXSB: 13056 + LdsOffsetMXSB_Blk: 78592 + LdsOffsetMetadata: 15104 + LdsOffsetMetadata_Blk: 80640 + LdsPadA: 0 + LdsPadB: 0 + LdsPadMXSA: 0 + LdsPadMXSB: 0 + LdsPadMetadata: 0 + LocalReadVectorWidth: -1 + LocalReadVectorWidthA: 32 + LocalReadVectorWidthB: 32 + LocalReadVectorWidthMXSA: 1 + LocalReadVectorWidthMXSB: 1 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LocalWriteUseSgprMXSA: true + LocalWriteUseSgprMXSB: true + LoopIters: 2 + LoopUnroll: 256 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMXSA: 1 + MIInputPerThreadMXSB: 1 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [1, 4] + MIWaveTile: [6, 4] + MIWaveTileA: 6 + MIWaveTileB: 4 + MIWaveTileMXSA: 6 + MIWaveTileMXSB: 4 + MIWaveTileMetadata: 0 + MacroTile0: 96 + MacroTile1: 256 + MacroTileA: 96 + MacroTileB: 256 + MacroTileMXSA: 96 + MacroTileMXSB: 256 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + MinGRIncPerMfma: 3 + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonDTLTailLoopMXSA: true + NonDTLTailLoopMXSB: true + NonTemporal: -1 + NonTemporalA: 4 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 4 + NonTemporalE: 0 + NonTemporalMXSA: 0 + 
NonTemporalMXSB: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 96 + NumGlobalWriteVectorsPerThread: 96 + NumLdsBlk: 2 + NumLoadsA: 3 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsCoalescedMXSA: 1 + NumLoadsCoalescedMXSB: 1 + NumLoadsMXSA: 1 + NumLoadsMXSB: 2 + NumLoadsPerpendicularA: 3 + NumLoadsPerpendicularB: 8 + NumLoadsPerpendicularMXSA: 1 + NumLoadsPerpendicularMXSB: 2 + NumThreads: 256 + NumTotalPackedLoadsA: -1 + NumTotalPackedLoadsB: -1 + NumTotalPackedLoadsMXSA: -1 + NumTotalPackedLoadsMXSB: -1 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGROverBarrier: 0 + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 46 + SolutionNameMin: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT96x256x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT6_4_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM16_WGMXCC2_WGMXCCGn1 + SourceSwap: false + SpaceFillingAlgo: [] + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 4 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 4 + SubGroup1: 64 + SubGroupA: 4 + SubGroupB: 64 + SubGroupMXSA: 4 + SubGroupMXSB: 64 + 
SuppressNoLoadLoop: false + SwapGlobalReadOrder: 0 + TailloopInNll: false + ThreadTile: [1, 1] + ThreadTile0: 24 + ThreadTile1: 4 + ThreadTileA: 24 + ThreadTileB: 4 + ThreadTileMXSA: 24 + ThreadTileMXSB: 4 + TransposeLDS: 1 + TransposeLDSMetadata: -1 + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMXSA: true + UnrollMajorLDSMXSB: true + Use64bShadowLimit: false + Use64bShadowLimitMX: false + UseCustomMainLoopSchedule: 0 + UseDirect32XEmulation: false + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: false + UseGeneralizedNLCOneA: false + UseGeneralizedNLCOneB: false + UseGeneralizedNLCOneMXSA: false + UseGeneralizedNLCOneMXSB: false + UseGeneralizedNLCOneMetadata: false + UseInstOffsetForGRO: 0 + UseMFMAF32XEmulation: false + UsePLRPack: 0 + UseSgprForGRO: -1 + UseSubtileImpl: true + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 1 + VectorWidthMXSA: 1 + VectorWidthMXSB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMXSA: 0 + WaveSeparateGlobalReadMXSB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 16 + WorkGroupMappingXCC: 2 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 256 + _DepthUA: 256 + _DepthUB: 256 + _DepthUMXSA: 8 + _DepthUMXSB: 8 + _DepthUMetadata: 256 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + enableLDSTrMXSA: false + enableLDSTrMXSB: false + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + tailLoopOptMXSA: false + tailLoopOptMXSB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: false 
+ ActivationFused: true + AdaptiveGemm: 0 + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1DivByMT1LowbitGT1: 0 + AssertFree1ElementMultiple: 1 + AssertKRingShiftTailWrapOnly: 0 + AssertSummationElementMultiple: 256 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BAddrInterleave: false + BaseName: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT16rkdIx0xJ9AdmxSLTSuPL_HwvXURN3d9JBsT0i2N2r4A= + BufferLoad: true + BufferStore: 1 + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 256 + DirectToLds: 1 + DirectToLdsA: true + DirectToLdsB: true + DirectToLdsMXSA: true + DirectToLdsMXSB: true + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprMXSA: false + DirectToVgprMXSB: false + DirectToVgprSparseMetadata: false + DtlPlusLdsBuf: 0 + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: false + ExpertSchedulingMode: 0 + ExtraLatencyForLR: 0 + ExtraMiLatencyLeft: -1 + ForceDisableShadowInit: false + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 32 + GlobalReadVectorWidthB: 32 + GlobalReadVectorWidthMXSA: 5 + GlobalReadVectorWidthMXSB: 4 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + KRingShift: false + Kernel: true + KernelLanguage: Assembly + KernelNameMin: 
Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT160x256x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT10_4_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_16_1 + LDSTrInst: false + LSCA: 256 + LSCB: 256 + LSCMXSA: 8 + LSCMXSB: 8 + LSPA: 32 + LSPB: 32 + LSPMXSA: 160 + LSPMXSB: 128 + LVCA: 8 + LVCB: 8 + LVCMXSA: 2 + LVCMXSB: 2 + LVPA: 1 + LVPB: 1 + LVPMXSA: 32 + LVPMXSB: 32 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsBlockSizePerPadMXSA: 0 + LdsBlockSizePerPadMXSB: 0 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 122112 + LdsInitCVgprs: false + LdsNumBytes: 122112 + LdsNumElementsAlignedA: 20480 + LdsNumElementsAlignedB: 32768 + LdsNumElementsAlignedMXSA: 1280 + LdsNumElementsAlignedMXSB: 2048 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 23808 + LdsOffsetB_Blk: 89344 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMXSA: 20480 + LdsOffsetMXSA_Blk: 86016 + LdsOffsetMXSB: 21760 + LdsOffsetMXSB_Blk: 87296 + LdsOffsetMetadata: 23808 + LdsOffsetMetadata_Blk: 89344 + LdsPadA: 0 + LdsPadB: 0 + LdsPadMXSA: 0 + LdsPadMXSB: 0 + LdsPadMetadata: 0 + LocalReadVectorWidth: -1 + LocalReadVectorWidthA: 32 + LocalReadVectorWidthB: 32 + LocalReadVectorWidthMXSA: 1 + LocalReadVectorWidthMXSB: 1 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LocalWriteUseSgprMXSA: true + LocalWriteUseSgprMXSB: true + LoopIters: 2 + LoopUnroll: 256 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + 
MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMXSA: 1 + MIInputPerThreadMXSB: 1 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [1, 4] + MIWaveTile: [10, 4] + MIWaveTileA: 10 + MIWaveTileB: 4 + MIWaveTileMXSA: 10 + MIWaveTileMXSB: 4 + MIWaveTileMetadata: 0 + MacroTile0: 160 + MacroTile1: 256 + MacroTileA: 160 + MacroTileB: 256 + MacroTileMXSA: 160 + MacroTileMXSB: 256 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + MinGRIncPerMfma: 3 + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonDTLTailLoopMXSA: true + NonDTLTailLoopMXSB: true + NonTemporal: -1 + NonTemporalA: 4 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 4 + NonTemporalE: 0 + NonTemporalMXSA: 0 + NonTemporalMXSB: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 160 + NumGlobalWriteVectorsPerThread: 160 + NumLdsBlk: 2 + NumLoadsA: 5 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsCoalescedMXSA: 1 + NumLoadsCoalescedMXSB: 1 + NumLoadsMXSA: 1 + NumLoadsMXSB: 2 + NumLoadsPerpendicularA: 5 + NumLoadsPerpendicularB: 8 + NumLoadsPerpendicularMXSA: 1 + NumLoadsPerpendicularMXSB: 2 + NumThreads: 256 + NumTotalPackedLoadsA: -1 + NumTotalPackedLoadsB: -1 + NumTotalPackedLoadsMXSA: -1 + NumTotalPackedLoadsMXSB: -1 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGROverBarrier: 0 + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 47 + 
SolutionNameMin: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT160x256x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT10_4_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM16_WGMXCC2_WGMXCCGn1 + SourceSwap: false + SpaceFillingAlgo: [] + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 4 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 4 + SubGroup1: 64 + SubGroupA: 4 + SubGroupB: 64 + SubGroupMXSA: 4 + SubGroupMXSB: 64 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: 0 + TailloopInNll: false + ThreadTile: [1, 1] + ThreadTile0: 40 + ThreadTile1: 4 + ThreadTileA: 40 + ThreadTileB: 4 + ThreadTileMXSA: 40 + ThreadTileMXSB: 4 + TransposeLDS: 1 + TransposeLDSMetadata: -1 + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMXSA: true + UnrollMajorLDSMXSB: true + Use64bShadowLimit: false + Use64bShadowLimitMX: false + UseCustomMainLoopSchedule: 0 + UseDirect32XEmulation: false + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: false + UseGeneralizedNLCOneA: false + UseGeneralizedNLCOneB: false + UseGeneralizedNLCOneMXSA: false + UseGeneralizedNLCOneMXSB: false + UseGeneralizedNLCOneMetadata: false + UseInstOffsetForGRO: 0 + UseMFMAF32XEmulation: false + UsePLRPack: 0 + UseSgprForGRO: -1 + UseSubtileImpl: true + Valid: true + VectorStore: -1 + VectorWidthA: 1 
+ VectorWidthB: 1 + VectorWidthMXSA: 1 + VectorWidthMXSB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMXSA: 0 + WaveSeparateGlobalReadMXSB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 16 + WorkGroupMappingXCC: 2 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 256 + _DepthUA: 256 + _DepthUB: 256 + _DepthUMXSA: 8 + _DepthUMXSB: 8 + _DepthUMetadata: 256 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + enableLDSTrMXSA: false + enableLDSTrMXSB: false + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + tailLoopOptMXSA: false + tailLoopOptMXSB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AdaptiveGemm: 0 + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1DivByMT1LowbitGT1: 0 + AssertFree1ElementMultiple: 1 + AssertKRingShiftTailWrapOnly: 0 + AssertSummationElementMultiple: 256 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BAddrInterleave: false + BaseName: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT28HX9AX04H5gx2Hnt9CFkwE4Io5BNr8Him7sxPkQ6FxSI= + BufferLoad: true + BufferStore: 1 + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 256 + DirectToLds: 1 + DirectToLdsA: true + DirectToLdsB: true + DirectToLdsMXSA: true + DirectToLdsMXSB: true + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprMXSA: false + DirectToVgprMXSB: false + DirectToVgprSparseMetadata: 
false + DtlPlusLdsBuf: 0 + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: false + ExpertSchedulingMode: 0 + ExtraLatencyForLR: 0 + ExtraMiLatencyLeft: -1 + ForceDisableShadowInit: false + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 32 + GlobalReadVectorWidthB: 32 + GlobalReadVectorWidthMXSA: 9 + GlobalReadVectorWidthMXSB: 4 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + KRingShift: false + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT288x256x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT18_4_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_16_1 + LDSTrInst: false + LSCA: 256 + LSCB: 256 + LSCMXSA: 8 + LSCMXSB: 8 + LSPA: 32 + LSPB: 32 + LSPMXSA: 288 + LSPMXSB: 128 + LVCA: 8 + LVCB: 8 + LVCMXSA: 1 + LVCMXSB: 2 + LVPA: 1 + LVPB: 1 + LVPMXSA: 32 + LVPMXSB: 32 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsBlockSizePerPadMXSA: 0 + LdsBlockSizePerPadMXSB: 0 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 147968 + LdsInitCVgprs: false + LdsNumBytes: 147968 + 
LdsNumElementsAlignedA: 36864 + LdsNumElementsAlignedB: 32768 + LdsNumElementsAlignedMXSA: 2304 + LdsNumElementsAlignedMXSB: 2048 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 73984 + LdsOffsetB: 41216 + LdsOffsetB_Blk: 115200 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMXSA: 36864 + LdsOffsetMXSA_Blk: 110848 + LdsOffsetMXSB: 39168 + LdsOffsetMXSB_Blk: 113152 + LdsOffsetMetadata: 41216 + LdsOffsetMetadata_Blk: 115200 + LdsPadA: 0 + LdsPadB: 0 + LdsPadMXSA: 0 + LdsPadMXSB: 0 + LdsPadMetadata: 0 + LocalReadVectorWidth: -1 + LocalReadVectorWidthA: 32 + LocalReadVectorWidthB: 32 + LocalReadVectorWidthMXSA: 1 + LocalReadVectorWidthMXSB: 1 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LocalWriteUseSgprMXSA: true + LocalWriteUseSgprMXSB: true + LoopIters: 2 + LoopUnroll: 256 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMXSA: 1 + MIInputPerThreadMXSB: 1 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [1, 4] + MIWaveTile: [18, 4] + MIWaveTileA: 18 + MIWaveTileB: 4 + MIWaveTileMXSA: 18 + MIWaveTileMXSB: 4 + MIWaveTileMetadata: 0 + MacroTile0: 288 + MacroTile1: 256 + MacroTileA: 288 + MacroTileB: 256 + MacroTileMXSA: 288 + MacroTileMXSB: 256 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + MinGRIncPerMfma: 3 + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonDTLTailLoopMXSA: true + NonDTLTailLoopMXSB: true + NonTemporal: -1 + NonTemporalA: 4 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 4 + 
NonTemporalE: 0 + NonTemporalMXSA: 0 + NonTemporalMXSB: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 288 + NumGlobalWriteVectorsPerThread: 288 + NumLdsBlk: 2 + NumLoadsA: 9 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsCoalescedMXSA: 1 + NumLoadsCoalescedMXSB: 1 + NumLoadsMXSA: 1 + NumLoadsMXSB: 2 + NumLoadsPerpendicularA: 9 + NumLoadsPerpendicularB: 8 + NumLoadsPerpendicularMXSA: 1 + NumLoadsPerpendicularMXSB: 2 + NumThreads: 256 + NumTotalPackedLoadsA: -1 + NumTotalPackedLoadsB: -1 + NumTotalPackedLoadsMXSA: -1 + NumTotalPackedLoadsMXSB: -1 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGROverBarrier: 0 + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 48 + SolutionNameMin: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT288x256x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT18_4_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM16_WGMXCC2_WGMXCCGn1 + SourceSwap: false + SpaceFillingAlgo: [] + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: true + StoreSyncOpt: 0 + StoreVectorWidth: 4 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 4 + SubGroup1: 64 + SubGroupA: 4 + SubGroupB: 
64 + SubGroupMXSA: 4 + SubGroupMXSB: 64 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: 0 + TailloopInNll: false + ThreadTile: [1, 1] + ThreadTile0: 72 + ThreadTile1: 4 + ThreadTileA: 72 + ThreadTileB: 4 + ThreadTileMXSA: 72 + ThreadTileMXSB: 4 + TransposeLDS: 1 + TransposeLDSMetadata: -1 + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMXSA: true + UnrollMajorLDSMXSB: true + Use64bShadowLimit: false + Use64bShadowLimitMX: false + UseCustomMainLoopSchedule: 0 + UseDirect32XEmulation: false + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: false + UseGeneralizedNLCOneA: false + UseGeneralizedNLCOneB: false + UseGeneralizedNLCOneMXSA: false + UseGeneralizedNLCOneMXSB: false + UseGeneralizedNLCOneMetadata: false + UseInstOffsetForGRO: 0 + UseMFMAF32XEmulation: false + UsePLRPack: 0 + UseSgprForGRO: -1 + UseSubtileImpl: true + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 1 + VectorWidthMXSA: 1 + VectorWidthMXSB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMXSA: 0 + WaveSeparateGlobalReadMXSB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 16 + WorkGroupMappingXCC: 2 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 256 + _DepthUA: 256 + _DepthUB: 256 + _DepthUMXSA: 8 + _DepthUMXSB: 8 + _DepthUMetadata: 256 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + enableLDSTrMXSA: false + enableLDSTrMXSB: false + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + tailLoopOptMXSA: false + tailLoopOptMXSB: false + - 1LDSBuffer: 0 + 
ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AdaptiveGemm: 0 + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1DivByMT1LowbitGT1: 0 + AssertFree1ElementMultiple: 1 + AssertKRingShiftTailWrapOnly: 0 + AssertSummationElementMultiple: 256 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BAddrInterleave: false + BaseName: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT323cjBEjqTj-Mu9ZF1RsqJ73WqkJOeA3kpk9e2rgOyJEQ= + BufferLoad: true + BufferStore: 1 + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 256 + DirectToLds: 1 + DirectToLdsA: true + DirectToLdsB: true + DirectToLdsMXSA: true + DirectToLdsMXSB: true + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprMXSA: false + DirectToVgprMXSB: false + DirectToVgprSparseMetadata: false + DtlPlusLdsBuf: 0 + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: false + ExpertSchedulingMode: 0 + ExtraLatencyForLR: 0 + ExtraMiLatencyLeft: -1 + ForceDisableShadowInit: false + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 32 + GlobalReadVectorWidthB: 32 + GlobalReadVectorWidthMXSA: 1 + GlobalReadVectorWidthMXSB: 12 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + KRingShift: false + Kernel: true + KernelLanguage: Assembly + KernelNameMin: 
Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT32x384x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT2_6_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_16_1 + LDSTrInst: false + LSCA: 256 + LSCB: 256 + LSCMXSA: 8 + LSCMXSB: 8 + LSPA: 32 + LSPB: 32 + LSPMXSA: 32 + LSPMXSB: 384 + LVCA: 8 + LVCB: 8 + LVCMXSA: 8 + LVCMXSB: 1 + LVPA: 1 + LVPB: 1 + LVPMXSA: 32 + LVPMXSB: 32 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsBlockSizePerPadMXSA: 0 + LdsBlockSizePerPadMXSB: 0 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 122112 + LdsInitCVgprs: false + LdsNumBytes: 122112 + LdsNumElementsAlignedA: 4096 + LdsNumElementsAlignedB: 49152 + LdsNumElementsAlignedMXSA: 256 + LdsNumElementsAlignedMXSB: 3072 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 7424 + LdsOffsetB_Blk: 72960 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMXSA: 4096 + LdsOffsetMXSA_Blk: 69632 + LdsOffsetMXSB: 4352 + LdsOffsetMXSB_Blk: 69888 + LdsOffsetMetadata: 7424 + LdsOffsetMetadata_Blk: 72960 + LdsPadA: 0 + LdsPadB: 0 + LdsPadMXSA: 0 + LdsPadMXSB: 0 + LdsPadMetadata: 0 + LocalReadVectorWidth: -1 + LocalReadVectorWidthA: 32 + LocalReadVectorWidthB: 32 + LocalReadVectorWidthMXSA: 1 + LocalReadVectorWidthMXSB: 1 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LocalWriteUseSgprMXSA: true + LocalWriteUseSgprMXSB: true + LoopIters: 2 + LoopUnroll: 256 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + 
MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMXSA: 1 + MIInputPerThreadMXSB: 1 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [1, 4] + MIWaveTile: [2, 6] + MIWaveTileA: 2 + MIWaveTileB: 6 + MIWaveTileMXSA: 2 + MIWaveTileMXSB: 6 + MIWaveTileMetadata: 0 + MacroTile0: 32 + MacroTile1: 384 + MacroTileA: 32 + MacroTileB: 384 + MacroTileMXSA: 32 + MacroTileMXSB: 384 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + MinGRIncPerMfma: 3 + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonDTLTailLoopMXSA: true + NonDTLTailLoopMXSB: true + NonTemporal: -1 + NonTemporalA: 4 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 4 + NonTemporalE: 0 + NonTemporalMXSA: 0 + NonTemporalMXSB: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 48 + NumGlobalWriteVectorsPerThread: 48 + NumLdsBlk: 2 + NumLoadsA: 1 + NumLoadsB: 12 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsCoalescedMXSA: 1 + NumLoadsCoalescedMXSB: 1 + NumLoadsMXSA: 1 + NumLoadsMXSB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 12 + NumLoadsPerpendicularMXSA: 1 + NumLoadsPerpendicularMXSB: 1 + NumThreads: 256 + NumTotalPackedLoadsA: -1 + NumTotalPackedLoadsB: -1 + NumTotalPackedLoadsMXSA: -1 + NumTotalPackedLoadsMXSB: -1 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGROverBarrier: 0 + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 49 + SolutionNameMin: 
Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT32x384x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT2_6_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM16_WGMXCC2_WGMXCCGn1 + SourceSwap: false + SpaceFillingAlgo: [] + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 4 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 4 + SubGroup1: 64 + SubGroupA: 4 + SubGroupB: 64 + SubGroupMXSA: 4 + SubGroupMXSB: 64 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: 0 + TailloopInNll: false + ThreadTile: [1, 1] + ThreadTile0: 8 + ThreadTile1: 6 + ThreadTileA: 8 + ThreadTileB: 6 + ThreadTileMXSA: 8 + ThreadTileMXSB: 6 + TransposeLDS: 1 + TransposeLDSMetadata: -1 + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMXSA: true + UnrollMajorLDSMXSB: true + Use64bShadowLimit: false + Use64bShadowLimitMX: false + UseCustomMainLoopSchedule: 0 + UseDirect32XEmulation: false + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: false + UseGeneralizedNLCOneA: false + UseGeneralizedNLCOneB: false + UseGeneralizedNLCOneMXSA: false + UseGeneralizedNLCOneMXSB: false + UseGeneralizedNLCOneMetadata: false + UseInstOffsetForGRO: 0 + UseMFMAF32XEmulation: false + UsePLRPack: 0 + UseSgprForGRO: -1 + UseSubtileImpl: true + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 1 + 
VectorWidthMXSA: 1 + VectorWidthMXSB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMXSA: 0 + WaveSeparateGlobalReadMXSB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 16 + WorkGroupMappingXCC: 2 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 256 + _DepthUA: 256 + _DepthUB: 256 + _DepthUMXSA: 8 + _DepthUMXSB: 8 + _DepthUMetadata: 256 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + enableLDSTrMXSA: false + enableLDSTrMXSB: false + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + tailLoopOptMXSA: false + tailLoopOptMXSB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AdaptiveGemm: 0 + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1DivByMT1LowbitGT1: 0 + AssertFree1ElementMultiple: 1 + AssertKRingShiftTailWrapOnly: 0 + AssertSummationElementMultiple: 256 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BAddrInterleave: false + BaseName: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT32k1L8EqrYflL4kvotLP49gssYPXLs5aDdnaskyCeVRw4= + BufferLoad: true + BufferStore: 1 + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 256 + DirectToLds: 1 + DirectToLdsA: true + DirectToLdsB: true + DirectToLdsMXSA: true + DirectToLdsMXSB: true + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprMXSA: false + DirectToVgprMXSB: false + DirectToVgprSparseMetadata: false + DtlPlusLdsBuf: 0 
+ EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: false + ExpertSchedulingMode: 0 + ExtraLatencyForLR: 0 + ExtraMiLatencyLeft: -1 + ForceDisableShadowInit: false + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 32 + GlobalReadVectorWidthB: 32 + GlobalReadVectorWidthMXSA: 1 + GlobalReadVectorWidthMXSB: 16 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + KRingShift: false + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT32x512x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT2_8_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_16_1 + LDSTrInst: false + LSCA: 256 + LSCB: 256 + LSCMXSA: 8 + LSCMXSB: 8 + LSPA: 32 + LSPB: 32 + LSPMXSA: 32 + LSPMXSB: 512 + LVCA: 8 + LVCB: 8 + LVCMXSA: 8 + LVCMXSB: 1 + LVPA: 1 + LVPB: 1 + LVPMXSA: 32 + LVPMXSB: 32 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsBlockSizePerPadMXSA: 0 + LdsBlockSizePerPadMXSB: 0 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 147968 + LdsInitCVgprs: false + LdsNumBytes: 147968 + LdsNumElementsAlignedA: 4096 + 
LdsNumElementsAlignedB: 65536 + LdsNumElementsAlignedMXSA: 256 + LdsNumElementsAlignedMXSB: 4096 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 73984 + LdsOffsetB: 8448 + LdsOffsetB_Blk: 82432 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMXSA: 4096 + LdsOffsetMXSA_Blk: 78080 + LdsOffsetMXSB: 4352 + LdsOffsetMXSB_Blk: 78336 + LdsOffsetMetadata: 8448 + LdsOffsetMetadata_Blk: 82432 + LdsPadA: 0 + LdsPadB: 0 + LdsPadMXSA: 0 + LdsPadMXSB: 0 + LdsPadMetadata: 0 + LocalReadVectorWidth: -1 + LocalReadVectorWidthA: 32 + LocalReadVectorWidthB: 32 + LocalReadVectorWidthMXSA: 1 + LocalReadVectorWidthMXSB: 1 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LocalWriteUseSgprMXSA: true + LocalWriteUseSgprMXSB: true + LoopIters: 2 + LoopUnroll: 256 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMXSA: 1 + MIInputPerThreadMXSB: 1 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [1, 4] + MIWaveTile: [2, 8] + MIWaveTileA: 2 + MIWaveTileB: 8 + MIWaveTileMXSA: 2 + MIWaveTileMXSB: 8 + MIWaveTileMetadata: 0 + MacroTile0: 32 + MacroTile1: 512 + MacroTileA: 32 + MacroTileB: 512 + MacroTileMXSA: 32 + MacroTileMXSB: 512 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + MinGRIncPerMfma: 3 + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonDTLTailLoopMXSA: true + NonDTLTailLoopMXSB: true + NonTemporal: -1 + NonTemporalA: 4 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 4 + NonTemporalE: 0 + NonTemporalMXSA: 0 + 
NonTemporalMXSB: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 64 + NumLdsBlk: 2 + NumLoadsA: 1 + NumLoadsB: 16 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsCoalescedMXSA: 1 + NumLoadsCoalescedMXSB: 1 + NumLoadsMXSA: 1 + NumLoadsMXSB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 16 + NumLoadsPerpendicularMXSA: 1 + NumLoadsPerpendicularMXSB: 1 + NumThreads: 256 + NumTotalPackedLoadsA: -1 + NumTotalPackedLoadsB: -1 + NumTotalPackedLoadsMXSA: -1 + NumTotalPackedLoadsMXSB: -1 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGROverBarrier: 0 + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 50 + SolutionNameMin: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT32x512x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT2_8_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM16_WGMXCC2_WGMXCCGn1 + SourceSwap: false + SpaceFillingAlgo: [] + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: true + StoreSyncOpt: 0 + StoreVectorWidth: 4 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 4 + SubGroup1: 64 + SubGroupA: 4 + SubGroupB: 64 + SubGroupMXSA: 4 + SubGroupMXSB: 64 
+ SuppressNoLoadLoop: false + SwapGlobalReadOrder: 0 + TailloopInNll: false + ThreadTile: [1, 1] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + ThreadTileMXSA: 8 + ThreadTileMXSB: 8 + TransposeLDS: 1 + TransposeLDSMetadata: -1 + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMXSA: true + UnrollMajorLDSMXSB: true + Use64bShadowLimit: false + Use64bShadowLimitMX: false + UseCustomMainLoopSchedule: 0 + UseDirect32XEmulation: false + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: false + UseGeneralizedNLCOneA: false + UseGeneralizedNLCOneB: false + UseGeneralizedNLCOneMXSA: false + UseGeneralizedNLCOneMXSB: false + UseGeneralizedNLCOneMetadata: false + UseInstOffsetForGRO: 0 + UseMFMAF32XEmulation: false + UsePLRPack: 0 + UseSgprForGRO: -1 + UseSubtileImpl: true + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 1 + VectorWidthMXSA: 1 + VectorWidthMXSB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMXSA: 0 + WaveSeparateGlobalReadMXSB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 16 + WorkGroupMappingXCC: 2 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 256 + _DepthUA: 256 + _DepthUB: 256 + _DepthUMXSA: 8 + _DepthUMXSB: 8 + _DepthUMetadata: 256 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + enableLDSTrMXSA: false + enableLDSTrMXSB: false + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + tailLoopOptMXSA: false + tailLoopOptMXSB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: false 
+ ActivationFused: true + AdaptiveGemm: 0 + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1DivByMT1LowbitGT1: 0 + AssertFree1ElementMultiple: 1 + AssertKRingShiftTailWrapOnly: 0 + AssertSummationElementMultiple: 256 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BAddrInterleave: false + BaseName: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT9680-vsketBKOwATFxdutsK5i_NcnvLOonACpwOS25BUY= + BufferLoad: true + BufferStore: 1 + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 256 + DirectToLds: 1 + DirectToLdsA: true + DirectToLdsB: true + DirectToLdsMXSA: true + DirectToLdsMXSB: true + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprMXSA: false + DirectToVgprMXSB: false + DirectToVgprSparseMetadata: false + DtlPlusLdsBuf: 0 + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: false + ExpertSchedulingMode: 0 + ExtraLatencyForLR: 0 + ExtraMiLatencyLeft: -1 + ForceDisableShadowInit: false + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 32 + GlobalReadVectorWidthB: 32 + GlobalReadVectorWidthMXSA: 3 + GlobalReadVectorWidthMXSB: 12 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + KRingShift: false + Kernel: true + KernelLanguage: Assembly + KernelNameMin: 
Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT96x384x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT6_6_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_16_1 + LDSTrInst: false + LSCA: 256 + LSCB: 256 + LSCMXSA: 8 + LSCMXSB: 8 + LSPA: 32 + LSPB: 32 + LSPMXSA: 96 + LSPMXSB: 384 + LVCA: 8 + LVCB: 8 + LVCMXSA: 3 + LVCMXSB: 1 + LVPA: 1 + LVPB: 1 + LVPMXSA: 32 + LVPMXSB: 32 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsBlockSizePerPadMXSA: 0 + LdsBlockSizePerPadMXSB: 0 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 130816 + LdsInitCVgprs: false + LdsNumBytes: 130816 + LdsNumElementsAlignedA: 12288 + LdsNumElementsAlignedB: 49152 + LdsNumElementsAlignedMXSA: 768 + LdsNumElementsAlignedMXSB: 3072 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 16128 + LdsOffsetB_Blk: 81664 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMXSA: 12288 + LdsOffsetMXSA_Blk: 77824 + LdsOffsetMXSB: 13056 + LdsOffsetMXSB_Blk: 78592 + LdsOffsetMetadata: 16128 + LdsOffsetMetadata_Blk: 81664 + LdsPadA: 0 + LdsPadB: 0 + LdsPadMXSA: 0 + LdsPadMXSB: 0 + LdsPadMetadata: 0 + LocalReadVectorWidth: -1 + LocalReadVectorWidthA: 32 + LocalReadVectorWidthB: 32 + LocalReadVectorWidthMXSA: 1 + LocalReadVectorWidthMXSB: 1 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LocalWriteUseSgprMXSA: true + LocalWriteUseSgprMXSB: true + LoopIters: 2 + LoopUnroll: 256 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + 
MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMXSA: 1 + MIInputPerThreadMXSB: 1 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [1, 4] + MIWaveTile: [6, 6] + MIWaveTileA: 6 + MIWaveTileB: 6 + MIWaveTileMXSA: 6 + MIWaveTileMXSB: 6 + MIWaveTileMetadata: 0 + MacroTile0: 96 + MacroTile1: 384 + MacroTileA: 96 + MacroTileB: 384 + MacroTileMXSA: 96 + MacroTileMXSB: 384 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + MinGRIncPerMfma: 3 + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonDTLTailLoopMXSA: true + NonDTLTailLoopMXSB: true + NonTemporal: -1 + NonTemporalA: 4 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 4 + NonTemporalE: 0 + NonTemporalMXSA: 0 + NonTemporalMXSB: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 144 + NumGlobalWriteVectorsPerThread: 144 + NumLdsBlk: 2 + NumLoadsA: 3 + NumLoadsB: 12 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsCoalescedMXSA: 1 + NumLoadsCoalescedMXSB: 1 + NumLoadsMXSA: 1 + NumLoadsMXSB: 1 + NumLoadsPerpendicularA: 3 + NumLoadsPerpendicularB: 12 + NumLoadsPerpendicularMXSA: 1 + NumLoadsPerpendicularMXSB: 1 + NumThreads: 256 + NumTotalPackedLoadsA: -1 + NumTotalPackedLoadsB: -1 + NumTotalPackedLoadsMXSA: -1 + NumTotalPackedLoadsMXSB: -1 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGROverBarrier: 0 + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 51 + 
SolutionNameMin: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT96x384x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT6_6_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM16_WGMXCC2_WGMXCCGn1 + SourceSwap: false + SpaceFillingAlgo: [] + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 4 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 4 + SubGroup1: 64 + SubGroupA: 4 + SubGroupB: 64 + SubGroupMXSA: 4 + SubGroupMXSB: 64 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: 0 + TailloopInNll: false + ThreadTile: [1, 1] + ThreadTile0: 24 + ThreadTile1: 6 + ThreadTileA: 24 + ThreadTileB: 6 + ThreadTileMXSA: 24 + ThreadTileMXSB: 6 + TransposeLDS: 1 + TransposeLDSMetadata: -1 + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMXSA: true + UnrollMajorLDSMXSB: true + Use64bShadowLimit: false + Use64bShadowLimitMX: false + UseCustomMainLoopSchedule: 0 + UseDirect32XEmulation: false + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: false + UseGeneralizedNLCOneA: false + UseGeneralizedNLCOneB: false + UseGeneralizedNLCOneMXSA: false + UseGeneralizedNLCOneMXSB: false + UseGeneralizedNLCOneMetadata: false + UseInstOffsetForGRO: 0 + UseMFMAF32XEmulation: false + UsePLRPack: 0 + UseSgprForGRO: -1 + UseSubtileImpl: true + Valid: true + VectorStore: -1 + VectorWidthA: 1 + 
VectorWidthB: 1 + VectorWidthMXSA: 1 + VectorWidthMXSB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMXSA: 0 + WaveSeparateGlobalReadMXSB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 16 + WorkGroupMappingXCC: 2 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 256 + _DepthUA: 256 + _DepthUB: 256 + _DepthUMXSA: 8 + _DepthUMXSB: 8 + _DepthUMetadata: 256 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + enableLDSTrMXSA: false + enableLDSTrMXSB: false + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + tailLoopOptMXSA: false + tailLoopOptMXSB: false +- null +- null +- null +- null +- DeviceEfficiency +- Prediction diff --git a/projects/hipblaslt/library/src/amd_detail/rocblaslt/src/Tensile/Logic/asm_full/gfx950/gfx950/Origami/Origami_ntb4/gfx950_Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs.yaml b/projects/hipblaslt/library/src/amd_detail/rocblaslt/src/Tensile/Logic/asm_full/gfx950/gfx950/Origami/Origami_ntb4/gfx950_Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs.yaml new file mode 100644 index 00000000000..030ac8ab77e --- /dev/null +++ b/projects/hipblaslt/library/src/amd_detail/rocblaslt/src/Tensile/Logic/asm_full/gfx950/gfx950/Origami/Origami_ntb4/gfx950_Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs.yaml @@ -0,0 +1,16950 @@ +- {MinimumRequiredVersion: 5.0.0} +- gfx950 +- gfx950 +- [Device 75a0] +- Activation: true + ActivationComputeDataType: 0 + ActivationNoGuard: false + ActivationType: hipblaslt_all + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + 
BetaOnlyUseBias: false + BiasDataTypeList: [0, 7] + BiasSrc: D + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 21 + DataTypeA: 21 + DataTypeAmaxD: 0 + DataTypeB: 21 + DataTypeE: 7 + DataTypeMXSA: 22 + DataTypeMXSB: 22 + DestDataType: 7 + F32XdlMathOp: 0 + Gradient: false + GroupedGemm: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index01MXSA: 0 + Index01MXSB: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexAssignmentsMXSA: [3, 0, 2] + IndexAssignmentsMXSB: [3, 1, 2] + IndexAssignmentsMetadata: [3, 0, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndexUnrollM: 0 + IndexUnrollMXSA: 0 + IndexUnrollMXSB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MXBlockA: 32 + MXBlockB: 32 + MacDataTypeA: 21 + MacDataTypeB: 21 + MirrorDimsA: [] + MirrorDimsB: [] + MirrorDimsMXSA: [] + MirrorDimsMXSB: [] + MirrorDimsMetadata: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + OutputAmaxD: false + SetConstStrideA: [] + SetConstStrideB: [] + SetConstStrideBias: [] + SilentHighPrecisionAccumulate: false + Sparse: 0 + StochasticRounding: false + StridedBatched: true + SupportUserArgs: true + SwizzleTensorA: false + SwizzleTensorB: false + TLUA: false + TLUB: false + TLUMXSA: false + TLUMXSB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: 1 + TransposeB: 0 + UseBeta: true + UseBias: 1 + UseE: false + UseInitialStridesAB: false + UseInitialStridesCD: false + UseScaleAB: Scalar + UseScaleAlphaVec: 1 + UseScaleCD: false +- - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AdaptiveGemm: 0 + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1DivByMT1LowbitGT1: 0 + 
AssertFree1ElementMultiple: 1 + AssertKRingShiftTailWrapOnly: 0 + AssertSummationElementMultiple: 256 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BAddrInterleave: false + BaseName: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT64hIsnhFT2L5uw_T8ryWf3mri6PtKpwJ1E97MC3rNtxpw= + BufferLoad: true + BufferStore: 1 + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 256 + DirectToLds: 1 + DirectToLdsA: true + DirectToLdsB: true + DirectToLdsMXSA: true + DirectToLdsMXSB: true + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprMXSA: false + DirectToVgprMXSB: false + DirectToVgprSparseMetadata: false + DtlPlusLdsBuf: 0 + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: false + ExpertSchedulingMode: 0 + ExtraLatencyForLR: 0 + ExtraMiLatencyLeft: -1 + ForceDisableShadowInit: false + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 32 + GlobalReadVectorWidthB: 32 + GlobalReadVectorWidthMXSA: 2 + GlobalReadVectorWidthMXSB: 2 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + KRingShift: false + Kernel: true + KernelLanguage: Assembly + KernelNameMin: 
Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT64x64x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT2_2_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 256 + LSCB: 256 + LSCMXSA: 8 + LSCMXSB: 8 + LSPA: 32 + LSPB: 32 + LSPMXSA: 64 + LSPMXSB: 64 + LVCA: 8 + LVCB: 8 + LVCMXSA: 4 + LVCMXSB: 4 + LVPA: 1 + LVPB: 1 + LVPMXSA: 32 + LVPMXSB: 32 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsBlockSizePerPadMXSA: 0 + LdsBlockSizePerPadMXSB: 0 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 50176 + LdsInitCVgprs: false + LdsNumBytes: 50176 + LdsNumElementsAlignedA: 8192 + LdsNumElementsAlignedB: 8192 + LdsNumElementsAlignedMXSA: 512 + LdsNumElementsAlignedMXSB: 512 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 32768 + LdsOffsetB: 9216 + LdsOffsetB_Blk: 41984 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMXSA: 8192 + LdsOffsetMXSA_Blk: 40960 + LdsOffsetMXSB: 8704 + LdsOffsetMXSB_Blk: 41472 + LdsOffsetMetadata: 9216 + LdsOffsetMetadata_Blk: 41984 + LdsPadA: 0 + LdsPadB: 0 + LdsPadMXSA: 0 + LdsPadMXSB: 0 + LdsPadMetadata: 0 + LocalReadVectorWidth: -1 + LocalReadVectorWidthA: 32 + LocalReadVectorWidthB: 32 + LocalReadVectorWidthMXSA: 1 + LocalReadVectorWidthMXSB: 1 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LocalWriteUseSgprMXSA: true + LocalWriteUseSgprMXSB: true + LoopIters: 2 + LoopUnroll: 256 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 32 + 
MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMXSA: 1 + MIInputPerThreadMXSB: 1 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [2, 2] + MIWaveTileA: 2 + MIWaveTileB: 2 + MIWaveTileMXSA: 2 + MIWaveTileMXSB: 2 + MIWaveTileMetadata: 0 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileMXSA: 64 + MacroTileMXSB: 64 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + MinGRIncPerMfma: 3 + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonDTLTailLoopMXSA: true + NonDTLTailLoopMXSB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 4 + NonTemporalC: 0 + NonTemporalD: 4 + NonTemporalE: 0 + NonTemporalMXSA: 0 + NonTemporalMXSB: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 16 + NumLdsBlk: 2 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsCoalescedMXSA: 1 + NumLoadsCoalescedMXSB: 1 + NumLoadsMXSA: 1 + NumLoadsMXSB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularMXSA: 1 + NumLoadsPerpendicularMXSB: 1 + NumThreads: 256 + NumTotalPackedLoadsA: -1 + NumTotalPackedLoadsB: -1 + NumTotalPackedLoadsMXSA: -1 + NumTotalPackedLoadsMXSB: -1 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGROverBarrier: 0 + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 0 + SolutionNameMin: 
Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT64x64x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT2_2_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM16_WGMXCC2_WGMXCCGn1 + SourceSwap: false + SpaceFillingAlgo: [] + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 4 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SubGroupMXSA: 8 + SubGroupMXSB: 32 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: 0 + TailloopInNll: false + ThreadTile: [1, 1] + ThreadTile0: 8 + ThreadTile1: 2 + ThreadTileA: 8 + ThreadTileB: 2 + ThreadTileMXSA: 8 + ThreadTileMXSB: 2 + TransposeLDS: 1 + TransposeLDSMetadata: -1 + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMXSA: true + UnrollMajorLDSMXSB: true + Use64bShadowLimit: false + Use64bShadowLimitMX: false + UseCustomMainLoopSchedule: 0 + UseDirect32XEmulation: false + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: false + UseGeneralizedNLCOneA: false + UseGeneralizedNLCOneB: false + UseGeneralizedNLCOneMXSA: false + UseGeneralizedNLCOneMXSB: false + UseGeneralizedNLCOneMetadata: false + UseInstOffsetForGRO: 0 + UseMFMAF32XEmulation: false + UsePLRPack: 0 + UseSgprForGRO: -1 + UseSubtileImpl: true + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 1 + 
VectorWidthMXSA: 1 + VectorWidthMXSB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMXSA: 0 + WaveSeparateGlobalReadMXSB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 16 + WorkGroupMappingXCC: 2 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 256 + _DepthUA: 256 + _DepthUB: 256 + _DepthUMXSA: 8 + _DepthUMXSB: 8 + _DepthUMetadata: 256 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + enableLDSTrMXSA: false + enableLDSTrMXSB: false + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + tailLoopOptMXSA: false + tailLoopOptMXSB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AdaptiveGemm: 0 + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1DivByMT1LowbitGT1: 0 + AssertFree1ElementMultiple: 1 + AssertKRingShiftTailWrapOnly: 0 + AssertSummationElementMultiple: 256 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BAddrInterleave: false + BaseName: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT128AMjoHHGnVSJ1fjvuIJeKPVtejro7lpRL6TixtWnDac= + BufferLoad: true + BufferStore: 1 + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 256 + DirectToLds: 1 + DirectToLdsA: true + DirectToLdsB: true + DirectToLdsMXSA: true + DirectToLdsMXSB: true + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprMXSA: false + DirectToVgprMXSB: false + DirectToVgprSparseMetadata: false + DtlPlusLdsBuf: 0 
+ EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: false + ExpertSchedulingMode: 0 + ExtraLatencyForLR: 0 + ExtraMiLatencyLeft: -1 + ForceDisableShadowInit: false + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 32 + GlobalReadVectorWidthB: 32 + GlobalReadVectorWidthMXSA: 4 + GlobalReadVectorWidthMXSB: 4 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + KRingShift: false + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT128x128x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT4_4_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 256 + LSCB: 256 + LSCMXSA: 8 + LSCMXSB: 8 + LSPA: 32 + LSPB: 32 + LSPMXSA: 128 + LSPMXSB: 128 + LVCA: 8 + LVCB: 8 + LVCMXSA: 2 + LVCMXSB: 2 + LVPA: 1 + LVPB: 1 + LVPMXSA: 32 + LVPMXSB: 32 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsBlockSizePerPadMXSA: 0 + LdsBlockSizePerPadMXSB: 0 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 100352 + LdsInitCVgprs: false + LdsNumBytes: 100352 + LdsNumElementsAlignedA: 16384 + 
LdsNumElementsAlignedB: 16384 + LdsNumElementsAlignedMXSA: 1024 + LdsNumElementsAlignedMXSB: 1024 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 18432 + LdsOffsetB_Blk: 83968 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMXSA: 16384 + LdsOffsetMXSA_Blk: 81920 + LdsOffsetMXSB: 17408 + LdsOffsetMXSB_Blk: 82944 + LdsOffsetMetadata: 18432 + LdsOffsetMetadata_Blk: 83968 + LdsPadA: 0 + LdsPadB: 0 + LdsPadMXSA: 0 + LdsPadMXSB: 0 + LdsPadMetadata: 0 + LocalReadVectorWidth: -1 + LocalReadVectorWidthA: 32 + LocalReadVectorWidthB: 32 + LocalReadVectorWidthMXSA: 1 + LocalReadVectorWidthMXSB: 1 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LocalWriteUseSgprMXSA: true + LocalWriteUseSgprMXSB: true + LoopIters: 2 + LoopUnroll: 256 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMXSA: 1 + MIInputPerThreadMXSB: 1 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MIWaveTileMXSA: 4 + MIWaveTileMXSB: 4 + MIWaveTileMetadata: 0 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileMXSA: 128 + MacroTileMXSB: 128 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + MinGRIncPerMfma: 3 + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonDTLTailLoopMXSA: true + NonDTLTailLoopMXSB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 4 + NonTemporalC: 0 + NonTemporalD: 4 + NonTemporalE: 0 + NonTemporalMXSA: 0 + 
NonTemporalMXSB: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 64 + NumLdsBlk: 2 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsCoalescedMXSA: 1 + NumLoadsCoalescedMXSB: 1 + NumLoadsMXSA: 1 + NumLoadsMXSB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumLoadsPerpendicularMXSA: 1 + NumLoadsPerpendicularMXSB: 1 + NumThreads: 256 + NumTotalPackedLoadsA: -1 + NumTotalPackedLoadsB: -1 + NumTotalPackedLoadsMXSA: -1 + NumTotalPackedLoadsMXSB: -1 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGROverBarrier: 0 + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 1 + SolutionNameMin: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT128x128x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT4_4_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM16_WGMXCC2_WGMXCCGn1 + SourceSwap: false + SpaceFillingAlgo: [] + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 4 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SubGroupMXSA: 8 + SubGroupMXSB: 32 + 
SuppressNoLoadLoop: false + SwapGlobalReadOrder: 0 + TailloopInNll: false + ThreadTile: [1, 1] + ThreadTile0: 16 + ThreadTile1: 4 + ThreadTileA: 16 + ThreadTileB: 4 + ThreadTileMXSA: 16 + ThreadTileMXSB: 4 + TransposeLDS: 1 + TransposeLDSMetadata: -1 + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMXSA: true + UnrollMajorLDSMXSB: true + Use64bShadowLimit: false + Use64bShadowLimitMX: false + UseCustomMainLoopSchedule: 0 + UseDirect32XEmulation: false + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: false + UseGeneralizedNLCOneA: false + UseGeneralizedNLCOneB: false + UseGeneralizedNLCOneMXSA: false + UseGeneralizedNLCOneMXSB: false + UseGeneralizedNLCOneMetadata: false + UseInstOffsetForGRO: 0 + UseMFMAF32XEmulation: false + UsePLRPack: 0 + UseSgprForGRO: -1 + UseSubtileImpl: true + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 1 + VectorWidthMXSA: 1 + VectorWidthMXSB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMXSA: 0 + WaveSeparateGlobalReadMXSB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 16 + WorkGroupMappingXCC: 2 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 256 + _DepthUA: 256 + _DepthUB: 256 + _DepthUMXSA: 8 + _DepthUMXSB: 8 + _DepthUMetadata: 256 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + enableLDSTrMXSA: false + enableLDSTrMXSB: false + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + tailLoopOptMXSA: false + tailLoopOptMXSB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: false 
+ ActivationFused: true + AdaptiveGemm: 0 + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1DivByMT1LowbitGT1: 0 + AssertFree1ElementMultiple: 1 + AssertKRingShiftTailWrapOnly: 0 + AssertSummationElementMultiple: 256 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BAddrInterleave: false + BaseName: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT19tVFE7Mochg2Q71HzQS4tIGctt0Td3p2SmmxHjjWSYTc= + BufferLoad: true + BufferStore: 1 + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 256 + DirectToLds: 1 + DirectToLdsA: true + DirectToLdsB: true + DirectToLdsMXSA: true + DirectToLdsMXSB: true + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprMXSA: false + DirectToVgprMXSB: false + DirectToVgprSparseMetadata: false + DtlPlusLdsBuf: 0 + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: false + ExpertSchedulingMode: 0 + ExtraLatencyForLR: 0 + ExtraMiLatencyLeft: -1 + ForceDisableShadowInit: false + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 32 + GlobalReadVectorWidthB: 32 + GlobalReadVectorWidthMXSA: 6 + GlobalReadVectorWidthMXSB: 6 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + KRingShift: false + Kernel: true + KernelLanguage: Assembly + KernelNameMin: 
Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT192x192x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT6_6_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 256 + LSCB: 256 + LSCMXSA: 8 + LSCMXSB: 8 + LSPA: 32 + LSPB: 32 + LSPMXSA: 192 + LSPMXSB: 192 + LVCA: 8 + LVCB: 8 + LVCMXSA: 2 + LVCMXSB: 2 + LVPA: 1 + LVPB: 1 + LVPMXSA: 32 + LVPMXSB: 32 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsBlockSizePerPadMXSA: 0 + LdsBlockSizePerPadMXSB: 0 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 117760 + LdsInitCVgprs: false + LdsNumBytes: 117760 + LdsNumElementsAlignedA: 24576 + LdsNumElementsAlignedB: 24576 + LdsNumElementsAlignedMXSA: 1536 + LdsNumElementsAlignedMXSB: 1536 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 27648 + LdsOffsetB_Blk: 93184 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMXSA: 24576 + LdsOffsetMXSA_Blk: 90112 + LdsOffsetMXSB: 26112 + LdsOffsetMXSB_Blk: 91648 + LdsOffsetMetadata: 27648 + LdsOffsetMetadata_Blk: 93184 + LdsPadA: 0 + LdsPadB: 0 + LdsPadMXSA: 0 + LdsPadMXSB: 0 + LdsPadMetadata: 0 + LocalReadVectorWidth: -1 + LocalReadVectorWidthA: 32 + LocalReadVectorWidthB: 32 + LocalReadVectorWidthMXSA: 1 + LocalReadVectorWidthMXSB: 1 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LocalWriteUseSgprMXSA: true + LocalWriteUseSgprMXSB: true + LoopIters: 2 + LoopUnroll: 256 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + 
MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMXSA: 1 + MIInputPerThreadMXSB: 1 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [6, 6] + MIWaveTileA: 6 + MIWaveTileB: 6 + MIWaveTileMXSA: 6 + MIWaveTileMXSB: 6 + MIWaveTileMetadata: 0 + MacroTile0: 192 + MacroTile1: 192 + MacroTileA: 192 + MacroTileB: 192 + MacroTileMXSA: 192 + MacroTileMXSB: 192 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + MinGRIncPerMfma: 3 + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonDTLTailLoopMXSA: true + NonDTLTailLoopMXSB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 4 + NonTemporalC: 0 + NonTemporalD: 4 + NonTemporalE: 0 + NonTemporalMXSA: 0 + NonTemporalMXSB: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 144 + NumGlobalWriteVectorsPerThread: 144 + NumLdsBlk: 2 + NumLoadsA: 6 + NumLoadsB: 6 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsCoalescedMXSA: 1 + NumLoadsCoalescedMXSB: 1 + NumLoadsMXSA: 1 + NumLoadsMXSB: 1 + NumLoadsPerpendicularA: 6 + NumLoadsPerpendicularB: 6 + NumLoadsPerpendicularMXSA: 1 + NumLoadsPerpendicularMXSB: 1 + NumThreads: 256 + NumTotalPackedLoadsA: -1 + NumTotalPackedLoadsB: -1 + NumTotalPackedLoadsMXSA: -1 + NumTotalPackedLoadsMXSB: -1 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGROverBarrier: 0 + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 2 + 
SolutionNameMin: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT192x192x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT6_6_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM16_WGMXCC2_WGMXCCGn1 + SourceSwap: false + SpaceFillingAlgo: [] + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 4 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SubGroupMXSA: 8 + SubGroupMXSB: 32 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: 0 + TailloopInNll: false + ThreadTile: [1, 1] + ThreadTile0: 24 + ThreadTile1: 6 + ThreadTileA: 24 + ThreadTileB: 6 + ThreadTileMXSA: 24 + ThreadTileMXSB: 6 + TransposeLDS: 1 + TransposeLDSMetadata: -1 + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMXSA: true + UnrollMajorLDSMXSB: true + Use64bShadowLimit: false + Use64bShadowLimitMX: false + UseCustomMainLoopSchedule: 0 + UseDirect32XEmulation: false + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: false + UseGeneralizedNLCOneA: false + UseGeneralizedNLCOneB: false + UseGeneralizedNLCOneMXSA: false + UseGeneralizedNLCOneMXSB: false + UseGeneralizedNLCOneMetadata: false + UseInstOffsetForGRO: 0 + UseMFMAF32XEmulation: false + UsePLRPack: 0 + UseSgprForGRO: -1 + UseSubtileImpl: true + Valid: true + VectorStore: -1 + VectorWidthA: 1 + 
VectorWidthB: 1 + VectorWidthMXSA: 1 + VectorWidthMXSB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMXSA: 0 + WaveSeparateGlobalReadMXSB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 16 + WorkGroupMappingXCC: 2 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 256 + _DepthUA: 256 + _DepthUB: 256 + _DepthUMXSA: 8 + _DepthUMXSB: 8 + _DepthUMetadata: 256 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + enableLDSTrMXSA: false + enableLDSTrMXSB: false + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + tailLoopOptMXSA: false + tailLoopOptMXSB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AdaptiveGemm: 0 + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1DivByMT1LowbitGT1: 0 + AssertFree1ElementMultiple: 1 + AssertKRingShiftTailWrapOnly: 0 + AssertSummationElementMultiple: 256 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BAddrInterleave: false + BaseName: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT25YgPR4XPaMfFjkFVdEJNxrCtJ2m9IQRE3ggDN5hyNKfg= + BufferLoad: true + BufferStore: 1 + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 256 + DirectToLds: 1 + DirectToLdsA: true + DirectToLdsB: true + DirectToLdsMXSA: true + DirectToLdsMXSB: true + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprMXSA: false + DirectToVgprMXSB: false + DirectToVgprSparseMetadata: false + 
DtlPlusLdsBuf: 0 + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: false + ExpertSchedulingMode: 0 + ExtraLatencyForLR: 0 + ExtraMiLatencyLeft: -1 + ForceDisableShadowInit: false + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 32 + GlobalReadVectorWidthB: 32 + GlobalReadVectorWidthMXSA: 4 + GlobalReadVectorWidthMXSB: 4 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + KRingShift: false + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT256x256x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT8_8_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 256 + LSCB: 256 + LSCMXSA: 8 + LSCMXSB: 8 + LSPA: 32 + LSPB: 32 + LSPMXSA: 128 + LSPMXSB: 128 + LVCA: 8 + LVCB: 8 + LVCMXSA: 2 + LVCMXSB: 2 + LVPA: 1 + LVPB: 1 + LVPMXSA: 32 + LVPMXSB: 32 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsBlockSizePerPadMXSA: 0 + LdsBlockSizePerPadMXSB: 0 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 139264 + LdsInitCVgprs: false + LdsNumBytes: 139264 + 
LdsNumElementsAlignedA: 32768 + LdsNumElementsAlignedB: 32768 + LdsNumElementsAlignedMXSA: 2048 + LdsNumElementsAlignedMXSB: 2048 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 69632 + LdsOffsetB: 36864 + LdsOffsetB_Blk: 106496 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMXSA: 32768 + LdsOffsetMXSA_Blk: 102400 + LdsOffsetMXSB: 34816 + LdsOffsetMXSB_Blk: 104448 + LdsOffsetMetadata: 36864 + LdsOffsetMetadata_Blk: 106496 + LdsPadA: 0 + LdsPadB: 0 + LdsPadMXSA: 0 + LdsPadMXSB: 0 + LdsPadMetadata: 0 + LocalReadVectorWidth: -1 + LocalReadVectorWidthA: 32 + LocalReadVectorWidthB: 32 + LocalReadVectorWidthMXSA: 1 + LocalReadVectorWidthMXSB: 1 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LocalWriteUseSgprMXSA: true + LocalWriteUseSgprMXSB: true + LoopIters: 2 + LoopUnroll: 256 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMXSA: 1 + MIInputPerThreadMXSB: 1 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [8, 8] + MIWaveTileA: 8 + MIWaveTileB: 8 + MIWaveTileMXSA: 8 + MIWaveTileMXSB: 8 + MIWaveTileMetadata: 0 + MacroTile0: 256 + MacroTile1: 256 + MacroTileA: 256 + MacroTileB: 256 + MacroTileMXSA: 256 + MacroTileMXSB: 256 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + MinGRIncPerMfma: 3 + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonDTLTailLoopMXSA: true + NonDTLTailLoopMXSB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 4 + NonTemporalC: 0 + NonTemporalD: 4 + 
NonTemporalE: 0 + NonTemporalMXSA: 0 + NonTemporalMXSB: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 256 + NumGlobalWriteVectorsPerThread: 256 + NumLdsBlk: 2 + NumLoadsA: 8 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsCoalescedMXSA: 1 + NumLoadsCoalescedMXSB: 1 + NumLoadsMXSA: 2 + NumLoadsMXSB: 2 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 8 + NumLoadsPerpendicularMXSA: 2 + NumLoadsPerpendicularMXSB: 2 + NumThreads: 256 + NumTotalPackedLoadsA: -1 + NumTotalPackedLoadsB: -1 + NumTotalPackedLoadsMXSA: -1 + NumTotalPackedLoadsMXSB: -1 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGROverBarrier: 0 + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 3 + SolutionNameMin: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT256x256x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT8_8_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM16_WGMXCC2_WGMXCCGn1 + SourceSwap: false + SpaceFillingAlgo: [] + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: true + StoreSyncOpt: 0 + StoreVectorWidth: 4 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 
+ SubGroupMXSA: 8 + SubGroupMXSB: 32 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: 0 + TailloopInNll: false + ThreadTile: [1, 1] + ThreadTile0: 32 + ThreadTile1: 8 + ThreadTileA: 32 + ThreadTileB: 8 + ThreadTileMXSA: 32 + ThreadTileMXSB: 8 + TransposeLDS: 1 + TransposeLDSMetadata: -1 + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMXSA: true + UnrollMajorLDSMXSB: true + Use64bShadowLimit: false + Use64bShadowLimitMX: false + UseCustomMainLoopSchedule: 0 + UseDirect32XEmulation: false + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: false + UseGeneralizedNLCOneA: false + UseGeneralizedNLCOneB: false + UseGeneralizedNLCOneMXSA: false + UseGeneralizedNLCOneMXSB: false + UseGeneralizedNLCOneMetadata: false + UseInstOffsetForGRO: 0 + UseMFMAF32XEmulation: false + UsePLRPack: 0 + UseSgprForGRO: -1 + UseSubtileImpl: true + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 1 + VectorWidthMXSA: 1 + VectorWidthMXSB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMXSA: 0 + WaveSeparateGlobalReadMXSB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 16 + WorkGroupMappingXCC: 2 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 256 + _DepthUA: 256 + _DepthUB: 256 + _DepthUMXSA: 8 + _DepthUMXSB: 8 + _DepthUMetadata: 256 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + enableLDSTrMXSA: false + enableLDSTrMXSB: false + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + tailLoopOptMXSA: false + tailLoopOptMXSB: false + - 1LDSBuffer: 0 + 
ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AdaptiveGemm: 0 + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1DivByMT1LowbitGT1: 0 + AssertFree1ElementMultiple: 1 + AssertKRingShiftTailWrapOnly: 0 + AssertSummationElementMultiple: 256 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BAddrInterleave: false + BaseName: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT64yR3-UwlktRfU59ACgTfmpcg_Yk6f2Lb1knA4QU-BsnY= + BufferLoad: true + BufferStore: 1 + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 256 + DirectToLds: 1 + DirectToLdsA: true + DirectToLdsB: true + DirectToLdsMXSA: true + DirectToLdsMXSB: true + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprMXSA: false + DirectToVgprMXSB: false + DirectToVgprSparseMetadata: false + DtlPlusLdsBuf: 0 + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: false + ExpertSchedulingMode: 0 + ExtraLatencyForLR: 0 + ExtraMiLatencyLeft: -1 + ForceDisableShadowInit: false + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 32 + GlobalReadVectorWidthB: 32 + GlobalReadVectorWidthMXSA: 2 + GlobalReadVectorWidthMXSB: 4 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + KRingShift: false + Kernel: true + KernelLanguage: Assembly + KernelNameMin: 
Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT64x128x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT2_4_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 256 + LSCB: 256 + LSCMXSA: 8 + LSCMXSB: 8 + LSPA: 32 + LSPB: 32 + LSPMXSA: 64 + LSPMXSB: 128 + LVCA: 8 + LVCB: 8 + LVCMXSA: 4 + LVCMXSB: 2 + LVPA: 1 + LVPB: 1 + LVPMXSA: 32 + LVPMXSB: 32 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsBlockSizePerPadMXSA: 0 + LdsBlockSizePerPadMXSB: 0 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 58880 + LdsInitCVgprs: false + LdsNumBytes: 58880 + LdsNumElementsAlignedA: 8192 + LdsNumElementsAlignedB: 16384 + LdsNumElementsAlignedMXSA: 512 + LdsNumElementsAlignedMXSB: 1024 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 32768 + LdsOffsetB: 9728 + LdsOffsetB_Blk: 42496 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMXSA: 8192 + LdsOffsetMXSA_Blk: 40960 + LdsOffsetMXSB: 8704 + LdsOffsetMXSB_Blk: 41472 + LdsOffsetMetadata: 9728 + LdsOffsetMetadata_Blk: 42496 + LdsPadA: 0 + LdsPadB: 0 + LdsPadMXSA: 0 + LdsPadMXSB: 0 + LdsPadMetadata: 0 + LocalReadVectorWidth: -1 + LocalReadVectorWidthA: 32 + LocalReadVectorWidthB: 32 + LocalReadVectorWidthMXSA: 1 + LocalReadVectorWidthMXSB: 1 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LocalWriteUseSgprMXSA: true + LocalWriteUseSgprMXSB: true + LoopIters: 2 + LoopUnroll: 256 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 
32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMXSA: 1 + MIInputPerThreadMXSB: 1 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [2, 4] + MIWaveTileA: 2 + MIWaveTileB: 4 + MIWaveTileMXSA: 2 + MIWaveTileMXSB: 4 + MIWaveTileMetadata: 0 + MacroTile0: 64 + MacroTile1: 128 + MacroTileA: 64 + MacroTileB: 128 + MacroTileMXSA: 64 + MacroTileMXSB: 128 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + MinGRIncPerMfma: 3 + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonDTLTailLoopMXSA: true + NonDTLTailLoopMXSB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 4 + NonTemporalC: 0 + NonTemporalD: 4 + NonTemporalE: 0 + NonTemporalMXSA: 0 + NonTemporalMXSB: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 32 + NumLdsBlk: 2 + NumLoadsA: 2 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsCoalescedMXSA: 1 + NumLoadsCoalescedMXSB: 1 + NumLoadsMXSA: 1 + NumLoadsMXSB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 4 + NumLoadsPerpendicularMXSA: 1 + NumLoadsPerpendicularMXSB: 1 + NumThreads: 256 + NumTotalPackedLoadsA: -1 + NumTotalPackedLoadsB: -1 + NumTotalPackedLoadsMXSA: -1 + NumTotalPackedLoadsMXSB: -1 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGROverBarrier: 0 + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 4 + SolutionNameMin: 
Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT64x128x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT2_4_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM16_WGMXCC2_WGMXCCGn1 + SourceSwap: false + SpaceFillingAlgo: [] + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 4 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SubGroupMXSA: 8 + SubGroupMXSB: 32 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: 0 + TailloopInNll: false + ThreadTile: [1, 1] + ThreadTile0: 8 + ThreadTile1: 4 + ThreadTileA: 8 + ThreadTileB: 4 + ThreadTileMXSA: 8 + ThreadTileMXSB: 4 + TransposeLDS: 1 + TransposeLDSMetadata: -1 + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMXSA: true + UnrollMajorLDSMXSB: true + Use64bShadowLimit: false + Use64bShadowLimitMX: false + UseCustomMainLoopSchedule: 0 + UseDirect32XEmulation: false + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: false + UseGeneralizedNLCOneA: false + UseGeneralizedNLCOneB: false + UseGeneralizedNLCOneMXSA: false + UseGeneralizedNLCOneMXSB: false + UseGeneralizedNLCOneMetadata: false + UseInstOffsetForGRO: 0 + UseMFMAF32XEmulation: false + UsePLRPack: 0 + UseSgprForGRO: -1 + UseSubtileImpl: true + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 1 + 
VectorWidthMXSA: 1 + VectorWidthMXSB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMXSA: 0 + WaveSeparateGlobalReadMXSB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 16 + WorkGroupMappingXCC: 2 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 256 + _DepthUA: 256 + _DepthUB: 256 + _DepthUMXSA: 8 + _DepthUMXSB: 8 + _DepthUMetadata: 256 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + enableLDSTrMXSA: false + enableLDSTrMXSB: false + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + tailLoopOptMXSA: false + tailLoopOptMXSB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AdaptiveGemm: 0 + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1DivByMT1LowbitGT1: 0 + AssertFree1ElementMultiple: 1 + AssertKRingShiftTailWrapOnly: 0 + AssertSummationElementMultiple: 256 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BAddrInterleave: false + BaseName: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT12ZclQEHdRRO1sBJL3TwidM33yIt-WYnDtDgY9P9JYRX4= + BufferLoad: true + BufferStore: 1 + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 256 + DirectToLds: 1 + DirectToLdsA: true + DirectToLdsB: true + DirectToLdsMXSA: true + DirectToLdsMXSB: true + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprMXSA: false + DirectToVgprMXSB: false + DirectToVgprSparseMetadata: false + DtlPlusLdsBuf: 0 
+ EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: false + ExpertSchedulingMode: 0 + ExtraLatencyForLR: 0 + ExtraMiLatencyLeft: -1 + ForceDisableShadowInit: false + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 32 + GlobalReadVectorWidthB: 32 + GlobalReadVectorWidthMXSA: 4 + GlobalReadVectorWidthMXSB: 2 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + KRingShift: false + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT128x64x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT4_2_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 256 + LSCB: 256 + LSCMXSA: 8 + LSCMXSB: 8 + LSPA: 32 + LSPB: 32 + LSPMXSA: 128 + LSPMXSB: 64 + LVCA: 8 + LVCB: 8 + LVCMXSA: 2 + LVCMXSB: 4 + LVPA: 1 + LVPB: 1 + LVPMXSA: 32 + LVPMXSB: 32 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsBlockSizePerPadMXSA: 0 + LdsBlockSizePerPadMXSB: 0 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 58880 + LdsInitCVgprs: false + LdsNumBytes: 58880 + LdsNumElementsAlignedA: 16384 + 
LdsNumElementsAlignedB: 8192 + LdsNumElementsAlignedMXSA: 1024 + LdsNumElementsAlignedMXSB: 512 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 32768 + LdsOffsetB: 17920 + LdsOffsetB_Blk: 50688 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMXSA: 16384 + LdsOffsetMXSA_Blk: 49152 + LdsOffsetMXSB: 17408 + LdsOffsetMXSB_Blk: 50176 + LdsOffsetMetadata: 17920 + LdsOffsetMetadata_Blk: 50688 + LdsPadA: 0 + LdsPadB: 0 + LdsPadMXSA: 0 + LdsPadMXSB: 0 + LdsPadMetadata: 0 + LocalReadVectorWidth: -1 + LocalReadVectorWidthA: 32 + LocalReadVectorWidthB: 32 + LocalReadVectorWidthMXSA: 1 + LocalReadVectorWidthMXSB: 1 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LocalWriteUseSgprMXSA: true + LocalWriteUseSgprMXSB: true + LoopIters: 2 + LoopUnroll: 256 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMXSA: 1 + MIInputPerThreadMXSB: 1 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 2] + MIWaveTileA: 4 + MIWaveTileB: 2 + MIWaveTileMXSA: 4 + MIWaveTileMXSB: 2 + MIWaveTileMetadata: 0 + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileMXSA: 128 + MacroTileMXSB: 64 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + MinGRIncPerMfma: 3 + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonDTLTailLoopMXSA: true + NonDTLTailLoopMXSB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 4 + NonTemporalC: 0 + NonTemporalD: 4 + NonTemporalE: 0 + NonTemporalMXSA: 0 + 
NonTemporalMXSB: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 32 + NumLdsBlk: 2 + NumLoadsA: 4 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsCoalescedMXSA: 1 + NumLoadsCoalescedMXSB: 1 + NumLoadsMXSA: 1 + NumLoadsMXSB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularMXSA: 1 + NumLoadsPerpendicularMXSB: 1 + NumThreads: 256 + NumTotalPackedLoadsA: -1 + NumTotalPackedLoadsB: -1 + NumTotalPackedLoadsMXSA: -1 + NumTotalPackedLoadsMXSB: -1 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGROverBarrier: 0 + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 5 + SolutionNameMin: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT128x64x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT4_2_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM16_WGMXCC2_WGMXCCGn1 + SourceSwap: false + SpaceFillingAlgo: [] + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 4 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SubGroupMXSA: 8 + SubGroupMXSB: 32 + 
SuppressNoLoadLoop: false + SwapGlobalReadOrder: 0 + TailloopInNll: false + ThreadTile: [1, 1] + ThreadTile0: 16 + ThreadTile1: 2 + ThreadTileA: 16 + ThreadTileB: 2 + ThreadTileMXSA: 16 + ThreadTileMXSB: 2 + TransposeLDS: 1 + TransposeLDSMetadata: -1 + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMXSA: true + UnrollMajorLDSMXSB: true + Use64bShadowLimit: false + Use64bShadowLimitMX: false + UseCustomMainLoopSchedule: 0 + UseDirect32XEmulation: false + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: false + UseGeneralizedNLCOneA: false + UseGeneralizedNLCOneB: false + UseGeneralizedNLCOneMXSA: false + UseGeneralizedNLCOneMXSB: false + UseGeneralizedNLCOneMetadata: false + UseInstOffsetForGRO: 0 + UseMFMAF32XEmulation: false + UsePLRPack: 0 + UseSgprForGRO: -1 + UseSubtileImpl: true + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 1 + VectorWidthMXSA: 1 + VectorWidthMXSB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMXSA: 0 + WaveSeparateGlobalReadMXSB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 16 + WorkGroupMappingXCC: 2 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 256 + _DepthUA: 256 + _DepthUB: 256 + _DepthUMXSA: 8 + _DepthUMXSB: 8 + _DepthUMetadata: 256 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + enableLDSTrMXSA: false + enableLDSTrMXSB: false + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + tailLoopOptMXSA: false + tailLoopOptMXSB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: false 
+ ActivationFused: true + AdaptiveGemm: 0 + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1DivByMT1LowbitGT1: 0 + AssertFree1ElementMultiple: 1 + AssertKRingShiftTailWrapOnly: 0 + AssertSummationElementMultiple: 256 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BAddrInterleave: false + BaseName: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT64hJRTC6QzyytWkHaKB_tyuSMxvHfrHLHmDLGhNVVcILE= + BufferLoad: true + BufferStore: 1 + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 256 + DirectToLds: 1 + DirectToLdsA: true + DirectToLdsB: true + DirectToLdsMXSA: true + DirectToLdsMXSB: true + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprMXSA: false + DirectToVgprMXSB: false + DirectToVgprSparseMetadata: false + DtlPlusLdsBuf: 0 + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: false + ExpertSchedulingMode: 0 + ExtraLatencyForLR: 0 + ExtraMiLatencyLeft: -1 + ForceDisableShadowInit: false + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 32 + GlobalReadVectorWidthB: 32 + GlobalReadVectorWidthMXSA: 2 + GlobalReadVectorWidthMXSB: 6 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + KRingShift: false + Kernel: true + KernelLanguage: Assembly + KernelNameMin: 
Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT64x192x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT2_6_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 256 + LSCB: 256 + LSCMXSA: 8 + LSCMXSB: 8 + LSPA: 32 + LSPB: 32 + LSPMXSA: 64 + LSPMXSB: 192 + LVCA: 8 + LVCB: 8 + LVCMXSA: 4 + LVCMXSB: 2 + LVPA: 1 + LVPB: 1 + LVPMXSA: 32 + LVPMXSB: 32 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsBlockSizePerPadMXSA: 0 + LdsBlockSizePerPadMXSB: 0 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 100352 + LdsInitCVgprs: false + LdsNumBytes: 100352 + LdsNumElementsAlignedA: 8192 + LdsNumElementsAlignedB: 24576 + LdsNumElementsAlignedMXSA: 512 + LdsNumElementsAlignedMXSB: 1536 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 10240 + LdsOffsetB_Blk: 75776 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMXSA: 8192 + LdsOffsetMXSA_Blk: 73728 + LdsOffsetMXSB: 8704 + LdsOffsetMXSB_Blk: 74240 + LdsOffsetMetadata: 10240 + LdsOffsetMetadata_Blk: 75776 + LdsPadA: 0 + LdsPadB: 0 + LdsPadMXSA: 0 + LdsPadMXSB: 0 + LdsPadMetadata: 0 + LocalReadVectorWidth: -1 + LocalReadVectorWidthA: 32 + LocalReadVectorWidthB: 32 + LocalReadVectorWidthMXSA: 1 + LocalReadVectorWidthMXSB: 1 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LocalWriteUseSgprMXSA: true + LocalWriteUseSgprMXSB: true + LoopIters: 2 + LoopUnroll: 256 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + 
MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMXSA: 1 + MIInputPerThreadMXSB: 1 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [2, 6] + MIWaveTileA: 2 + MIWaveTileB: 6 + MIWaveTileMXSA: 2 + MIWaveTileMXSB: 6 + MIWaveTileMetadata: 0 + MacroTile0: 64 + MacroTile1: 192 + MacroTileA: 64 + MacroTileB: 192 + MacroTileMXSA: 64 + MacroTileMXSB: 192 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + MinGRIncPerMfma: 3 + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonDTLTailLoopMXSA: true + NonDTLTailLoopMXSB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 4 + NonTemporalC: 0 + NonTemporalD: 4 + NonTemporalE: 0 + NonTemporalMXSA: 0 + NonTemporalMXSB: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 48 + NumGlobalWriteVectorsPerThread: 48 + NumLdsBlk: 2 + NumLoadsA: 2 + NumLoadsB: 6 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsCoalescedMXSA: 1 + NumLoadsCoalescedMXSB: 1 + NumLoadsMXSA: 1 + NumLoadsMXSB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 6 + NumLoadsPerpendicularMXSA: 1 + NumLoadsPerpendicularMXSB: 1 + NumThreads: 256 + NumTotalPackedLoadsA: -1 + NumTotalPackedLoadsB: -1 + NumTotalPackedLoadsMXSA: -1 + NumTotalPackedLoadsMXSB: -1 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGROverBarrier: 0 + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 6 + SolutionNameMin: 
Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT64x192x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT2_6_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM16_WGMXCC2_WGMXCCGn1 + SourceSwap: false + SpaceFillingAlgo: [] + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 4 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SubGroupMXSA: 8 + SubGroupMXSB: 32 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: 0 + TailloopInNll: false + ThreadTile: [1, 1] + ThreadTile0: 8 + ThreadTile1: 6 + ThreadTileA: 8 + ThreadTileB: 6 + ThreadTileMXSA: 8 + ThreadTileMXSB: 6 + TransposeLDS: 1 + TransposeLDSMetadata: -1 + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMXSA: true + UnrollMajorLDSMXSB: true + Use64bShadowLimit: false + Use64bShadowLimitMX: false + UseCustomMainLoopSchedule: 0 + UseDirect32XEmulation: false + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: false + UseGeneralizedNLCOneA: false + UseGeneralizedNLCOneB: false + UseGeneralizedNLCOneMXSA: false + UseGeneralizedNLCOneMXSB: false + UseGeneralizedNLCOneMetadata: false + UseInstOffsetForGRO: 0 + UseMFMAF32XEmulation: false + UsePLRPack: 0 + UseSgprForGRO: -1 + UseSubtileImpl: true + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 1 + 
VectorWidthMXSA: 1 + VectorWidthMXSB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMXSA: 0 + WaveSeparateGlobalReadMXSB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 16 + WorkGroupMappingXCC: 2 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 256 + _DepthUA: 256 + _DepthUB: 256 + _DepthUMXSA: 8 + _DepthUMXSB: 8 + _DepthUMetadata: 256 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + enableLDSTrMXSA: false + enableLDSTrMXSB: false + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + tailLoopOptMXSA: false + tailLoopOptMXSB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AdaptiveGemm: 0 + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1DivByMT1LowbitGT1: 0 + AssertFree1ElementMultiple: 1 + AssertKRingShiftTailWrapOnly: 0 + AssertSummationElementMultiple: 256 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BAddrInterleave: false + BaseName: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT19UCUD2vak2YLHvH_Ztg5zXPDJUlX3NzBJWRuGhvKVzu8= + BufferLoad: true + BufferStore: 1 + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 256 + DirectToLds: 1 + DirectToLdsA: true + DirectToLdsB: true + DirectToLdsMXSA: true + DirectToLdsMXSB: true + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprMXSA: false + DirectToVgprMXSB: false + DirectToVgprSparseMetadata: false + DtlPlusLdsBuf: 0 
+ EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: false + ExpertSchedulingMode: 0 + ExtraLatencyForLR: 0 + ExtraMiLatencyLeft: -1 + ForceDisableShadowInit: false + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 32 + GlobalReadVectorWidthB: 32 + GlobalReadVectorWidthMXSA: 6 + GlobalReadVectorWidthMXSB: 2 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + KRingShift: false + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT192x64x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT6_2_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 256 + LSCB: 256 + LSCMXSA: 8 + LSCMXSB: 8 + LSPA: 32 + LSPB: 32 + LSPMXSA: 192 + LSPMXSB: 64 + LVCA: 8 + LVCB: 8 + LVCMXSA: 2 + LVCMXSB: 4 + LVPA: 1 + LVPB: 1 + LVPMXSA: 32 + LVPMXSB: 32 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsBlockSizePerPadMXSA: 0 + LdsBlockSizePerPadMXSB: 0 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 100352 + LdsInitCVgprs: false + LdsNumBytes: 100352 + LdsNumElementsAlignedA: 24576 + 
LdsNumElementsAlignedB: 8192 + LdsNumElementsAlignedMXSA: 1536 + LdsNumElementsAlignedMXSB: 512 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 26624 + LdsOffsetB_Blk: 92160 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMXSA: 24576 + LdsOffsetMXSA_Blk: 90112 + LdsOffsetMXSB: 26112 + LdsOffsetMXSB_Blk: 91648 + LdsOffsetMetadata: 26624 + LdsOffsetMetadata_Blk: 92160 + LdsPadA: 0 + LdsPadB: 0 + LdsPadMXSA: 0 + LdsPadMXSB: 0 + LdsPadMetadata: 0 + LocalReadVectorWidth: -1 + LocalReadVectorWidthA: 32 + LocalReadVectorWidthB: 32 + LocalReadVectorWidthMXSA: 1 + LocalReadVectorWidthMXSB: 1 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LocalWriteUseSgprMXSA: true + LocalWriteUseSgprMXSB: true + LoopIters: 2 + LoopUnroll: 256 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMXSA: 1 + MIInputPerThreadMXSB: 1 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [6, 2] + MIWaveTileA: 6 + MIWaveTileB: 2 + MIWaveTileMXSA: 6 + MIWaveTileMXSB: 2 + MIWaveTileMetadata: 0 + MacroTile0: 192 + MacroTile1: 64 + MacroTileA: 192 + MacroTileB: 64 + MacroTileMXSA: 192 + MacroTileMXSB: 64 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + MinGRIncPerMfma: 3 + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonDTLTailLoopMXSA: true + NonDTLTailLoopMXSB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 4 + NonTemporalC: 0 + NonTemporalD: 4 + NonTemporalE: 0 + NonTemporalMXSA: 0 + 
NonTemporalMXSB: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 48 + NumGlobalWriteVectorsPerThread: 48 + NumLdsBlk: 2 + NumLoadsA: 6 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsCoalescedMXSA: 1 + NumLoadsCoalescedMXSB: 1 + NumLoadsMXSA: 1 + NumLoadsMXSB: 1 + NumLoadsPerpendicularA: 6 + NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularMXSA: 1 + NumLoadsPerpendicularMXSB: 1 + NumThreads: 256 + NumTotalPackedLoadsA: -1 + NumTotalPackedLoadsB: -1 + NumTotalPackedLoadsMXSA: -1 + NumTotalPackedLoadsMXSB: -1 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGROverBarrier: 0 + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 7 + SolutionNameMin: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT192x64x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT6_2_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM16_WGMXCC2_WGMXCCGn1 + SourceSwap: false + SpaceFillingAlgo: [] + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 4 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SubGroupMXSA: 8 + SubGroupMXSB: 32 + 
SuppressNoLoadLoop: false + SwapGlobalReadOrder: 0 + TailloopInNll: false + ThreadTile: [1, 1] + ThreadTile0: 24 + ThreadTile1: 2 + ThreadTileA: 24 + ThreadTileB: 2 + ThreadTileMXSA: 24 + ThreadTileMXSB: 2 + TransposeLDS: 1 + TransposeLDSMetadata: -1 + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMXSA: true + UnrollMajorLDSMXSB: true + Use64bShadowLimit: false + Use64bShadowLimitMX: false + UseCustomMainLoopSchedule: 0 + UseDirect32XEmulation: false + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: false + UseGeneralizedNLCOneA: false + UseGeneralizedNLCOneB: false + UseGeneralizedNLCOneMXSA: false + UseGeneralizedNLCOneMXSB: false + UseGeneralizedNLCOneMetadata: false + UseInstOffsetForGRO: 0 + UseMFMAF32XEmulation: false + UsePLRPack: 0 + UseSgprForGRO: -1 + UseSubtileImpl: true + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 1 + VectorWidthMXSA: 1 + VectorWidthMXSB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMXSA: 0 + WaveSeparateGlobalReadMXSB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 16 + WorkGroupMappingXCC: 2 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 256 + _DepthUA: 256 + _DepthUB: 256 + _DepthUMXSA: 8 + _DepthUMXSB: 8 + _DepthUMetadata: 256 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + enableLDSTrMXSA: false + enableLDSTrMXSB: false + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + tailLoopOptMXSA: false + tailLoopOptMXSB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: false 
+ ActivationFused: true + AdaptiveGemm: 0 + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1DivByMT1LowbitGT1: 0 + AssertFree1ElementMultiple: 1 + AssertKRingShiftTailWrapOnly: 0 + AssertSummationElementMultiple: 256 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BAddrInterleave: false + BaseName: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT64sCK37FdA-O3yvoTtVtg3QFjEpjbAv0RNmVuI_Ry968A= + BufferLoad: true + BufferStore: 1 + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 256 + DirectToLds: 1 + DirectToLdsA: true + DirectToLdsB: true + DirectToLdsMXSA: true + DirectToLdsMXSB: true + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprMXSA: false + DirectToVgprMXSB: false + DirectToVgprSparseMetadata: false + DtlPlusLdsBuf: 0 + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: false + ExpertSchedulingMode: 0 + ExtraLatencyForLR: 0 + ExtraMiLatencyLeft: -1 + ForceDisableShadowInit: false + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 32 + GlobalReadVectorWidthB: 32 + GlobalReadVectorWidthMXSA: 2 + GlobalReadVectorWidthMXSB: 4 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + KRingShift: false + Kernel: true + KernelLanguage: Assembly + KernelNameMin: 
Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT64x256x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT2_8_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 256 + LSCB: 256 + LSCMXSA: 8 + LSCMXSB: 8 + LSPA: 32 + LSPB: 32 + LSPMXSA: 64 + LSPMXSB: 128 + LVCA: 8 + LVCB: 8 + LVCMXSA: 4 + LVCMXSB: 2 + LVPA: 1 + LVPB: 1 + LVPMXSA: 32 + LVPMXSB: 32 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsBlockSizePerPadMXSA: 0 + LdsBlockSizePerPadMXSB: 0 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 109056 + LdsInitCVgprs: false + LdsNumBytes: 109056 + LdsNumElementsAlignedA: 8192 + LdsNumElementsAlignedB: 32768 + LdsNumElementsAlignedMXSA: 512 + LdsNumElementsAlignedMXSB: 2048 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 10752 + LdsOffsetB_Blk: 76288 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMXSA: 8192 + LdsOffsetMXSA_Blk: 73728 + LdsOffsetMXSB: 8704 + LdsOffsetMXSB_Blk: 74240 + LdsOffsetMetadata: 10752 + LdsOffsetMetadata_Blk: 76288 + LdsPadA: 0 + LdsPadB: 0 + LdsPadMXSA: 0 + LdsPadMXSB: 0 + LdsPadMetadata: 0 + LocalReadVectorWidth: -1 + LocalReadVectorWidthA: 32 + LocalReadVectorWidthB: 32 + LocalReadVectorWidthMXSA: 1 + LocalReadVectorWidthMXSB: 1 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LocalWriteUseSgprMXSA: true + LocalWriteUseSgprMXSB: true + LoopIters: 2 + LoopUnroll: 256 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + 
MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMXSA: 1 + MIInputPerThreadMXSB: 1 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [2, 8] + MIWaveTileA: 2 + MIWaveTileB: 8 + MIWaveTileMXSA: 2 + MIWaveTileMXSB: 8 + MIWaveTileMetadata: 0 + MacroTile0: 64 + MacroTile1: 256 + MacroTileA: 64 + MacroTileB: 256 + MacroTileMXSA: 64 + MacroTileMXSB: 256 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + MinGRIncPerMfma: 3 + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonDTLTailLoopMXSA: true + NonDTLTailLoopMXSB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 4 + NonTemporalC: 0 + NonTemporalD: 4 + NonTemporalE: 0 + NonTemporalMXSA: 0 + NonTemporalMXSB: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 64 + NumLdsBlk: 2 + NumLoadsA: 2 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsCoalescedMXSA: 1 + NumLoadsCoalescedMXSB: 1 + NumLoadsMXSA: 1 + NumLoadsMXSB: 2 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 8 + NumLoadsPerpendicularMXSA: 1 + NumLoadsPerpendicularMXSB: 2 + NumThreads: 256 + NumTotalPackedLoadsA: -1 + NumTotalPackedLoadsB: -1 + NumTotalPackedLoadsMXSA: -1 + NumTotalPackedLoadsMXSB: -1 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGROverBarrier: 0 + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 8 + SolutionNameMin: 
Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT64x256x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT2_8_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM16_WGMXCC2_WGMXCCGn1 + SourceSwap: false + SpaceFillingAlgo: [] + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 4 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SubGroupMXSA: 8 + SubGroupMXSB: 32 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: 0 + TailloopInNll: false + ThreadTile: [1, 1] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + ThreadTileMXSA: 8 + ThreadTileMXSB: 8 + TransposeLDS: 1 + TransposeLDSMetadata: -1 + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMXSA: true + UnrollMajorLDSMXSB: true + Use64bShadowLimit: false + Use64bShadowLimitMX: false + UseCustomMainLoopSchedule: 0 + UseDirect32XEmulation: false + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: false + UseGeneralizedNLCOneA: false + UseGeneralizedNLCOneB: false + UseGeneralizedNLCOneMXSA: false + UseGeneralizedNLCOneMXSB: false + UseGeneralizedNLCOneMetadata: false + UseInstOffsetForGRO: 0 + UseMFMAF32XEmulation: false + UsePLRPack: 0 + UseSgprForGRO: -1 + UseSubtileImpl: true + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 1 + 
VectorWidthMXSA: 1 + VectorWidthMXSB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMXSA: 0 + WaveSeparateGlobalReadMXSB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 16 + WorkGroupMappingXCC: 2 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 256 + _DepthUA: 256 + _DepthUB: 256 + _DepthUMXSA: 8 + _DepthUMXSB: 8 + _DepthUMetadata: 256 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + enableLDSTrMXSA: false + enableLDSTrMXSB: false + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + tailLoopOptMXSA: false + tailLoopOptMXSB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AdaptiveGemm: 0 + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1DivByMT1LowbitGT1: 0 + AssertFree1ElementMultiple: 1 + AssertKRingShiftTailWrapOnly: 0 + AssertSummationElementMultiple: 256 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BAddrInterleave: false + BaseName: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT25Tmym63Cp1OVwmuklAuBYLzHtSt0oUezVbK-tmV8hqYs= + BufferLoad: true + BufferStore: 1 + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 256 + DirectToLds: 1 + DirectToLdsA: true + DirectToLdsB: true + DirectToLdsMXSA: true + DirectToLdsMXSB: true + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprMXSA: false + DirectToVgprMXSB: false + DirectToVgprSparseMetadata: false + DtlPlusLdsBuf: 0 
+ EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: false + ExpertSchedulingMode: 0 + ExtraLatencyForLR: 0 + ExtraMiLatencyLeft: -1 + ForceDisableShadowInit: false + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 32 + GlobalReadVectorWidthB: 32 + GlobalReadVectorWidthMXSA: 4 + GlobalReadVectorWidthMXSB: 2 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + KRingShift: false + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT256x64x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT8_2_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 256 + LSCB: 256 + LSCMXSA: 8 + LSCMXSB: 8 + LSPA: 32 + LSPB: 32 + LSPMXSA: 128 + LSPMXSB: 64 + LVCA: 8 + LVCB: 8 + LVCMXSA: 2 + LVCMXSB: 4 + LVPA: 1 + LVPB: 1 + LVPMXSA: 32 + LVPMXSB: 32 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsBlockSizePerPadMXSA: 0 + LdsBlockSizePerPadMXSB: 0 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 109056 + LdsInitCVgprs: false + LdsNumBytes: 109056 + LdsNumElementsAlignedA: 32768 + 
LdsNumElementsAlignedB: 8192 + LdsNumElementsAlignedMXSA: 2048 + LdsNumElementsAlignedMXSB: 512 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 35328 + LdsOffsetB_Blk: 100864 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMXSA: 32768 + LdsOffsetMXSA_Blk: 98304 + LdsOffsetMXSB: 34816 + LdsOffsetMXSB_Blk: 100352 + LdsOffsetMetadata: 35328 + LdsOffsetMetadata_Blk: 100864 + LdsPadA: 0 + LdsPadB: 0 + LdsPadMXSA: 0 + LdsPadMXSB: 0 + LdsPadMetadata: 0 + LocalReadVectorWidth: -1 + LocalReadVectorWidthA: 32 + LocalReadVectorWidthB: 32 + LocalReadVectorWidthMXSA: 1 + LocalReadVectorWidthMXSB: 1 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LocalWriteUseSgprMXSA: true + LocalWriteUseSgprMXSB: true + LoopIters: 2 + LoopUnroll: 256 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMXSA: 1 + MIInputPerThreadMXSB: 1 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [8, 2] + MIWaveTileA: 8 + MIWaveTileB: 2 + MIWaveTileMXSA: 8 + MIWaveTileMXSB: 2 + MIWaveTileMetadata: 0 + MacroTile0: 256 + MacroTile1: 64 + MacroTileA: 256 + MacroTileB: 64 + MacroTileMXSA: 256 + MacroTileMXSB: 64 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + MinGRIncPerMfma: 3 + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonDTLTailLoopMXSA: true + NonDTLTailLoopMXSB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 4 + NonTemporalC: 0 + NonTemporalD: 4 + NonTemporalE: 0 + NonTemporalMXSA: 0 + 
NonTemporalMXSB: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 64 + NumLdsBlk: 2 + NumLoadsA: 8 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsCoalescedMXSA: 1 + NumLoadsCoalescedMXSB: 1 + NumLoadsMXSA: 2 + NumLoadsMXSB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularMXSA: 2 + NumLoadsPerpendicularMXSB: 1 + NumThreads: 256 + NumTotalPackedLoadsA: -1 + NumTotalPackedLoadsB: -1 + NumTotalPackedLoadsMXSA: -1 + NumTotalPackedLoadsMXSB: -1 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGROverBarrier: 0 + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 9 + SolutionNameMin: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT256x64x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT8_2_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM16_WGMXCC2_WGMXCCGn1 + SourceSwap: false + SpaceFillingAlgo: [] + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 4 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SubGroupMXSA: 8 + SubGroupMXSB: 32 + 
SuppressNoLoadLoop: false + SwapGlobalReadOrder: 0 + TailloopInNll: false + ThreadTile: [1, 1] + ThreadTile0: 32 + ThreadTile1: 2 + ThreadTileA: 32 + ThreadTileB: 2 + ThreadTileMXSA: 32 + ThreadTileMXSB: 2 + TransposeLDS: 1 + TransposeLDSMetadata: -1 + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMXSA: true + UnrollMajorLDSMXSB: true + Use64bShadowLimit: false + Use64bShadowLimitMX: false + UseCustomMainLoopSchedule: 0 + UseDirect32XEmulation: false + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: false + UseGeneralizedNLCOneA: false + UseGeneralizedNLCOneB: false + UseGeneralizedNLCOneMXSA: false + UseGeneralizedNLCOneMXSB: false + UseGeneralizedNLCOneMetadata: false + UseInstOffsetForGRO: 0 + UseMFMAF32XEmulation: false + UsePLRPack: 0 + UseSgprForGRO: -1 + UseSubtileImpl: true + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 1 + VectorWidthMXSA: 1 + VectorWidthMXSB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMXSA: 0 + WaveSeparateGlobalReadMXSB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 16 + WorkGroupMappingXCC: 2 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 256 + _DepthUA: 256 + _DepthUB: 256 + _DepthUMXSA: 8 + _DepthUMXSB: 8 + _DepthUMetadata: 256 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + enableLDSTrMXSA: false + enableLDSTrMXSB: false + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + tailLoopOptMXSA: false + tailLoopOptMXSB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: false 
+ ActivationFused: true + AdaptiveGemm: 0 + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1DivByMT1LowbitGT1: 0 + AssertFree1ElementMultiple: 1 + AssertKRingShiftTailWrapOnly: 0 + AssertSummationElementMultiple: 256 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BAddrInterleave: false + BaseName: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT12Qs31lTSyFaUbDqXQHGpUViUgkCHLEWWfNifVlN3xPcA= + BufferLoad: true + BufferStore: 1 + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 256 + DirectToLds: 1 + DirectToLdsA: true + DirectToLdsB: true + DirectToLdsMXSA: true + DirectToLdsMXSB: true + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprMXSA: false + DirectToVgprMXSB: false + DirectToVgprSparseMetadata: false + DtlPlusLdsBuf: 0 + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: false + ExpertSchedulingMode: 0 + ExtraLatencyForLR: 0 + ExtraMiLatencyLeft: -1 + ForceDisableShadowInit: false + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 32 + GlobalReadVectorWidthB: 32 + GlobalReadVectorWidthMXSA: 4 + GlobalReadVectorWidthMXSB: 6 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + KRingShift: false + Kernel: true + KernelLanguage: Assembly + KernelNameMin: 
Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT128x192x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT4_6_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 256 + LSCB: 256 + LSCMXSA: 8 + LSCMXSB: 8 + LSPA: 32 + LSPB: 32 + LSPMXSA: 128 + LSPMXSB: 192 + LVCA: 8 + LVCB: 8 + LVCMXSA: 2 + LVCMXSB: 2 + LVPA: 1 + LVPB: 1 + LVPMXSA: 32 + LVPMXSB: 32 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsBlockSizePerPadMXSA: 0 + LdsBlockSizePerPadMXSB: 0 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 109056 + LdsInitCVgprs: false + LdsNumBytes: 109056 + LdsNumElementsAlignedA: 16384 + LdsNumElementsAlignedB: 24576 + LdsNumElementsAlignedMXSA: 1024 + LdsNumElementsAlignedMXSB: 1536 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 18944 + LdsOffsetB_Blk: 84480 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMXSA: 16384 + LdsOffsetMXSA_Blk: 81920 + LdsOffsetMXSB: 17408 + LdsOffsetMXSB_Blk: 82944 + LdsOffsetMetadata: 18944 + LdsOffsetMetadata_Blk: 84480 + LdsPadA: 0 + LdsPadB: 0 + LdsPadMXSA: 0 + LdsPadMXSB: 0 + LdsPadMetadata: 0 + LocalReadVectorWidth: -1 + LocalReadVectorWidthA: 32 + LocalReadVectorWidthB: 32 + LocalReadVectorWidthMXSA: 1 + LocalReadVectorWidthMXSB: 1 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LocalWriteUseSgprMXSA: true + LocalWriteUseSgprMXSB: true + LoopIters: 2 + LoopUnroll: 256 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + 
MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMXSA: 1 + MIInputPerThreadMXSB: 1 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 6] + MIWaveTileA: 4 + MIWaveTileB: 6 + MIWaveTileMXSA: 4 + MIWaveTileMXSB: 6 + MIWaveTileMetadata: 0 + MacroTile0: 128 + MacroTile1: 192 + MacroTileA: 128 + MacroTileB: 192 + MacroTileMXSA: 128 + MacroTileMXSB: 192 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + MinGRIncPerMfma: 3 + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonDTLTailLoopMXSA: true + NonDTLTailLoopMXSB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 4 + NonTemporalC: 0 + NonTemporalD: 4 + NonTemporalE: 0 + NonTemporalMXSA: 0 + NonTemporalMXSB: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 96 + NumGlobalWriteVectorsPerThread: 96 + NumLdsBlk: 2 + NumLoadsA: 4 + NumLoadsB: 6 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsCoalescedMXSA: 1 + NumLoadsCoalescedMXSB: 1 + NumLoadsMXSA: 1 + NumLoadsMXSB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 6 + NumLoadsPerpendicularMXSA: 1 + NumLoadsPerpendicularMXSB: 1 + NumThreads: 256 + NumTotalPackedLoadsA: -1 + NumTotalPackedLoadsB: -1 + NumTotalPackedLoadsMXSA: -1 + NumTotalPackedLoadsMXSB: -1 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGROverBarrier: 0 + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 10 + 
SolutionNameMin: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT128x192x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT4_6_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM16_WGMXCC2_WGMXCCGn1 + SourceSwap: false + SpaceFillingAlgo: [] + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 4 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SubGroupMXSA: 8 + SubGroupMXSB: 32 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: 0 + TailloopInNll: false + ThreadTile: [1, 1] + ThreadTile0: 16 + ThreadTile1: 6 + ThreadTileA: 16 + ThreadTileB: 6 + ThreadTileMXSA: 16 + ThreadTileMXSB: 6 + TransposeLDS: 1 + TransposeLDSMetadata: -1 + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMXSA: true + UnrollMajorLDSMXSB: true + Use64bShadowLimit: false + Use64bShadowLimitMX: false + UseCustomMainLoopSchedule: 0 + UseDirect32XEmulation: false + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: false + UseGeneralizedNLCOneA: false + UseGeneralizedNLCOneB: false + UseGeneralizedNLCOneMXSA: false + UseGeneralizedNLCOneMXSB: false + UseGeneralizedNLCOneMetadata: false + UseInstOffsetForGRO: 0 + UseMFMAF32XEmulation: false + UsePLRPack: 0 + UseSgprForGRO: -1 + UseSubtileImpl: true + Valid: true + VectorStore: -1 + VectorWidthA: 1 + 
VectorWidthB: 1 + VectorWidthMXSA: 1 + VectorWidthMXSB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMXSA: 0 + WaveSeparateGlobalReadMXSB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 16 + WorkGroupMappingXCC: 2 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 256 + _DepthUA: 256 + _DepthUB: 256 + _DepthUMXSA: 8 + _DepthUMXSB: 8 + _DepthUMetadata: 256 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + enableLDSTrMXSA: false + enableLDSTrMXSB: false + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + tailLoopOptMXSA: false + tailLoopOptMXSB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AdaptiveGemm: 0 + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1DivByMT1LowbitGT1: 0 + AssertFree1ElementMultiple: 1 + AssertKRingShiftTailWrapOnly: 0 + AssertSummationElementMultiple: 256 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BAddrInterleave: false + BaseName: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT191Q3CWl85SU-4agRfSbxDDRYJaTIDmYF_adG8G4wdfNo= + BufferLoad: true + BufferStore: 1 + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 256 + DirectToLds: 1 + DirectToLdsA: true + DirectToLdsB: true + DirectToLdsMXSA: true + DirectToLdsMXSB: true + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprMXSA: false + DirectToVgprMXSB: false + DirectToVgprSparseMetadata: false + 
DtlPlusLdsBuf: 0 + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: false + ExpertSchedulingMode: 0 + ExtraLatencyForLR: 0 + ExtraMiLatencyLeft: -1 + ForceDisableShadowInit: false + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 32 + GlobalReadVectorWidthB: 32 + GlobalReadVectorWidthMXSA: 6 + GlobalReadVectorWidthMXSB: 4 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + KRingShift: false + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT192x128x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT6_4_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 256 + LSCB: 256 + LSCMXSA: 8 + LSCMXSB: 8 + LSPA: 32 + LSPB: 32 + LSPMXSA: 192 + LSPMXSB: 128 + LVCA: 8 + LVCB: 8 + LVCMXSA: 2 + LVCMXSB: 2 + LVPA: 1 + LVPB: 1 + LVPMXSA: 32 + LVPMXSB: 32 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsBlockSizePerPadMXSA: 0 + LdsBlockSizePerPadMXSB: 0 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 109056 + LdsInitCVgprs: false + LdsNumBytes: 109056 + 
LdsNumElementsAlignedA: 24576 + LdsNumElementsAlignedB: 16384 + LdsNumElementsAlignedMXSA: 1536 + LdsNumElementsAlignedMXSB: 1024 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 27136 + LdsOffsetB_Blk: 92672 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMXSA: 24576 + LdsOffsetMXSA_Blk: 90112 + LdsOffsetMXSB: 26112 + LdsOffsetMXSB_Blk: 91648 + LdsOffsetMetadata: 27136 + LdsOffsetMetadata_Blk: 92672 + LdsPadA: 0 + LdsPadB: 0 + LdsPadMXSA: 0 + LdsPadMXSB: 0 + LdsPadMetadata: 0 + LocalReadVectorWidth: -1 + LocalReadVectorWidthA: 32 + LocalReadVectorWidthB: 32 + LocalReadVectorWidthMXSA: 1 + LocalReadVectorWidthMXSB: 1 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LocalWriteUseSgprMXSA: true + LocalWriteUseSgprMXSB: true + LoopIters: 2 + LoopUnroll: 256 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMXSA: 1 + MIInputPerThreadMXSB: 1 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [6, 4] + MIWaveTileA: 6 + MIWaveTileB: 4 + MIWaveTileMXSA: 6 + MIWaveTileMXSB: 4 + MIWaveTileMetadata: 0 + MacroTile0: 192 + MacroTile1: 128 + MacroTileA: 192 + MacroTileB: 128 + MacroTileMXSA: 192 + MacroTileMXSB: 128 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + MinGRIncPerMfma: 3 + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonDTLTailLoopMXSA: true + NonDTLTailLoopMXSB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 4 + NonTemporalC: 0 + NonTemporalD: 4 + NonTemporalE: 
0 + NonTemporalMXSA: 0 + NonTemporalMXSB: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 96 + NumGlobalWriteVectorsPerThread: 96 + NumLdsBlk: 2 + NumLoadsA: 6 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsCoalescedMXSA: 1 + NumLoadsCoalescedMXSB: 1 + NumLoadsMXSA: 1 + NumLoadsMXSB: 1 + NumLoadsPerpendicularA: 6 + NumLoadsPerpendicularB: 4 + NumLoadsPerpendicularMXSA: 1 + NumLoadsPerpendicularMXSB: 1 + NumThreads: 256 + NumTotalPackedLoadsA: -1 + NumTotalPackedLoadsB: -1 + NumTotalPackedLoadsMXSA: -1 + NumTotalPackedLoadsMXSB: -1 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGROverBarrier: 0 + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 11 + SolutionNameMin: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT192x128x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT6_4_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM16_WGMXCC2_WGMXCCGn1 + SourceSwap: false + SpaceFillingAlgo: [] + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 4 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + 
SubGroupMXSA: 8 + SubGroupMXSB: 32 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: 0 + TailloopInNll: false + ThreadTile: [1, 1] + ThreadTile0: 24 + ThreadTile1: 4 + ThreadTileA: 24 + ThreadTileB: 4 + ThreadTileMXSA: 24 + ThreadTileMXSB: 4 + TransposeLDS: 1 + TransposeLDSMetadata: -1 + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMXSA: true + UnrollMajorLDSMXSB: true + Use64bShadowLimit: false + Use64bShadowLimitMX: false + UseCustomMainLoopSchedule: 0 + UseDirect32XEmulation: false + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: false + UseGeneralizedNLCOneA: false + UseGeneralizedNLCOneB: false + UseGeneralizedNLCOneMXSA: false + UseGeneralizedNLCOneMXSB: false + UseGeneralizedNLCOneMetadata: false + UseInstOffsetForGRO: 0 + UseMFMAF32XEmulation: false + UsePLRPack: 0 + UseSgprForGRO: -1 + UseSubtileImpl: true + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 1 + VectorWidthMXSA: 1 + VectorWidthMXSB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMXSA: 0 + WaveSeparateGlobalReadMXSB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 16 + WorkGroupMappingXCC: 2 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 256 + _DepthUA: 256 + _DepthUB: 256 + _DepthUMXSA: 8 + _DepthUMXSB: 8 + _DepthUMetadata: 256 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + enableLDSTrMXSA: false + enableLDSTrMXSB: false + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + tailLoopOptMXSA: false + tailLoopOptMXSB: false + - 1LDSBuffer: 0 + 
ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AdaptiveGemm: 0 + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1DivByMT1LowbitGT1: 0 + AssertFree1ElementMultiple: 1 + AssertKRingShiftTailWrapOnly: 0 + AssertSummationElementMultiple: 256 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BAddrInterleave: false + BaseName: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT12kwdCHpnp9d41ufLMEwnWbNNXbmlKzUrCttuc0YSlgSY= + BufferLoad: true + BufferStore: 1 + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 256 + DirectToLds: 1 + DirectToLdsA: true + DirectToLdsB: true + DirectToLdsMXSA: true + DirectToLdsMXSB: true + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprMXSA: false + DirectToVgprMXSB: false + DirectToVgprSparseMetadata: false + DtlPlusLdsBuf: 0 + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: false + ExpertSchedulingMode: 0 + ExtraLatencyForLR: 0 + ExtraMiLatencyLeft: -1 + ForceDisableShadowInit: false + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 32 + GlobalReadVectorWidthB: 32 + GlobalReadVectorWidthMXSA: 4 + GlobalReadVectorWidthMXSB: 4 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + KRingShift: false + Kernel: true + KernelLanguage: Assembly + KernelNameMin: 
Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT128x256x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT4_8_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 256 + LSCB: 256 + LSCMXSA: 8 + LSCMXSB: 8 + LSPA: 32 + LSPB: 32 + LSPMXSA: 128 + LSPMXSB: 128 + LVCA: 8 + LVCB: 8 + LVCMXSA: 2 + LVCMXSB: 2 + LVPA: 1 + LVPB: 1 + LVPMXSA: 32 + LVPMXSB: 32 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsBlockSizePerPadMXSA: 0 + LdsBlockSizePerPadMXSB: 0 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 117760 + LdsInitCVgprs: false + LdsNumBytes: 117760 + LdsNumElementsAlignedA: 16384 + LdsNumElementsAlignedB: 32768 + LdsNumElementsAlignedMXSA: 1024 + LdsNumElementsAlignedMXSB: 2048 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 19456 + LdsOffsetB_Blk: 84992 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMXSA: 16384 + LdsOffsetMXSA_Blk: 81920 + LdsOffsetMXSB: 17408 + LdsOffsetMXSB_Blk: 82944 + LdsOffsetMetadata: 19456 + LdsOffsetMetadata_Blk: 84992 + LdsPadA: 0 + LdsPadB: 0 + LdsPadMXSA: 0 + LdsPadMXSB: 0 + LdsPadMetadata: 0 + LocalReadVectorWidth: -1 + LocalReadVectorWidthA: 32 + LocalReadVectorWidthB: 32 + LocalReadVectorWidthMXSA: 1 + LocalReadVectorWidthMXSB: 1 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LocalWriteUseSgprMXSA: true + LocalWriteUseSgprMXSB: true + LoopIters: 2 + LoopUnroll: 256 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + 
MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMXSA: 1 + MIInputPerThreadMXSB: 1 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 8] + MIWaveTileA: 4 + MIWaveTileB: 8 + MIWaveTileMXSA: 4 + MIWaveTileMXSB: 8 + MIWaveTileMetadata: 0 + MacroTile0: 128 + MacroTile1: 256 + MacroTileA: 128 + MacroTileB: 256 + MacroTileMXSA: 128 + MacroTileMXSB: 256 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + MinGRIncPerMfma: 3 + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonDTLTailLoopMXSA: true + NonDTLTailLoopMXSB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 4 + NonTemporalC: 0 + NonTemporalD: 4 + NonTemporalE: 0 + NonTemporalMXSA: 0 + NonTemporalMXSB: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 128 + NumLdsBlk: 2 + NumLoadsA: 4 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsCoalescedMXSA: 1 + NumLoadsCoalescedMXSB: 1 + NumLoadsMXSA: 1 + NumLoadsMXSB: 2 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 8 + NumLoadsPerpendicularMXSA: 1 + NumLoadsPerpendicularMXSB: 2 + NumThreads: 256 + NumTotalPackedLoadsA: -1 + NumTotalPackedLoadsB: -1 + NumTotalPackedLoadsMXSA: -1 + NumTotalPackedLoadsMXSB: -1 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGROverBarrier: 0 + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 12 + 
SolutionNameMin: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT128x256x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT4_8_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM16_WGMXCC2_WGMXCCGn1 + SourceSwap: false + SpaceFillingAlgo: [] + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 4 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SubGroupMXSA: 8 + SubGroupMXSB: 32 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: 0 + TailloopInNll: false + ThreadTile: [1, 1] + ThreadTile0: 16 + ThreadTile1: 8 + ThreadTileA: 16 + ThreadTileB: 8 + ThreadTileMXSA: 16 + ThreadTileMXSB: 8 + TransposeLDS: 1 + TransposeLDSMetadata: -1 + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMXSA: true + UnrollMajorLDSMXSB: true + Use64bShadowLimit: false + Use64bShadowLimitMX: false + UseCustomMainLoopSchedule: 0 + UseDirect32XEmulation: false + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: false + UseGeneralizedNLCOneA: false + UseGeneralizedNLCOneB: false + UseGeneralizedNLCOneMXSA: false + UseGeneralizedNLCOneMXSB: false + UseGeneralizedNLCOneMetadata: false + UseInstOffsetForGRO: 0 + UseMFMAF32XEmulation: false + UsePLRPack: 0 + UseSgprForGRO: -1 + UseSubtileImpl: true + Valid: true + VectorStore: -1 + VectorWidthA: 1 + 
VectorWidthB: 1 + VectorWidthMXSA: 1 + VectorWidthMXSB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMXSA: 0 + WaveSeparateGlobalReadMXSB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 16 + WorkGroupMappingXCC: 2 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 256 + _DepthUA: 256 + _DepthUB: 256 + _DepthUMXSA: 8 + _DepthUMXSB: 8 + _DepthUMetadata: 256 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + enableLDSTrMXSA: false + enableLDSTrMXSB: false + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + tailLoopOptMXSA: false + tailLoopOptMXSB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AdaptiveGemm: 0 + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1DivByMT1LowbitGT1: 0 + AssertFree1ElementMultiple: 1 + AssertKRingShiftTailWrapOnly: 0 + AssertSummationElementMultiple: 256 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BAddrInterleave: false + BaseName: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT25s2SqPqjqQTrutnl4Fjsck3utFxYLnrt2GxRGoMabbq0= + BufferLoad: true + BufferStore: 1 + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 256 + DirectToLds: 1 + DirectToLdsA: true + DirectToLdsB: true + DirectToLdsMXSA: true + DirectToLdsMXSB: true + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprMXSA: false + DirectToVgprMXSB: false + DirectToVgprSparseMetadata: false + 
DtlPlusLdsBuf: 0 + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: false + ExpertSchedulingMode: 0 + ExtraLatencyForLR: 0 + ExtraMiLatencyLeft: -1 + ForceDisableShadowInit: false + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 32 + GlobalReadVectorWidthB: 32 + GlobalReadVectorWidthMXSA: 4 + GlobalReadVectorWidthMXSB: 4 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + KRingShift: false + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT256x128x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT8_4_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 256 + LSCB: 256 + LSCMXSA: 8 + LSCMXSB: 8 + LSPA: 32 + LSPB: 32 + LSPMXSA: 128 + LSPMXSB: 128 + LVCA: 8 + LVCB: 8 + LVCMXSA: 2 + LVCMXSB: 2 + LVPA: 1 + LVPB: 1 + LVPMXSA: 32 + LVPMXSB: 32 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsBlockSizePerPadMXSA: 0 + LdsBlockSizePerPadMXSB: 0 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 117760 + LdsInitCVgprs: false + LdsNumBytes: 117760 + 
LdsNumElementsAlignedA: 32768 + LdsNumElementsAlignedB: 16384 + LdsNumElementsAlignedMXSA: 2048 + LdsNumElementsAlignedMXSB: 1024 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 35840 + LdsOffsetB_Blk: 101376 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMXSA: 32768 + LdsOffsetMXSA_Blk: 98304 + LdsOffsetMXSB: 34816 + LdsOffsetMXSB_Blk: 100352 + LdsOffsetMetadata: 35840 + LdsOffsetMetadata_Blk: 101376 + LdsPadA: 0 + LdsPadB: 0 + LdsPadMXSA: 0 + LdsPadMXSB: 0 + LdsPadMetadata: 0 + LocalReadVectorWidth: -1 + LocalReadVectorWidthA: 32 + LocalReadVectorWidthB: 32 + LocalReadVectorWidthMXSA: 1 + LocalReadVectorWidthMXSB: 1 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LocalWriteUseSgprMXSA: true + LocalWriteUseSgprMXSB: true + LoopIters: 2 + LoopUnroll: 256 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMXSA: 1 + MIInputPerThreadMXSB: 1 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [8, 4] + MIWaveTileA: 8 + MIWaveTileB: 4 + MIWaveTileMXSA: 8 + MIWaveTileMXSB: 4 + MIWaveTileMetadata: 0 + MacroTile0: 256 + MacroTile1: 128 + MacroTileA: 256 + MacroTileB: 128 + MacroTileMXSA: 256 + MacroTileMXSB: 128 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + MinGRIncPerMfma: 3 + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonDTLTailLoopMXSA: true + NonDTLTailLoopMXSB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 4 + NonTemporalC: 0 + NonTemporalD: 4 + 
NonTemporalE: 0 + NonTemporalMXSA: 0 + NonTemporalMXSB: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 128 + NumLdsBlk: 2 + NumLoadsA: 8 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsCoalescedMXSA: 1 + NumLoadsCoalescedMXSB: 1 + NumLoadsMXSA: 2 + NumLoadsMXSB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 4 + NumLoadsPerpendicularMXSA: 2 + NumLoadsPerpendicularMXSB: 1 + NumThreads: 256 + NumTotalPackedLoadsA: -1 + NumTotalPackedLoadsB: -1 + NumTotalPackedLoadsMXSA: -1 + NumTotalPackedLoadsMXSB: -1 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGROverBarrier: 0 + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 13 + SolutionNameMin: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT256x128x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT8_4_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM16_WGMXCC2_WGMXCCGn1 + SourceSwap: false + SpaceFillingAlgo: [] + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 4 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 
32 + SubGroupMXSA: 8 + SubGroupMXSB: 32 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: 0 + TailloopInNll: false + ThreadTile: [1, 1] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + ThreadTileMXSA: 32 + ThreadTileMXSB: 4 + TransposeLDS: 1 + TransposeLDSMetadata: -1 + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMXSA: true + UnrollMajorLDSMXSB: true + Use64bShadowLimit: false + Use64bShadowLimitMX: false + UseCustomMainLoopSchedule: 0 + UseDirect32XEmulation: false + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: false + UseGeneralizedNLCOneA: false + UseGeneralizedNLCOneB: false + UseGeneralizedNLCOneMXSA: false + UseGeneralizedNLCOneMXSB: false + UseGeneralizedNLCOneMetadata: false + UseInstOffsetForGRO: 0 + UseMFMAF32XEmulation: false + UsePLRPack: 0 + UseSgprForGRO: -1 + UseSubtileImpl: true + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 1 + VectorWidthMXSA: 1 + VectorWidthMXSB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMXSA: 0 + WaveSeparateGlobalReadMXSB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 16 + WorkGroupMappingXCC: 2 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 256 + _DepthUA: 256 + _DepthUB: 256 + _DepthUMXSA: 8 + _DepthUMXSB: 8 + _DepthUMetadata: 256 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + enableLDSTrMXSA: false + enableLDSTrMXSB: false + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + tailLoopOptMXSA: false + tailLoopOptMXSB: false + - 1LDSBuffer: 0 + 
ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AdaptiveGemm: 0 + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1DivByMT1LowbitGT1: 0 + AssertFree1ElementMultiple: 1 + AssertKRingShiftTailWrapOnly: 0 + AssertSummationElementMultiple: 256 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BAddrInterleave: false + BaseName: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT19--ULatdGkaEyMxugPoO0m7KB92W8g0H0S_IE47lq-mE= + BufferLoad: true + BufferStore: 1 + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 256 + DirectToLds: 1 + DirectToLdsA: true + DirectToLdsB: true + DirectToLdsMXSA: true + DirectToLdsMXSB: true + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprMXSA: false + DirectToVgprMXSB: false + DirectToVgprSparseMetadata: false + DtlPlusLdsBuf: 0 + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: false + ExpertSchedulingMode: 0 + ExtraLatencyForLR: 0 + ExtraMiLatencyLeft: -1 + ForceDisableShadowInit: false + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 32 + GlobalReadVectorWidthB: 32 + GlobalReadVectorWidthMXSA: 6 + GlobalReadVectorWidthMXSB: 4 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + KRingShift: false + Kernel: true + KernelLanguage: Assembly + KernelNameMin: 
Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT192x256x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT6_8_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 256 + LSCB: 256 + LSCMXSA: 8 + LSCMXSB: 8 + LSPA: 32 + LSPB: 32 + LSPMXSA: 192 + LSPMXSB: 128 + LVCA: 8 + LVCB: 8 + LVCMXSA: 2 + LVCMXSB: 2 + LVPA: 1 + LVPB: 1 + LVPMXSA: 32 + LVPMXSB: 32 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsBlockSizePerPadMXSA: 0 + LdsBlockSizePerPadMXSB: 0 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 126464 + LdsInitCVgprs: false + LdsNumBytes: 126464 + LdsNumElementsAlignedA: 24576 + LdsNumElementsAlignedB: 32768 + LdsNumElementsAlignedMXSA: 1536 + LdsNumElementsAlignedMXSB: 2048 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 28160 + LdsOffsetB_Blk: 93696 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMXSA: 24576 + LdsOffsetMXSA_Blk: 90112 + LdsOffsetMXSB: 26112 + LdsOffsetMXSB_Blk: 91648 + LdsOffsetMetadata: 28160 + LdsOffsetMetadata_Blk: 93696 + LdsPadA: 0 + LdsPadB: 0 + LdsPadMXSA: 0 + LdsPadMXSB: 0 + LdsPadMetadata: 0 + LocalReadVectorWidth: -1 + LocalReadVectorWidthA: 32 + LocalReadVectorWidthB: 32 + LocalReadVectorWidthMXSA: 1 + LocalReadVectorWidthMXSB: 1 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LocalWriteUseSgprMXSA: true + LocalWriteUseSgprMXSB: true + LoopIters: 2 + LoopUnroll: 256 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + 
MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMXSA: 1 + MIInputPerThreadMXSB: 1 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [6, 8] + MIWaveTileA: 6 + MIWaveTileB: 8 + MIWaveTileMXSA: 6 + MIWaveTileMXSB: 8 + MIWaveTileMetadata: 0 + MacroTile0: 192 + MacroTile1: 256 + MacroTileA: 192 + MacroTileB: 256 + MacroTileMXSA: 192 + MacroTileMXSB: 256 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + MinGRIncPerMfma: 3 + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonDTLTailLoopMXSA: true + NonDTLTailLoopMXSB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 4 + NonTemporalC: 0 + NonTemporalD: 4 + NonTemporalE: 0 + NonTemporalMXSA: 0 + NonTemporalMXSB: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 192 + NumGlobalWriteVectorsPerThread: 192 + NumLdsBlk: 2 + NumLoadsA: 6 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsCoalescedMXSA: 1 + NumLoadsCoalescedMXSB: 1 + NumLoadsMXSA: 1 + NumLoadsMXSB: 2 + NumLoadsPerpendicularA: 6 + NumLoadsPerpendicularB: 8 + NumLoadsPerpendicularMXSA: 1 + NumLoadsPerpendicularMXSB: 2 + NumThreads: 256 + NumTotalPackedLoadsA: -1 + NumTotalPackedLoadsB: -1 + NumTotalPackedLoadsMXSA: -1 + NumTotalPackedLoadsMXSB: -1 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGROverBarrier: 0 + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 14 + 
SolutionNameMin: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT192x256x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT6_8_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM16_WGMXCC2_WGMXCCGn1 + SourceSwap: false + SpaceFillingAlgo: [] + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 4 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SubGroupMXSA: 8 + SubGroupMXSB: 32 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: 0 + TailloopInNll: false + ThreadTile: [1, 1] + ThreadTile0: 24 + ThreadTile1: 8 + ThreadTileA: 24 + ThreadTileB: 8 + ThreadTileMXSA: 24 + ThreadTileMXSB: 8 + TransposeLDS: 1 + TransposeLDSMetadata: -1 + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMXSA: true + UnrollMajorLDSMXSB: true + Use64bShadowLimit: false + Use64bShadowLimitMX: false + UseCustomMainLoopSchedule: 0 + UseDirect32XEmulation: false + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: false + UseGeneralizedNLCOneA: false + UseGeneralizedNLCOneB: false + UseGeneralizedNLCOneMXSA: false + UseGeneralizedNLCOneMXSB: false + UseGeneralizedNLCOneMetadata: false + UseInstOffsetForGRO: 0 + UseMFMAF32XEmulation: false + UsePLRPack: 0 + UseSgprForGRO: -1 + UseSubtileImpl: true + Valid: true + VectorStore: -1 + VectorWidthA: 1 + 
VectorWidthB: 1 + VectorWidthMXSA: 1 + VectorWidthMXSB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMXSA: 0 + WaveSeparateGlobalReadMXSB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 16 + WorkGroupMappingXCC: 2 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 256 + _DepthUA: 256 + _DepthUB: 256 + _DepthUMXSA: 8 + _DepthUMXSB: 8 + _DepthUMetadata: 256 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + enableLDSTrMXSA: false + enableLDSTrMXSB: false + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + tailLoopOptMXSA: false + tailLoopOptMXSB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AdaptiveGemm: 0 + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1DivByMT1LowbitGT1: 0 + AssertFree1ElementMultiple: 1 + AssertKRingShiftTailWrapOnly: 0 + AssertSummationElementMultiple: 256 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BAddrInterleave: false + BaseName: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT25H-6yQcCGxSKwn5gmGJAHA93oK-ekIRJm5JqDOya5ASk= + BufferLoad: true + BufferStore: 1 + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 256 + DirectToLds: 1 + DirectToLdsA: true + DirectToLdsB: true + DirectToLdsMXSA: true + DirectToLdsMXSB: true + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprMXSA: false + DirectToVgprMXSB: false + DirectToVgprSparseMetadata: false + 
DtlPlusLdsBuf: 0 + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: false + ExpertSchedulingMode: 0 + ExtraLatencyForLR: 0 + ExtraMiLatencyLeft: -1 + ForceDisableShadowInit: false + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 32 + GlobalReadVectorWidthB: 32 + GlobalReadVectorWidthMXSA: 4 + GlobalReadVectorWidthMXSB: 6 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + KRingShift: false + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT256x192x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT8_6_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 256 + LSCB: 256 + LSCMXSA: 8 + LSCMXSB: 8 + LSPA: 32 + LSPB: 32 + LSPMXSA: 128 + LSPMXSB: 192 + LVCA: 8 + LVCB: 8 + LVCMXSA: 2 + LVCMXSB: 2 + LVPA: 1 + LVPB: 1 + LVPMXSA: 32 + LVPMXSB: 32 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsBlockSizePerPadMXSA: 0 + LdsBlockSizePerPadMXSB: 0 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 126464 + LdsInitCVgprs: false + LdsNumBytes: 126464 + 
LdsNumElementsAlignedA: 32768 + LdsNumElementsAlignedB: 24576 + LdsNumElementsAlignedMXSA: 2048 + LdsNumElementsAlignedMXSB: 1536 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 36352 + LdsOffsetB_Blk: 101888 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMXSA: 32768 + LdsOffsetMXSA_Blk: 98304 + LdsOffsetMXSB: 34816 + LdsOffsetMXSB_Blk: 100352 + LdsOffsetMetadata: 36352 + LdsOffsetMetadata_Blk: 101888 + LdsPadA: 0 + LdsPadB: 0 + LdsPadMXSA: 0 + LdsPadMXSB: 0 + LdsPadMetadata: 0 + LocalReadVectorWidth: -1 + LocalReadVectorWidthA: 32 + LocalReadVectorWidthB: 32 + LocalReadVectorWidthMXSA: 1 + LocalReadVectorWidthMXSB: 1 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LocalWriteUseSgprMXSA: true + LocalWriteUseSgprMXSB: true + LoopIters: 2 + LoopUnroll: 256 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMXSA: 1 + MIInputPerThreadMXSB: 1 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [8, 6] + MIWaveTileA: 8 + MIWaveTileB: 6 + MIWaveTileMXSA: 8 + MIWaveTileMXSB: 6 + MIWaveTileMetadata: 0 + MacroTile0: 256 + MacroTile1: 192 + MacroTileA: 256 + MacroTileB: 192 + MacroTileMXSA: 256 + MacroTileMXSB: 192 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + MinGRIncPerMfma: 3 + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonDTLTailLoopMXSA: true + NonDTLTailLoopMXSB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 4 + NonTemporalC: 0 + NonTemporalD: 4 + 
NonTemporalE: 0 + NonTemporalMXSA: 0 + NonTemporalMXSB: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 192 + NumGlobalWriteVectorsPerThread: 192 + NumLdsBlk: 2 + NumLoadsA: 8 + NumLoadsB: 6 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsCoalescedMXSA: 1 + NumLoadsCoalescedMXSB: 1 + NumLoadsMXSA: 2 + NumLoadsMXSB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 6 + NumLoadsPerpendicularMXSA: 2 + NumLoadsPerpendicularMXSB: 1 + NumThreads: 256 + NumTotalPackedLoadsA: -1 + NumTotalPackedLoadsB: -1 + NumTotalPackedLoadsMXSA: -1 + NumTotalPackedLoadsMXSB: -1 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGROverBarrier: 0 + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 15 + SolutionNameMin: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT256x192x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT8_6_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM16_WGMXCC2_WGMXCCGn1 + SourceSwap: false + SpaceFillingAlgo: [] + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 4 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 
32 + SubGroupMXSA: 8 + SubGroupMXSB: 32 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: 0 + TailloopInNll: false + ThreadTile: [1, 1] + ThreadTile0: 32 + ThreadTile1: 6 + ThreadTileA: 32 + ThreadTileB: 6 + ThreadTileMXSA: 32 + ThreadTileMXSB: 6 + TransposeLDS: 1 + TransposeLDSMetadata: -1 + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMXSA: true + UnrollMajorLDSMXSB: true + Use64bShadowLimit: false + Use64bShadowLimitMX: false + UseCustomMainLoopSchedule: 0 + UseDirect32XEmulation: false + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: false + UseGeneralizedNLCOneA: false + UseGeneralizedNLCOneB: false + UseGeneralizedNLCOneMXSA: false + UseGeneralizedNLCOneMXSB: false + UseGeneralizedNLCOneMetadata: false + UseInstOffsetForGRO: 0 + UseMFMAF32XEmulation: false + UsePLRPack: 0 + UseSgprForGRO: -1 + UseSubtileImpl: true + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 1 + VectorWidthMXSA: 1 + VectorWidthMXSB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMXSA: 0 + WaveSeparateGlobalReadMXSB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 16 + WorkGroupMappingXCC: 2 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 256 + _DepthUA: 256 + _DepthUB: 256 + _DepthUMXSA: 8 + _DepthUMXSB: 8 + _DepthUMetadata: 256 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + enableLDSTrMXSA: false + enableLDSTrMXSB: false + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + tailLoopOptMXSA: false + tailLoopOptMXSB: false + - 1LDSBuffer: 0 + 
ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AdaptiveGemm: 0 + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1DivByMT1LowbitGT1: 0 + AssertFree1ElementMultiple: 1 + AssertKRingShiftTailWrapOnly: 0 + AssertSummationElementMultiple: 256 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BAddrInterleave: false + BaseName: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT649Xx58paO0aNamHRyhpgRWxjDG0kBNJ5sAnNiMy2USyI= + BufferLoad: true + BufferStore: 1 + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 256 + DirectToLds: 1 + DirectToLdsA: true + DirectToLdsB: true + DirectToLdsMXSA: true + DirectToLdsMXSB: true + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprMXSA: false + DirectToVgprMXSB: false + DirectToVgprSparseMetadata: false + DtlPlusLdsBuf: 0 + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: false + ExpertSchedulingMode: 0 + ExtraLatencyForLR: 0 + ExtraMiLatencyLeft: -1 + ForceDisableShadowInit: false + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 32 + GlobalReadVectorWidthB: 32 + GlobalReadVectorWidthMXSA: 2 + GlobalReadVectorWidthMXSB: 10 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + KRingShift: false + Kernel: true + KernelLanguage: Assembly + KernelNameMin: 
Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT64x320x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT2_10_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 256 + LSCB: 256 + LSCMXSA: 8 + LSCMXSB: 8 + LSPA: 32 + LSPB: 32 + LSPMXSA: 64 + LSPMXSB: 320 + LVCA: 8 + LVCB: 8 + LVCMXSA: 4 + LVCMXSB: 1 + LVPA: 1 + LVPB: 1 + LVPMXSA: 32 + LVPMXSB: 32 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsBlockSizePerPadMXSA: 0 + LdsBlockSizePerPadMXSB: 0 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 117760 + LdsInitCVgprs: false + LdsNumBytes: 117760 + LdsNumElementsAlignedA: 8192 + LdsNumElementsAlignedB: 40960 + LdsNumElementsAlignedMXSA: 512 + LdsNumElementsAlignedMXSB: 2560 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 11264 + LdsOffsetB_Blk: 76800 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMXSA: 8192 + LdsOffsetMXSA_Blk: 73728 + LdsOffsetMXSB: 8704 + LdsOffsetMXSB_Blk: 74240 + LdsOffsetMetadata: 11264 + LdsOffsetMetadata_Blk: 76800 + LdsPadA: 0 + LdsPadB: 0 + LdsPadMXSA: 0 + LdsPadMXSB: 0 + LdsPadMetadata: 0 + LocalReadVectorWidth: -1 + LocalReadVectorWidthA: 32 + LocalReadVectorWidthB: 32 + LocalReadVectorWidthMXSA: 1 + LocalReadVectorWidthMXSB: 1 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LocalWriteUseSgprMXSA: true + LocalWriteUseSgprMXSB: true + LoopIters: 2 + LoopUnroll: 256 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + 
MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMXSA: 1 + MIInputPerThreadMXSB: 1 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [2, 10] + MIWaveTileA: 2 + MIWaveTileB: 10 + MIWaveTileMXSA: 2 + MIWaveTileMXSB: 10 + MIWaveTileMetadata: 0 + MacroTile0: 64 + MacroTile1: 320 + MacroTileA: 64 + MacroTileB: 320 + MacroTileMXSA: 64 + MacroTileMXSB: 320 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + MinGRIncPerMfma: 3 + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonDTLTailLoopMXSA: true + NonDTLTailLoopMXSB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 4 + NonTemporalC: 0 + NonTemporalD: 4 + NonTemporalE: 0 + NonTemporalMXSA: 0 + NonTemporalMXSB: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 80 + NumGlobalWriteVectorsPerThread: 80 + NumLdsBlk: 2 + NumLoadsA: 2 + NumLoadsB: 10 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsCoalescedMXSA: 1 + NumLoadsCoalescedMXSB: 1 + NumLoadsMXSA: 1 + NumLoadsMXSB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 10 + NumLoadsPerpendicularMXSA: 1 + NumLoadsPerpendicularMXSB: 1 + NumThreads: 256 + NumTotalPackedLoadsA: -1 + NumTotalPackedLoadsB: -1 + NumTotalPackedLoadsMXSA: -1 + NumTotalPackedLoadsMXSB: -1 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGROverBarrier: 0 + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 16 + 
SolutionNameMin: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT64x320x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT2_10_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM16_WGMXCC2_WGMXCCGn1 + SourceSwap: false + SpaceFillingAlgo: [] + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 4 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SubGroupMXSA: 8 + SubGroupMXSB: 32 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: 0 + TailloopInNll: false + ThreadTile: [1, 1] + ThreadTile0: 8 + ThreadTile1: 10 + ThreadTileA: 8 + ThreadTileB: 10 + ThreadTileMXSA: 8 + ThreadTileMXSB: 10 + TransposeLDS: 1 + TransposeLDSMetadata: -1 + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMXSA: true + UnrollMajorLDSMXSB: true + Use64bShadowLimit: false + Use64bShadowLimitMX: false + UseCustomMainLoopSchedule: 0 + UseDirect32XEmulation: false + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: false + UseGeneralizedNLCOneA: false + UseGeneralizedNLCOneB: false + UseGeneralizedNLCOneMXSA: false + UseGeneralizedNLCOneMXSB: false + UseGeneralizedNLCOneMetadata: false + UseInstOffsetForGRO: 0 + UseMFMAF32XEmulation: false + UsePLRPack: 0 + UseSgprForGRO: -1 + UseSubtileImpl: true + Valid: true + VectorStore: -1 + VectorWidthA: 1 + 
VectorWidthB: 1 + VectorWidthMXSA: 1 + VectorWidthMXSB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMXSA: 0 + WaveSeparateGlobalReadMXSB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 16 + WorkGroupMappingXCC: 2 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 256 + _DepthUA: 256 + _DepthUB: 256 + _DepthUMXSA: 8 + _DepthUMXSB: 8 + _DepthUMetadata: 256 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + enableLDSTrMXSA: false + enableLDSTrMXSB: false + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + tailLoopOptMXSA: false + tailLoopOptMXSB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AdaptiveGemm: 0 + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1DivByMT1LowbitGT1: 0 + AssertFree1ElementMultiple: 1 + AssertKRingShiftTailWrapOnly: 0 + AssertSummationElementMultiple: 256 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BAddrInterleave: false + BaseName: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT32yn2lSZhgGG39-O0tN9XB-c6Nr5kcbdLKPNrKxqSEhVE= + BufferLoad: true + BufferStore: 1 + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 256 + DirectToLds: 1 + DirectToLdsA: true + DirectToLdsB: true + DirectToLdsMXSA: true + DirectToLdsMXSB: true + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprMXSA: false + DirectToVgprMXSB: false + DirectToVgprSparseMetadata: false + 
DtlPlusLdsBuf: 0 + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: false + ExpertSchedulingMode: 0 + ExtraLatencyForLR: 0 + ExtraMiLatencyLeft: -1 + ForceDisableShadowInit: false + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 32 + GlobalReadVectorWidthB: 32 + GlobalReadVectorWidthMXSA: 10 + GlobalReadVectorWidthMXSB: 2 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + KRingShift: false + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT320x64x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT10_2_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 256 + LSCB: 256 + LSCMXSA: 8 + LSCMXSB: 8 + LSPA: 32 + LSPB: 32 + LSPMXSA: 320 + LSPMXSB: 64 + LVCA: 8 + LVCB: 8 + LVCMXSA: 1 + LVCMXSB: 4 + LVPA: 1 + LVPB: 1 + LVPMXSA: 32 + LVPMXSB: 32 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsBlockSizePerPadMXSA: 0 + LdsBlockSizePerPadMXSB: 0 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 117760 + LdsInitCVgprs: false + LdsNumBytes: 117760 + 
LdsNumElementsAlignedA: 40960 + LdsNumElementsAlignedB: 8192 + LdsNumElementsAlignedMXSA: 2560 + LdsNumElementsAlignedMXSB: 512 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 44032 + LdsOffsetB_Blk: 109568 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMXSA: 40960 + LdsOffsetMXSA_Blk: 106496 + LdsOffsetMXSB: 43520 + LdsOffsetMXSB_Blk: 109056 + LdsOffsetMetadata: 44032 + LdsOffsetMetadata_Blk: 109568 + LdsPadA: 0 + LdsPadB: 0 + LdsPadMXSA: 0 + LdsPadMXSB: 0 + LdsPadMetadata: 0 + LocalReadVectorWidth: -1 + LocalReadVectorWidthA: 32 + LocalReadVectorWidthB: 32 + LocalReadVectorWidthMXSA: 1 + LocalReadVectorWidthMXSB: 1 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LocalWriteUseSgprMXSA: true + LocalWriteUseSgprMXSB: true + LoopIters: 2 + LoopUnroll: 256 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMXSA: 1 + MIInputPerThreadMXSB: 1 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [10, 2] + MIWaveTileA: 10 + MIWaveTileB: 2 + MIWaveTileMXSA: 10 + MIWaveTileMXSB: 2 + MIWaveTileMetadata: 0 + MacroTile0: 320 + MacroTile1: 64 + MacroTileA: 320 + MacroTileB: 64 + MacroTileMXSA: 320 + MacroTileMXSB: 64 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + MinGRIncPerMfma: 3 + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonDTLTailLoopMXSA: true + NonDTLTailLoopMXSB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 4 + NonTemporalC: 0 + NonTemporalD: 4 + 
NonTemporalE: 0 + NonTemporalMXSA: 0 + NonTemporalMXSB: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 80 + NumGlobalWriteVectorsPerThread: 80 + NumLdsBlk: 2 + NumLoadsA: 10 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsCoalescedMXSA: 1 + NumLoadsCoalescedMXSB: 1 + NumLoadsMXSA: 1 + NumLoadsMXSB: 1 + NumLoadsPerpendicularA: 10 + NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularMXSA: 1 + NumLoadsPerpendicularMXSB: 1 + NumThreads: 256 + NumTotalPackedLoadsA: -1 + NumTotalPackedLoadsB: -1 + NumTotalPackedLoadsMXSA: -1 + NumTotalPackedLoadsMXSB: -1 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGROverBarrier: 0 + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 17 + SolutionNameMin: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT320x64x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT10_2_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM16_WGMXCC2_WGMXCCGn1 + SourceSwap: false + SpaceFillingAlgo: [] + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 4 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 
32 + SubGroupMXSA: 8 + SubGroupMXSB: 32 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: 0 + TailloopInNll: false + ThreadTile: [1, 1] + ThreadTile0: 40 + ThreadTile1: 2 + ThreadTileA: 40 + ThreadTileB: 2 + ThreadTileMXSA: 40 + ThreadTileMXSB: 2 + TransposeLDS: 1 + TransposeLDSMetadata: -1 + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMXSA: true + UnrollMajorLDSMXSB: true + Use64bShadowLimit: false + Use64bShadowLimitMX: false + UseCustomMainLoopSchedule: 0 + UseDirect32XEmulation: false + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: false + UseGeneralizedNLCOneA: false + UseGeneralizedNLCOneB: false + UseGeneralizedNLCOneMXSA: false + UseGeneralizedNLCOneMXSB: false + UseGeneralizedNLCOneMetadata: false + UseInstOffsetForGRO: 0 + UseMFMAF32XEmulation: false + UsePLRPack: 0 + UseSgprForGRO: -1 + UseSubtileImpl: true + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 1 + VectorWidthMXSA: 1 + VectorWidthMXSB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMXSA: 0 + WaveSeparateGlobalReadMXSB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 16 + WorkGroupMappingXCC: 2 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 256 + _DepthUA: 256 + _DepthUB: 256 + _DepthUMXSA: 8 + _DepthUMXSB: 8 + _DepthUMetadata: 256 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + enableLDSTrMXSA: false + enableLDSTrMXSB: false + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + tailLoopOptMXSA: false + tailLoopOptMXSB: false + - 1LDSBuffer: 0 + 
ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AdaptiveGemm: 0 + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1DivByMT1LowbitGT1: 0 + AssertFree1ElementMultiple: 1 + AssertKRingShiftTailWrapOnly: 0 + AssertSummationElementMultiple: 256 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BAddrInterleave: false + BaseName: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT64cx8jpYzIAVvV24Gqg4hoX8QdixxbIDmUlvaleHLuajs= + BufferLoad: true + BufferStore: 1 + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 256 + DirectToLds: 1 + DirectToLdsA: true + DirectToLdsB: true + DirectToLdsMXSA: true + DirectToLdsMXSB: true + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprMXSA: false + DirectToVgprMXSB: false + DirectToVgprSparseMetadata: false + DtlPlusLdsBuf: 0 + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: false + ExpertSchedulingMode: 0 + ExtraLatencyForLR: 0 + ExtraMiLatencyLeft: -1 + ForceDisableShadowInit: false + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 32 + GlobalReadVectorWidthB: 32 + GlobalReadVectorWidthMXSA: 2 + GlobalReadVectorWidthMXSB: 12 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + KRingShift: false + Kernel: true + KernelLanguage: Assembly + KernelNameMin: 
Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT64x384x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT2_12_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 256 + LSCB: 256 + LSCMXSA: 8 + LSCMXSB: 8 + LSPA: 32 + LSPB: 32 + LSPMXSA: 64 + LSPMXSB: 384 + LVCA: 8 + LVCB: 8 + LVCMXSA: 4 + LVCMXSB: 1 + LVPA: 1 + LVPB: 1 + LVPMXSA: 32 + LVPMXSB: 32 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsBlockSizePerPadMXSA: 0 + LdsBlockSizePerPadMXSB: 0 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 126464 + LdsInitCVgprs: false + LdsNumBytes: 126464 + LdsNumElementsAlignedA: 8192 + LdsNumElementsAlignedB: 49152 + LdsNumElementsAlignedMXSA: 512 + LdsNumElementsAlignedMXSB: 3072 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 11776 + LdsOffsetB_Blk: 77312 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMXSA: 8192 + LdsOffsetMXSA_Blk: 73728 + LdsOffsetMXSB: 8704 + LdsOffsetMXSB_Blk: 74240 + LdsOffsetMetadata: 11776 + LdsOffsetMetadata_Blk: 77312 + LdsPadA: 0 + LdsPadB: 0 + LdsPadMXSA: 0 + LdsPadMXSB: 0 + LdsPadMetadata: 0 + LocalReadVectorWidth: -1 + LocalReadVectorWidthA: 32 + LocalReadVectorWidthB: 32 + LocalReadVectorWidthMXSA: 1 + LocalReadVectorWidthMXSB: 1 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LocalWriteUseSgprMXSA: true + LocalWriteUseSgprMXSB: true + LoopIters: 2 + LoopUnroll: 256 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + 
MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMXSA: 1 + MIInputPerThreadMXSB: 1 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [2, 12] + MIWaveTileA: 2 + MIWaveTileB: 12 + MIWaveTileMXSA: 2 + MIWaveTileMXSB: 12 + MIWaveTileMetadata: 0 + MacroTile0: 64 + MacroTile1: 384 + MacroTileA: 64 + MacroTileB: 384 + MacroTileMXSA: 64 + MacroTileMXSB: 384 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + MinGRIncPerMfma: 3 + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonDTLTailLoopMXSA: true + NonDTLTailLoopMXSB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 4 + NonTemporalC: 0 + NonTemporalD: 4 + NonTemporalE: 0 + NonTemporalMXSA: 0 + NonTemporalMXSB: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 96 + NumGlobalWriteVectorsPerThread: 96 + NumLdsBlk: 2 + NumLoadsA: 2 + NumLoadsB: 12 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsCoalescedMXSA: 1 + NumLoadsCoalescedMXSB: 1 + NumLoadsMXSA: 1 + NumLoadsMXSB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 12 + NumLoadsPerpendicularMXSA: 1 + NumLoadsPerpendicularMXSB: 1 + NumThreads: 256 + NumTotalPackedLoadsA: -1 + NumTotalPackedLoadsB: -1 + NumTotalPackedLoadsMXSA: -1 + NumTotalPackedLoadsMXSB: -1 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGROverBarrier: 0 + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 18 + 
SolutionNameMin: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT64x384x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT2_12_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM16_WGMXCC2_WGMXCCGn1 + SourceSwap: false + SpaceFillingAlgo: [] + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 4 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SubGroupMXSA: 8 + SubGroupMXSB: 32 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: 0 + TailloopInNll: false + ThreadTile: [1, 1] + ThreadTile0: 8 + ThreadTile1: 12 + ThreadTileA: 8 + ThreadTileB: 12 + ThreadTileMXSA: 8 + ThreadTileMXSB: 12 + TransposeLDS: 1 + TransposeLDSMetadata: -1 + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMXSA: true + UnrollMajorLDSMXSB: true + Use64bShadowLimit: false + Use64bShadowLimitMX: false + UseCustomMainLoopSchedule: 0 + UseDirect32XEmulation: false + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: false + UseGeneralizedNLCOneA: false + UseGeneralizedNLCOneB: false + UseGeneralizedNLCOneMXSA: false + UseGeneralizedNLCOneMXSB: false + UseGeneralizedNLCOneMetadata: false + UseInstOffsetForGRO: 0 + UseMFMAF32XEmulation: false + UsePLRPack: 0 + UseSgprForGRO: -1 + UseSubtileImpl: true + Valid: true + VectorStore: -1 + VectorWidthA: 1 + 
VectorWidthB: 1 + VectorWidthMXSA: 1 + VectorWidthMXSB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMXSA: 0 + WaveSeparateGlobalReadMXSB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 16 + WorkGroupMappingXCC: 2 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 256 + _DepthUA: 256 + _DepthUB: 256 + _DepthUMXSA: 8 + _DepthUMXSB: 8 + _DepthUMetadata: 256 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + enableLDSTrMXSA: false + enableLDSTrMXSB: false + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + tailLoopOptMXSA: false + tailLoopOptMXSB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AdaptiveGemm: 0 + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1DivByMT1LowbitGT1: 0 + AssertFree1ElementMultiple: 1 + AssertKRingShiftTailWrapOnly: 0 + AssertSummationElementMultiple: 256 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BAddrInterleave: false + BaseName: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT38m0Bm4hH2y_fb6-eo4XZRKb5dkkBhjv6q0Rd-CAzTq-Q= + BufferLoad: true + BufferStore: 1 + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 256 + DirectToLds: 1 + DirectToLdsA: true + DirectToLdsB: true + DirectToLdsMXSA: true + DirectToLdsMXSB: true + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprMXSA: false + DirectToVgprMXSB: false + DirectToVgprSparseMetadata: false + 
DtlPlusLdsBuf: 0 + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: false + ExpertSchedulingMode: 0 + ExtraLatencyForLR: 0 + ExtraMiLatencyLeft: -1 + ForceDisableShadowInit: false + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 32 + GlobalReadVectorWidthB: 32 + GlobalReadVectorWidthMXSA: 12 + GlobalReadVectorWidthMXSB: 2 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + KRingShift: false + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT384x64x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT12_2_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 256 + LSCB: 256 + LSCMXSA: 8 + LSCMXSB: 8 + LSPA: 32 + LSPB: 32 + LSPMXSA: 384 + LSPMXSB: 64 + LVCA: 8 + LVCB: 8 + LVCMXSA: 1 + LVCMXSB: 4 + LVPA: 1 + LVPB: 1 + LVPMXSA: 32 + LVPMXSB: 32 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsBlockSizePerPadMXSA: 0 + LdsBlockSizePerPadMXSB: 0 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 126464 + LdsInitCVgprs: false + LdsNumBytes: 126464 + 
LdsNumElementsAlignedA: 49152 + LdsNumElementsAlignedB: 8192 + LdsNumElementsAlignedMXSA: 3072 + LdsNumElementsAlignedMXSB: 512 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 52736 + LdsOffsetB_Blk: 118272 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMXSA: 49152 + LdsOffsetMXSA_Blk: 114688 + LdsOffsetMXSB: 52224 + LdsOffsetMXSB_Blk: 117760 + LdsOffsetMetadata: 52736 + LdsOffsetMetadata_Blk: 118272 + LdsPadA: 0 + LdsPadB: 0 + LdsPadMXSA: 0 + LdsPadMXSB: 0 + LdsPadMetadata: 0 + LocalReadVectorWidth: -1 + LocalReadVectorWidthA: 32 + LocalReadVectorWidthB: 32 + LocalReadVectorWidthMXSA: 1 + LocalReadVectorWidthMXSB: 1 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LocalWriteUseSgprMXSA: true + LocalWriteUseSgprMXSB: true + LoopIters: 2 + LoopUnroll: 256 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMXSA: 1 + MIInputPerThreadMXSB: 1 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [12, 2] + MIWaveTileA: 12 + MIWaveTileB: 2 + MIWaveTileMXSA: 12 + MIWaveTileMXSB: 2 + MIWaveTileMetadata: 0 + MacroTile0: 384 + MacroTile1: 64 + MacroTileA: 384 + MacroTileB: 64 + MacroTileMXSA: 384 + MacroTileMXSB: 64 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + MinGRIncPerMfma: 3 + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonDTLTailLoopMXSA: true + NonDTLTailLoopMXSB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 4 + NonTemporalC: 0 + NonTemporalD: 4 + 
NonTemporalE: 0 + NonTemporalMXSA: 0 + NonTemporalMXSB: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 96 + NumGlobalWriteVectorsPerThread: 96 + NumLdsBlk: 2 + NumLoadsA: 12 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsCoalescedMXSA: 1 + NumLoadsCoalescedMXSB: 1 + NumLoadsMXSA: 1 + NumLoadsMXSB: 1 + NumLoadsPerpendicularA: 12 + NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularMXSA: 1 + NumLoadsPerpendicularMXSB: 1 + NumThreads: 256 + NumTotalPackedLoadsA: -1 + NumTotalPackedLoadsB: -1 + NumTotalPackedLoadsMXSA: -1 + NumTotalPackedLoadsMXSB: -1 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGROverBarrier: 0 + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 19 + SolutionNameMin: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT384x64x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT12_2_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM16_WGMXCC2_WGMXCCGn1 + SourceSwap: false + SpaceFillingAlgo: [] + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 4 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 
32 + SubGroupMXSA: 8 + SubGroupMXSB: 32 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: 0 + TailloopInNll: false + ThreadTile: [1, 1] + ThreadTile0: 48 + ThreadTile1: 2 + ThreadTileA: 48 + ThreadTileB: 2 + ThreadTileMXSA: 48 + ThreadTileMXSB: 2 + TransposeLDS: 1 + TransposeLDSMetadata: -1 + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMXSA: true + UnrollMajorLDSMXSB: true + Use64bShadowLimit: false + Use64bShadowLimitMX: false + UseCustomMainLoopSchedule: 0 + UseDirect32XEmulation: false + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: false + UseGeneralizedNLCOneA: false + UseGeneralizedNLCOneB: false + UseGeneralizedNLCOneMXSA: false + UseGeneralizedNLCOneMXSB: false + UseGeneralizedNLCOneMetadata: false + UseInstOffsetForGRO: 0 + UseMFMAF32XEmulation: false + UsePLRPack: 0 + UseSgprForGRO: -1 + UseSubtileImpl: true + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 1 + VectorWidthMXSA: 1 + VectorWidthMXSB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMXSA: 0 + WaveSeparateGlobalReadMXSB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 16 + WorkGroupMappingXCC: 2 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 256 + _DepthUA: 256 + _DepthUB: 256 + _DepthUMXSA: 8 + _DepthUMXSB: 8 + _DepthUMetadata: 256 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + enableLDSTrMXSA: false + enableLDSTrMXSB: false + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + tailLoopOptMXSA: false + tailLoopOptMXSB: false + - 1LDSBuffer: 0 + 
ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AdaptiveGemm: 0 + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1DivByMT1LowbitGT1: 0 + AssertFree1ElementMultiple: 1 + AssertKRingShiftTailWrapOnly: 0 + AssertSummationElementMultiple: 256 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BAddrInterleave: false + BaseName: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT64ghBzLJyXgx9oTwgzYUUty2EUDAM9Q1qjBKNRjSDH1Qg= + BufferLoad: true + BufferStore: 1 + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 256 + DirectToLds: 1 + DirectToLdsA: true + DirectToLdsB: true + DirectToLdsMXSA: true + DirectToLdsMXSB: true + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprMXSA: false + DirectToVgprMXSB: false + DirectToVgprSparseMetadata: false + DtlPlusLdsBuf: 0 + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: false + ExpertSchedulingMode: 0 + ExtraLatencyForLR: 0 + ExtraMiLatencyLeft: -1 + ForceDisableShadowInit: false + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 32 + GlobalReadVectorWidthB: 32 + GlobalReadVectorWidthMXSA: 2 + GlobalReadVectorWidthMXSB: 16 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + KRingShift: false + Kernel: true + KernelLanguage: Assembly + KernelNameMin: 
Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT64x512x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT2_16_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 256 + LSCB: 256 + LSCMXSA: 8 + LSCMXSB: 8 + LSPA: 32 + LSPB: 32 + LSPMXSA: 64 + LSPMXSB: 512 + LVCA: 8 + LVCB: 8 + LVCMXSA: 4 + LVCMXSB: 1 + LVPA: 1 + LVPB: 1 + LVPMXSA: 32 + LVPMXSB: 32 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsBlockSizePerPadMXSA: 0 + LdsBlockSizePerPadMXSB: 0 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 156672 + LdsInitCVgprs: false + LdsNumBytes: 156672 + LdsNumElementsAlignedA: 8192 + LdsNumElementsAlignedB: 65536 + LdsNumElementsAlignedMXSA: 512 + LdsNumElementsAlignedMXSB: 4096 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 78336 + LdsOffsetB: 12800 + LdsOffsetB_Blk: 91136 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMXSA: 8192 + LdsOffsetMXSA_Blk: 86528 + LdsOffsetMXSB: 8704 + LdsOffsetMXSB_Blk: 87040 + LdsOffsetMetadata: 12800 + LdsOffsetMetadata_Blk: 91136 + LdsPadA: 0 + LdsPadB: 0 + LdsPadMXSA: 0 + LdsPadMXSB: 0 + LdsPadMetadata: 0 + LocalReadVectorWidth: -1 + LocalReadVectorWidthA: 32 + LocalReadVectorWidthB: 32 + LocalReadVectorWidthMXSA: 1 + LocalReadVectorWidthMXSB: 1 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LocalWriteUseSgprMXSA: true + LocalWriteUseSgprMXSB: true + LoopIters: 2 + LoopUnroll: 256 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + 
MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMXSA: 1 + MIInputPerThreadMXSB: 1 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [2, 16] + MIWaveTileA: 2 + MIWaveTileB: 16 + MIWaveTileMXSA: 2 + MIWaveTileMXSB: 16 + MIWaveTileMetadata: 0 + MacroTile0: 64 + MacroTile1: 512 + MacroTileA: 64 + MacroTileB: 512 + MacroTileMXSA: 64 + MacroTileMXSB: 512 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + MinGRIncPerMfma: 3 + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonDTLTailLoopMXSA: true + NonDTLTailLoopMXSB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 4 + NonTemporalC: 0 + NonTemporalD: 4 + NonTemporalE: 0 + NonTemporalMXSA: 0 + NonTemporalMXSB: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 128 + NumLdsBlk: 2 + NumLoadsA: 2 + NumLoadsB: 16 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsCoalescedMXSA: 1 + NumLoadsCoalescedMXSB: 1 + NumLoadsMXSA: 1 + NumLoadsMXSB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 16 + NumLoadsPerpendicularMXSA: 1 + NumLoadsPerpendicularMXSB: 1 + NumThreads: 256 + NumTotalPackedLoadsA: -1 + NumTotalPackedLoadsB: -1 + NumTotalPackedLoadsMXSA: -1 + NumTotalPackedLoadsMXSB: -1 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGROverBarrier: 0 + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 20 + 
SolutionNameMin: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT64x512x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT2_16_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM16_WGMXCC2_WGMXCCGn1 + SourceSwap: false + SpaceFillingAlgo: [] + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: true + StoreSyncOpt: 0 + StoreVectorWidth: 4 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SubGroupMXSA: 8 + SubGroupMXSB: 32 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: 0 + TailloopInNll: false + ThreadTile: [1, 1] + ThreadTile0: 8 + ThreadTile1: 16 + ThreadTileA: 8 + ThreadTileB: 16 + ThreadTileMXSA: 8 + ThreadTileMXSB: 16 + TransposeLDS: 1 + TransposeLDSMetadata: -1 + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMXSA: true + UnrollMajorLDSMXSB: true + Use64bShadowLimit: false + Use64bShadowLimitMX: false + UseCustomMainLoopSchedule: 0 + UseDirect32XEmulation: false + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: false + UseGeneralizedNLCOneA: false + UseGeneralizedNLCOneB: false + UseGeneralizedNLCOneMXSA: false + UseGeneralizedNLCOneMXSB: false + UseGeneralizedNLCOneMetadata: false + UseInstOffsetForGRO: 0 + UseMFMAF32XEmulation: false + UsePLRPack: 0 + UseSgprForGRO: -1 + UseSubtileImpl: true + Valid: true + VectorStore: -1 + VectorWidthA: 1 + 
VectorWidthB: 1 + VectorWidthMXSA: 1 + VectorWidthMXSB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMXSA: 0 + WaveSeparateGlobalReadMXSB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 16 + WorkGroupMappingXCC: 2 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 256 + _DepthUA: 256 + _DepthUB: 256 + _DepthUMXSA: 8 + _DepthUMXSB: 8 + _DepthUMetadata: 256 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + enableLDSTrMXSA: false + enableLDSTrMXSB: false + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + tailLoopOptMXSA: false + tailLoopOptMXSB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AdaptiveGemm: 0 + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1DivByMT1LowbitGT1: 0 + AssertFree1ElementMultiple: 1 + AssertKRingShiftTailWrapOnly: 0 + AssertSummationElementMultiple: 256 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BAddrInterleave: false + BaseName: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT51N7xP8nenZXqX1PoWvPtuxa4rTpP69DwZBgRtHN8axrQ= + BufferLoad: true + BufferStore: 1 + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 256 + DirectToLds: 1 + DirectToLdsA: true + DirectToLdsB: true + DirectToLdsMXSA: true + DirectToLdsMXSB: true + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprMXSA: false + DirectToVgprMXSB: false + DirectToVgprSparseMetadata: false + 
DtlPlusLdsBuf: 0 + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: false + ExpertSchedulingMode: 0 + ExtraLatencyForLR: 0 + ExtraMiLatencyLeft: -1 + ForceDisableShadowInit: false + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 32 + GlobalReadVectorWidthB: 32 + GlobalReadVectorWidthMXSA: 16 + GlobalReadVectorWidthMXSB: 2 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + KRingShift: false + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT512x64x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT16_2_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 256 + LSCB: 256 + LSCMXSA: 8 + LSCMXSB: 8 + LSPA: 32 + LSPB: 32 + LSPMXSA: 512 + LSPMXSB: 64 + LVCA: 8 + LVCB: 8 + LVCMXSA: 1 + LVCMXSB: 4 + LVPA: 1 + LVPB: 1 + LVPMXSA: 32 + LVPMXSB: 32 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsBlockSizePerPadMXSA: 0 + LdsBlockSizePerPadMXSB: 0 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 156672 + LdsInitCVgprs: false + LdsNumBytes: 156672 + 
LdsNumElementsAlignedA: 65536 + LdsNumElementsAlignedB: 8192 + LdsNumElementsAlignedMXSA: 4096 + LdsNumElementsAlignedMXSB: 512 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 78336 + LdsOffsetB: 70144 + LdsOffsetB_Blk: 148480 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMXSA: 65536 + LdsOffsetMXSA_Blk: 143872 + LdsOffsetMXSB: 69632 + LdsOffsetMXSB_Blk: 147968 + LdsOffsetMetadata: 70144 + LdsOffsetMetadata_Blk: 148480 + LdsPadA: 0 + LdsPadB: 0 + LdsPadMXSA: 0 + LdsPadMXSB: 0 + LdsPadMetadata: 0 + LocalReadVectorWidth: -1 + LocalReadVectorWidthA: 32 + LocalReadVectorWidthB: 32 + LocalReadVectorWidthMXSA: 1 + LocalReadVectorWidthMXSB: 1 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LocalWriteUseSgprMXSA: true + LocalWriteUseSgprMXSB: true + LoopIters: 2 + LoopUnroll: 256 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMXSA: 1 + MIInputPerThreadMXSB: 1 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [16, 2] + MIWaveTileA: 16 + MIWaveTileB: 2 + MIWaveTileMXSA: 16 + MIWaveTileMXSB: 2 + MIWaveTileMetadata: 0 + MacroTile0: 512 + MacroTile1: 64 + MacroTileA: 512 + MacroTileB: 64 + MacroTileMXSA: 512 + MacroTileMXSB: 64 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + MinGRIncPerMfma: 3 + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonDTLTailLoopMXSA: true + NonDTLTailLoopMXSB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 4 + NonTemporalC: 0 + NonTemporalD: 4 + 
NonTemporalE: 0 + NonTemporalMXSA: 0 + NonTemporalMXSB: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 128 + NumLdsBlk: 2 + NumLoadsA: 16 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsCoalescedMXSA: 1 + NumLoadsCoalescedMXSB: 1 + NumLoadsMXSA: 1 + NumLoadsMXSB: 1 + NumLoadsPerpendicularA: 16 + NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularMXSA: 1 + NumLoadsPerpendicularMXSB: 1 + NumThreads: 256 + NumTotalPackedLoadsA: -1 + NumTotalPackedLoadsB: -1 + NumTotalPackedLoadsMXSA: -1 + NumTotalPackedLoadsMXSB: -1 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGROverBarrier: 0 + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 21 + SolutionNameMin: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT512x64x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT16_2_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM16_WGMXCC2_WGMXCCGn1 + SourceSwap: false + SpaceFillingAlgo: [] + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: true + StoreSyncOpt: 0 + StoreVectorWidth: 4 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 
32 + SubGroupMXSA: 8 + SubGroupMXSB: 32 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: 0 + TailloopInNll: false + ThreadTile: [1, 1] + ThreadTile0: 64 + ThreadTile1: 2 + ThreadTileA: 64 + ThreadTileB: 2 + ThreadTileMXSA: 64 + ThreadTileMXSB: 2 + TransposeLDS: 1 + TransposeLDSMetadata: -1 + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMXSA: true + UnrollMajorLDSMXSB: true + Use64bShadowLimit: false + Use64bShadowLimitMX: false + UseCustomMainLoopSchedule: 0 + UseDirect32XEmulation: false + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: false + UseGeneralizedNLCOneA: false + UseGeneralizedNLCOneB: false + UseGeneralizedNLCOneMXSA: false + UseGeneralizedNLCOneMXSB: false + UseGeneralizedNLCOneMetadata: false + UseInstOffsetForGRO: 0 + UseMFMAF32XEmulation: false + UsePLRPack: 0 + UseSgprForGRO: -1 + UseSubtileImpl: true + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 1 + VectorWidthMXSA: 1 + VectorWidthMXSB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMXSA: 0 + WaveSeparateGlobalReadMXSB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 16 + WorkGroupMappingXCC: 2 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 256 + _DepthUA: 256 + _DepthUB: 256 + _DepthUMXSA: 8 + _DepthUMXSB: 8 + _DepthUMetadata: 256 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + enableLDSTrMXSA: false + enableLDSTrMXSB: false + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + tailLoopOptMXSA: false + tailLoopOptMXSB: false + - 1LDSBuffer: 0 + 
ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AdaptiveGemm: 0 + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1DivByMT1LowbitGT1: 0 + AssertFree1ElementMultiple: 1 + AssertKRingShiftTailWrapOnly: 0 + AssertSummationElementMultiple: 256 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BAddrInterleave: false + BaseName: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT12Tzx41lFmgkHO5_GDlXlBjYPnuHfWnU6iLujoAD27WHo= + BufferLoad: true + BufferStore: 1 + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 256 + DirectToLds: 1 + DirectToLdsA: true + DirectToLdsB: true + DirectToLdsMXSA: true + DirectToLdsMXSB: true + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprMXSA: false + DirectToVgprMXSB: false + DirectToVgprSparseMetadata: false + DtlPlusLdsBuf: 0 + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: false + ExpertSchedulingMode: 0 + ExtraLatencyForLR: 0 + ExtraMiLatencyLeft: -1 + ForceDisableShadowInit: false + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 32 + GlobalReadVectorWidthB: 32 + GlobalReadVectorWidthMXSA: 4 + GlobalReadVectorWidthMXSB: 10 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + KRingShift: false + Kernel: true + KernelLanguage: Assembly + KernelNameMin: 
Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT128x320x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT4_10_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 256 + LSCB: 256 + LSCMXSA: 8 + LSCMXSB: 8 + LSPA: 32 + LSPB: 32 + LSPMXSA: 128 + LSPMXSB: 320 + LVCA: 8 + LVCB: 8 + LVCMXSA: 2 + LVCMXSB: 1 + LVPA: 1 + LVPB: 1 + LVPMXSA: 32 + LVPMXSB: 32 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsBlockSizePerPadMXSA: 0 + LdsBlockSizePerPadMXSB: 0 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 126464 + LdsInitCVgprs: false + LdsNumBytes: 126464 + LdsNumElementsAlignedA: 16384 + LdsNumElementsAlignedB: 40960 + LdsNumElementsAlignedMXSA: 1024 + LdsNumElementsAlignedMXSB: 2560 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 19968 + LdsOffsetB_Blk: 85504 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMXSA: 16384 + LdsOffsetMXSA_Blk: 81920 + LdsOffsetMXSB: 17408 + LdsOffsetMXSB_Blk: 82944 + LdsOffsetMetadata: 19968 + LdsOffsetMetadata_Blk: 85504 + LdsPadA: 0 + LdsPadB: 0 + LdsPadMXSA: 0 + LdsPadMXSB: 0 + LdsPadMetadata: 0 + LocalReadVectorWidth: -1 + LocalReadVectorWidthA: 32 + LocalReadVectorWidthB: 32 + LocalReadVectorWidthMXSA: 1 + LocalReadVectorWidthMXSB: 1 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LocalWriteUseSgprMXSA: true + LocalWriteUseSgprMXSB: true + LoopIters: 2 + LoopUnroll: 256 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + 
MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMXSA: 1 + MIInputPerThreadMXSB: 1 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 10] + MIWaveTileA: 4 + MIWaveTileB: 10 + MIWaveTileMXSA: 4 + MIWaveTileMXSB: 10 + MIWaveTileMetadata: 0 + MacroTile0: 128 + MacroTile1: 320 + MacroTileA: 128 + MacroTileB: 320 + MacroTileMXSA: 128 + MacroTileMXSB: 320 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + MinGRIncPerMfma: 3 + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonDTLTailLoopMXSA: true + NonDTLTailLoopMXSB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 4 + NonTemporalC: 0 + NonTemporalD: 4 + NonTemporalE: 0 + NonTemporalMXSA: 0 + NonTemporalMXSB: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 160 + NumGlobalWriteVectorsPerThread: 160 + NumLdsBlk: 2 + NumLoadsA: 4 + NumLoadsB: 10 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsCoalescedMXSA: 1 + NumLoadsCoalescedMXSB: 1 + NumLoadsMXSA: 1 + NumLoadsMXSB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 10 + NumLoadsPerpendicularMXSA: 1 + NumLoadsPerpendicularMXSB: 1 + NumThreads: 256 + NumTotalPackedLoadsA: -1 + NumTotalPackedLoadsB: -1 + NumTotalPackedLoadsMXSA: -1 + NumTotalPackedLoadsMXSB: -1 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGROverBarrier: 0 + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 22 + 
SolutionNameMin: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT128x320x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT4_10_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM16_WGMXCC2_WGMXCCGn1 + SourceSwap: false + SpaceFillingAlgo: [] + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 4 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SubGroupMXSA: 8 + SubGroupMXSB: 32 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: 0 + TailloopInNll: false + ThreadTile: [1, 1] + ThreadTile0: 16 + ThreadTile1: 10 + ThreadTileA: 16 + ThreadTileB: 10 + ThreadTileMXSA: 16 + ThreadTileMXSB: 10 + TransposeLDS: 1 + TransposeLDSMetadata: -1 + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMXSA: true + UnrollMajorLDSMXSB: true + Use64bShadowLimit: false + Use64bShadowLimitMX: false + UseCustomMainLoopSchedule: 0 + UseDirect32XEmulation: false + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: false + UseGeneralizedNLCOneA: false + UseGeneralizedNLCOneB: false + UseGeneralizedNLCOneMXSA: false + UseGeneralizedNLCOneMXSB: false + UseGeneralizedNLCOneMetadata: false + UseInstOffsetForGRO: 0 + UseMFMAF32XEmulation: false + UsePLRPack: 0 + UseSgprForGRO: -1 + UseSubtileImpl: true + Valid: true + VectorStore: -1 + VectorWidthA: 
1 + VectorWidthB: 1 + VectorWidthMXSA: 1 + VectorWidthMXSB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMXSA: 0 + WaveSeparateGlobalReadMXSB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 16 + WorkGroupMappingXCC: 2 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 256 + _DepthUA: 256 + _DepthUB: 256 + _DepthUMXSA: 8 + _DepthUMXSB: 8 + _DepthUMetadata: 256 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + enableLDSTrMXSA: false + enableLDSTrMXSB: false + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + tailLoopOptMXSA: false + tailLoopOptMXSB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AdaptiveGemm: 0 + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1DivByMT1LowbitGT1: 0 + AssertFree1ElementMultiple: 1 + AssertKRingShiftTailWrapOnly: 0 + AssertSummationElementMultiple: 256 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BAddrInterleave: false + BaseName: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT329TK1O9xqAG2cUdkJ_6Jkg-kbMKNtFWKGnhC0gVHbf3c= + BufferLoad: true + BufferStore: 1 + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 256 + DirectToLds: 1 + DirectToLdsA: true + DirectToLdsB: true + DirectToLdsMXSA: true + DirectToLdsMXSB: true + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprMXSA: false + DirectToVgprMXSB: false + DirectToVgprSparseMetadata: 
false + DtlPlusLdsBuf: 0 + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: false + ExpertSchedulingMode: 0 + ExtraLatencyForLR: 0 + ExtraMiLatencyLeft: -1 + ForceDisableShadowInit: false + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 32 + GlobalReadVectorWidthB: 32 + GlobalReadVectorWidthMXSA: 10 + GlobalReadVectorWidthMXSB: 4 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + KRingShift: false + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT320x128x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT10_4_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 256 + LSCB: 256 + LSCMXSA: 8 + LSCMXSB: 8 + LSPA: 32 + LSPB: 32 + LSPMXSA: 320 + LSPMXSB: 128 + LVCA: 8 + LVCB: 8 + LVCMXSA: 1 + LVCMXSB: 2 + LVPA: 1 + LVPB: 1 + LVPMXSA: 32 + LVPMXSB: 32 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsBlockSizePerPadMXSA: 0 + LdsBlockSizePerPadMXSB: 0 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 126464 + LdsInitCVgprs: false + LdsNumBytes: 126464 + 
LdsNumElementsAlignedA: 40960 + LdsNumElementsAlignedB: 16384 + LdsNumElementsAlignedMXSA: 2560 + LdsNumElementsAlignedMXSB: 1024 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 44544 + LdsOffsetB_Blk: 110080 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMXSA: 40960 + LdsOffsetMXSA_Blk: 106496 + LdsOffsetMXSB: 43520 + LdsOffsetMXSB_Blk: 109056 + LdsOffsetMetadata: 44544 + LdsOffsetMetadata_Blk: 110080 + LdsPadA: 0 + LdsPadB: 0 + LdsPadMXSA: 0 + LdsPadMXSB: 0 + LdsPadMetadata: 0 + LocalReadVectorWidth: -1 + LocalReadVectorWidthA: 32 + LocalReadVectorWidthB: 32 + LocalReadVectorWidthMXSA: 1 + LocalReadVectorWidthMXSB: 1 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LocalWriteUseSgprMXSA: true + LocalWriteUseSgprMXSB: true + LoopIters: 2 + LoopUnroll: 256 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMXSA: 1 + MIInputPerThreadMXSB: 1 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [10, 4] + MIWaveTileA: 10 + MIWaveTileB: 4 + MIWaveTileMXSA: 10 + MIWaveTileMXSB: 4 + MIWaveTileMetadata: 0 + MacroTile0: 320 + MacroTile1: 128 + MacroTileA: 320 + MacroTileB: 128 + MacroTileMXSA: 320 + MacroTileMXSB: 128 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + MinGRIncPerMfma: 3 + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonDTLTailLoopMXSA: true + NonDTLTailLoopMXSB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 4 + NonTemporalC: 0 + NonTemporalD: 4 + 
NonTemporalE: 0 + NonTemporalMXSA: 0 + NonTemporalMXSB: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 160 + NumGlobalWriteVectorsPerThread: 160 + NumLdsBlk: 2 + NumLoadsA: 10 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsCoalescedMXSA: 1 + NumLoadsCoalescedMXSB: 1 + NumLoadsMXSA: 1 + NumLoadsMXSB: 1 + NumLoadsPerpendicularA: 10 + NumLoadsPerpendicularB: 4 + NumLoadsPerpendicularMXSA: 1 + NumLoadsPerpendicularMXSB: 1 + NumThreads: 256 + NumTotalPackedLoadsA: -1 + NumTotalPackedLoadsB: -1 + NumTotalPackedLoadsMXSA: -1 + NumTotalPackedLoadsMXSB: -1 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGROverBarrier: 0 + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 23 + SolutionNameMin: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT320x128x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT10_4_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM16_WGMXCC2_WGMXCCGn1 + SourceSwap: false + SpaceFillingAlgo: [] + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 4 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + 
SubGroupB: 32 + SubGroupMXSA: 8 + SubGroupMXSB: 32 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: 0 + TailloopInNll: false + ThreadTile: [1, 1] + ThreadTile0: 40 + ThreadTile1: 4 + ThreadTileA: 40 + ThreadTileB: 4 + ThreadTileMXSA: 40 + ThreadTileMXSB: 4 + TransposeLDS: 1 + TransposeLDSMetadata: -1 + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMXSA: true + UnrollMajorLDSMXSB: true + Use64bShadowLimit: false + Use64bShadowLimitMX: false + UseCustomMainLoopSchedule: 0 + UseDirect32XEmulation: false + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: false + UseGeneralizedNLCOneA: false + UseGeneralizedNLCOneB: false + UseGeneralizedNLCOneMXSA: false + UseGeneralizedNLCOneMXSB: false + UseGeneralizedNLCOneMetadata: false + UseInstOffsetForGRO: 0 + UseMFMAF32XEmulation: false + UsePLRPack: 0 + UseSgprForGRO: -1 + UseSubtileImpl: true + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 1 + VectorWidthMXSA: 1 + VectorWidthMXSB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMXSA: 0 + WaveSeparateGlobalReadMXSB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 16 + WorkGroupMappingXCC: 2 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 256 + _DepthUA: 256 + _DepthUB: 256 + _DepthUMXSA: 8 + _DepthUMXSB: 8 + _DepthUMetadata: 256 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + enableLDSTrMXSA: false + enableLDSTrMXSB: false + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + tailLoopOptMXSA: false + tailLoopOptMXSB: false + - 1LDSBuffer: 
0 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AdaptiveGemm: 0 + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1DivByMT1LowbitGT1: 0 + AssertFree1ElementMultiple: 1 + AssertKRingShiftTailWrapOnly: 0 + AssertSummationElementMultiple: 256 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BAddrInterleave: false + BaseName: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT12Ko14x9Bm17skxIK2J-9eSMhxZf1WQTws5W6lZ4ij_RU= + BufferLoad: true + BufferStore: 1 + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 256 + DirectToLds: 1 + DirectToLdsA: true + DirectToLdsB: true + DirectToLdsMXSA: true + DirectToLdsMXSB: true + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprMXSA: false + DirectToVgprMXSB: false + DirectToVgprSparseMetadata: false + DtlPlusLdsBuf: 0 + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: false + ExpertSchedulingMode: 0 + ExtraLatencyForLR: 0 + ExtraMiLatencyLeft: -1 + ForceDisableShadowInit: false + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 32 + GlobalReadVectorWidthB: 32 + GlobalReadVectorWidthMXSA: 4 + GlobalReadVectorWidthMXSB: 12 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + KRingShift: false + Kernel: true + KernelLanguage: Assembly + KernelNameMin: 
Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT128x384x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT4_12_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 256 + LSCB: 256 + LSCMXSA: 8 + LSCMXSB: 8 + LSPA: 32 + LSPB: 32 + LSPMXSA: 128 + LSPMXSB: 384 + LVCA: 8 + LVCB: 8 + LVCMXSA: 2 + LVCMXSB: 1 + LVPA: 1 + LVPB: 1 + LVPMXSA: 32 + LVPMXSB: 32 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsBlockSizePerPadMXSA: 0 + LdsBlockSizePerPadMXSB: 0 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 139264 + LdsInitCVgprs: false + LdsNumBytes: 139264 + LdsNumElementsAlignedA: 16384 + LdsNumElementsAlignedB: 49152 + LdsNumElementsAlignedMXSA: 1024 + LdsNumElementsAlignedMXSB: 3072 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 69632 + LdsOffsetB: 20480 + LdsOffsetB_Blk: 90112 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMXSA: 16384 + LdsOffsetMXSA_Blk: 86016 + LdsOffsetMXSB: 17408 + LdsOffsetMXSB_Blk: 87040 + LdsOffsetMetadata: 20480 + LdsOffsetMetadata_Blk: 90112 + LdsPadA: 0 + LdsPadB: 0 + LdsPadMXSA: 0 + LdsPadMXSB: 0 + LdsPadMetadata: 0 + LocalReadVectorWidth: -1 + LocalReadVectorWidthA: 32 + LocalReadVectorWidthB: 32 + LocalReadVectorWidthMXSA: 1 + LocalReadVectorWidthMXSB: 1 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LocalWriteUseSgprMXSA: true + LocalWriteUseSgprMXSB: true + LoopIters: 2 + LoopUnroll: 256 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + 
MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMXSA: 1 + MIInputPerThreadMXSB: 1 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 12] + MIWaveTileA: 4 + MIWaveTileB: 12 + MIWaveTileMXSA: 4 + MIWaveTileMXSB: 12 + MIWaveTileMetadata: 0 + MacroTile0: 128 + MacroTile1: 384 + MacroTileA: 128 + MacroTileB: 384 + MacroTileMXSA: 128 + MacroTileMXSB: 384 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + MinGRIncPerMfma: 3 + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonDTLTailLoopMXSA: true + NonDTLTailLoopMXSB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 4 + NonTemporalC: 0 + NonTemporalD: 4 + NonTemporalE: 0 + NonTemporalMXSA: 0 + NonTemporalMXSB: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 192 + NumGlobalWriteVectorsPerThread: 192 + NumLdsBlk: 2 + NumLoadsA: 4 + NumLoadsB: 12 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsCoalescedMXSA: 1 + NumLoadsCoalescedMXSB: 1 + NumLoadsMXSA: 1 + NumLoadsMXSB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 12 + NumLoadsPerpendicularMXSA: 1 + NumLoadsPerpendicularMXSB: 1 + NumThreads: 256 + NumTotalPackedLoadsA: -1 + NumTotalPackedLoadsB: -1 + NumTotalPackedLoadsMXSA: -1 + NumTotalPackedLoadsMXSB: -1 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGROverBarrier: 0 + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 24 + 
SolutionNameMin: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT128x384x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT4_12_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM16_WGMXCC2_WGMXCCGn1 + SourceSwap: false + SpaceFillingAlgo: [] + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: true + StoreSyncOpt: 0 + StoreVectorWidth: 4 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SubGroupMXSA: 8 + SubGroupMXSB: 32 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: 0 + TailloopInNll: false + ThreadTile: [1, 1] + ThreadTile0: 16 + ThreadTile1: 12 + ThreadTileA: 16 + ThreadTileB: 12 + ThreadTileMXSA: 16 + ThreadTileMXSB: 12 + TransposeLDS: 1 + TransposeLDSMetadata: -1 + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMXSA: true + UnrollMajorLDSMXSB: true + Use64bShadowLimit: false + Use64bShadowLimitMX: false + UseCustomMainLoopSchedule: 0 + UseDirect32XEmulation: false + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: false + UseGeneralizedNLCOneA: false + UseGeneralizedNLCOneB: false + UseGeneralizedNLCOneMXSA: false + UseGeneralizedNLCOneMXSB: false + UseGeneralizedNLCOneMetadata: false + UseInstOffsetForGRO: 0 + UseMFMAF32XEmulation: false + UsePLRPack: 0 + UseSgprForGRO: -1 + UseSubtileImpl: true + Valid: true + VectorStore: -1 + VectorWidthA: 1 
+ VectorWidthB: 1 + VectorWidthMXSA: 1 + VectorWidthMXSB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMXSA: 0 + WaveSeparateGlobalReadMXSB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 16 + WorkGroupMappingXCC: 2 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 256 + _DepthUA: 256 + _DepthUB: 256 + _DepthUMXSA: 8 + _DepthUMXSB: 8 + _DepthUMetadata: 256 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + enableLDSTrMXSA: false + enableLDSTrMXSB: false + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + tailLoopOptMXSA: false + tailLoopOptMXSB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AdaptiveGemm: 0 + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1DivByMT1LowbitGT1: 0 + AssertFree1ElementMultiple: 1 + AssertKRingShiftTailWrapOnly: 0 + AssertSummationElementMultiple: 256 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BAddrInterleave: false + BaseName: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT38PfCBP4cNoa67MDHKbRKQKP0baWWAor2QL1_1-ioSnxM= + BufferLoad: true + BufferStore: 1 + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 256 + DirectToLds: 1 + DirectToLdsA: true + DirectToLdsB: true + DirectToLdsMXSA: true + DirectToLdsMXSB: true + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprMXSA: false + DirectToVgprMXSB: false + DirectToVgprSparseMetadata: false 
+ DtlPlusLdsBuf: 0 + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: false + ExpertSchedulingMode: 0 + ExtraLatencyForLR: 0 + ExtraMiLatencyLeft: -1 + ForceDisableShadowInit: false + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 32 + GlobalReadVectorWidthB: 32 + GlobalReadVectorWidthMXSA: 12 + GlobalReadVectorWidthMXSB: 4 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + KRingShift: false + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT384x128x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT12_4_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 256 + LSCB: 256 + LSCMXSA: 8 + LSCMXSB: 8 + LSPA: 32 + LSPB: 32 + LSPMXSA: 384 + LSPMXSB: 128 + LVCA: 8 + LVCB: 8 + LVCMXSA: 1 + LVCMXSB: 2 + LVPA: 1 + LVPB: 1 + LVPMXSA: 32 + LVPMXSB: 32 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsBlockSizePerPadMXSA: 0 + LdsBlockSizePerPadMXSB: 0 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 139264 + LdsInitCVgprs: false + LdsNumBytes: 139264 + 
LdsNumElementsAlignedA: 49152 + LdsNumElementsAlignedB: 16384 + LdsNumElementsAlignedMXSA: 3072 + LdsNumElementsAlignedMXSB: 1024 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 69632 + LdsOffsetB: 53248 + LdsOffsetB_Blk: 122880 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMXSA: 49152 + LdsOffsetMXSA_Blk: 118784 + LdsOffsetMXSB: 52224 + LdsOffsetMXSB_Blk: 121856 + LdsOffsetMetadata: 53248 + LdsOffsetMetadata_Blk: 122880 + LdsPadA: 0 + LdsPadB: 0 + LdsPadMXSA: 0 + LdsPadMXSB: 0 + LdsPadMetadata: 0 + LocalReadVectorWidth: -1 + LocalReadVectorWidthA: 32 + LocalReadVectorWidthB: 32 + LocalReadVectorWidthMXSA: 1 + LocalReadVectorWidthMXSB: 1 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LocalWriteUseSgprMXSA: true + LocalWriteUseSgprMXSB: true + LoopIters: 2 + LoopUnroll: 256 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMXSA: 1 + MIInputPerThreadMXSB: 1 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [12, 4] + MIWaveTileA: 12 + MIWaveTileB: 4 + MIWaveTileMXSA: 12 + MIWaveTileMXSB: 4 + MIWaveTileMetadata: 0 + MacroTile0: 384 + MacroTile1: 128 + MacroTileA: 384 + MacroTileB: 128 + MacroTileMXSA: 384 + MacroTileMXSB: 128 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + MinGRIncPerMfma: 3 + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonDTLTailLoopMXSA: true + NonDTLTailLoopMXSB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 4 + NonTemporalC: 0 + NonTemporalD: 4 + 
NonTemporalE: 0 + NonTemporalMXSA: 0 + NonTemporalMXSB: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 192 + NumGlobalWriteVectorsPerThread: 192 + NumLdsBlk: 2 + NumLoadsA: 12 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsCoalescedMXSA: 1 + NumLoadsCoalescedMXSB: 1 + NumLoadsMXSA: 1 + NumLoadsMXSB: 1 + NumLoadsPerpendicularA: 12 + NumLoadsPerpendicularB: 4 + NumLoadsPerpendicularMXSA: 1 + NumLoadsPerpendicularMXSB: 1 + NumThreads: 256 + NumTotalPackedLoadsA: -1 + NumTotalPackedLoadsB: -1 + NumTotalPackedLoadsMXSA: -1 + NumTotalPackedLoadsMXSB: -1 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGROverBarrier: 0 + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 25 + SolutionNameMin: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT384x128x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT12_4_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM16_WGMXCC2_WGMXCCGn1 + SourceSwap: false + SpaceFillingAlgo: [] + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: true + StoreSyncOpt: 0 + StoreVectorWidth: 4 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + 
SubGroupB: 32 + SubGroupMXSA: 8 + SubGroupMXSB: 32 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: 0 + TailloopInNll: false + ThreadTile: [1, 1] + ThreadTile0: 48 + ThreadTile1: 4 + ThreadTileA: 48 + ThreadTileB: 4 + ThreadTileMXSA: 48 + ThreadTileMXSB: 4 + TransposeLDS: 1 + TransposeLDSMetadata: -1 + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMXSA: true + UnrollMajorLDSMXSB: true + Use64bShadowLimit: false + Use64bShadowLimitMX: false + UseCustomMainLoopSchedule: 0 + UseDirect32XEmulation: false + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: false + UseGeneralizedNLCOneA: false + UseGeneralizedNLCOneB: false + UseGeneralizedNLCOneMXSA: false + UseGeneralizedNLCOneMXSB: false + UseGeneralizedNLCOneMetadata: false + UseInstOffsetForGRO: 0 + UseMFMAF32XEmulation: false + UsePLRPack: 0 + UseSgprForGRO: -1 + UseSubtileImpl: true + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 1 + VectorWidthMXSA: 1 + VectorWidthMXSB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMXSA: 0 + WaveSeparateGlobalReadMXSB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 16 + WorkGroupMappingXCC: 2 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 256 + _DepthUA: 256 + _DepthUB: 256 + _DepthUMXSA: 8 + _DepthUMXSB: 8 + _DepthUMetadata: 256 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + enableLDSTrMXSA: false + enableLDSTrMXSB: false + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + tailLoopOptMXSA: false + tailLoopOptMXSB: false + - 1LDSBuffer: 
0 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AdaptiveGemm: 0 + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1DivByMT1LowbitGT1: 0 + AssertFree1ElementMultiple: 1 + AssertKRingShiftTailWrapOnly: 0 + AssertSummationElementMultiple: 256 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BAddrInterleave: false + BaseName: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT32GHMtx7cclUEZq8HXp3EV83TsDigTCOy4HpQzcwTdNNQ= + BufferLoad: true + BufferStore: 1 + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 256 + DirectToLds: 1 + DirectToLdsA: true + DirectToLdsB: true + DirectToLdsMXSA: true + DirectToLdsMXSB: true + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprMXSA: false + DirectToVgprMXSB: false + DirectToVgprSparseMetadata: false + DtlPlusLdsBuf: 0 + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: false + ExpertSchedulingMode: 0 + ExtraLatencyForLR: 0 + ExtraMiLatencyLeft: -1 + ForceDisableShadowInit: false + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 32 + GlobalReadVectorWidthB: 32 + GlobalReadVectorWidthMXSA: 10 + GlobalReadVectorWidthMXSB: 4 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + KRingShift: false + Kernel: true + KernelLanguage: Assembly + KernelNameMin: 
Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT320x256x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT10_8_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 256 + LSCB: 256 + LSCMXSA: 8 + LSCMXSB: 8 + LSPA: 32 + LSPB: 32 + LSPMXSA: 320 + LSPMXSB: 128 + LVCA: 8 + LVCB: 8 + LVCMXSA: 1 + LVCMXSB: 2 + LVPA: 1 + LVPB: 1 + LVPMXSA: 32 + LVPMXSB: 32 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsBlockSizePerPadMXSA: 0 + LdsBlockSizePerPadMXSB: 0 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 156672 + LdsInitCVgprs: false + LdsNumBytes: 156672 + LdsNumElementsAlignedA: 40960 + LdsNumElementsAlignedB: 32768 + LdsNumElementsAlignedMXSA: 2560 + LdsNumElementsAlignedMXSB: 2048 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 78336 + LdsOffsetB: 45568 + LdsOffsetB_Blk: 123904 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMXSA: 40960 + LdsOffsetMXSA_Blk: 119296 + LdsOffsetMXSB: 43520 + LdsOffsetMXSB_Blk: 121856 + LdsOffsetMetadata: 45568 + LdsOffsetMetadata_Blk: 123904 + LdsPadA: 0 + LdsPadB: 0 + LdsPadMXSA: 0 + LdsPadMXSB: 0 + LdsPadMetadata: 0 + LocalReadVectorWidth: -1 + LocalReadVectorWidthA: 32 + LocalReadVectorWidthB: 32 + LocalReadVectorWidthMXSA: 1 + LocalReadVectorWidthMXSB: 1 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LocalWriteUseSgprMXSA: true + LocalWriteUseSgprMXSB: true + LoopIters: 2 + LoopUnroll: 256 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + 
MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMXSA: 1 + MIInputPerThreadMXSB: 1 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [10, 8] + MIWaveTileA: 10 + MIWaveTileB: 8 + MIWaveTileMXSA: 10 + MIWaveTileMXSB: 8 + MIWaveTileMetadata: 0 + MacroTile0: 320 + MacroTile1: 256 + MacroTileA: 320 + MacroTileB: 256 + MacroTileMXSA: 320 + MacroTileMXSB: 256 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + MinGRIncPerMfma: 3 + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonDTLTailLoopMXSA: true + NonDTLTailLoopMXSB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 4 + NonTemporalC: 0 + NonTemporalD: 4 + NonTemporalE: 0 + NonTemporalMXSA: 0 + NonTemporalMXSB: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 320 + NumGlobalWriteVectorsPerThread: 320 + NumLdsBlk: 2 + NumLoadsA: 10 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsCoalescedMXSA: 1 + NumLoadsCoalescedMXSB: 1 + NumLoadsMXSA: 1 + NumLoadsMXSB: 2 + NumLoadsPerpendicularA: 10 + NumLoadsPerpendicularB: 8 + NumLoadsPerpendicularMXSA: 1 + NumLoadsPerpendicularMXSB: 2 + NumThreads: 256 + NumTotalPackedLoadsA: -1 + NumTotalPackedLoadsB: -1 + NumTotalPackedLoadsMXSA: -1 + NumTotalPackedLoadsMXSB: -1 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGROverBarrier: 0 + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 26 + 
SolutionNameMin: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT320x256x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT10_8_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM16_WGMXCC2_WGMXCCGn1 + SourceSwap: false + SpaceFillingAlgo: [] + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: true + StoreSyncOpt: 0 + StoreVectorWidth: 4 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SubGroupMXSA: 8 + SubGroupMXSB: 32 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: 0 + TailloopInNll: false + ThreadTile: [1, 1] + ThreadTile0: 40 + ThreadTile1: 8 + ThreadTileA: 40 + ThreadTileB: 8 + ThreadTileMXSA: 40 + ThreadTileMXSB: 8 + TransposeLDS: 1 + TransposeLDSMetadata: -1 + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMXSA: true + UnrollMajorLDSMXSB: true + Use64bShadowLimit: false + Use64bShadowLimitMX: false + UseCustomMainLoopSchedule: 0 + UseDirect32XEmulation: false + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: false + UseGeneralizedNLCOneA: false + UseGeneralizedNLCOneB: false + UseGeneralizedNLCOneMXSA: false + UseGeneralizedNLCOneMXSB: false + UseGeneralizedNLCOneMetadata: false + UseInstOffsetForGRO: 0 + UseMFMAF32XEmulation: false + UsePLRPack: 0 + UseSgprForGRO: -1 + UseSubtileImpl: true + Valid: true + VectorStore: -1 + VectorWidthA: 1 + 
VectorWidthB: 1 + VectorWidthMXSA: 1 + VectorWidthMXSB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMXSA: 0 + WaveSeparateGlobalReadMXSB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 16 + WorkGroupMappingXCC: 2 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 256 + _DepthUA: 256 + _DepthUB: 256 + _DepthUMXSA: 8 + _DepthUMXSB: 8 + _DepthUMetadata: 256 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + enableLDSTrMXSA: false + enableLDSTrMXSB: false + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + tailLoopOptMXSA: false + tailLoopOptMXSB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AdaptiveGemm: 0 + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1DivByMT1LowbitGT1: 0 + AssertFree1ElementMultiple: 1 + AssertKRingShiftTailWrapOnly: 0 + AssertSummationElementMultiple: 256 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BAddrInterleave: false + BaseName: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT25yENsNJL4B9Vb5GlLD7ofBX9eYUJtt3JL_VzKdaPOs5g= + BufferLoad: true + BufferStore: 1 + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 256 + DirectToLds: 1 + DirectToLdsA: true + DirectToLdsB: true + DirectToLdsMXSA: true + DirectToLdsMXSB: true + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprMXSA: false + DirectToVgprMXSB: false + DirectToVgprSparseMetadata: false + 
DtlPlusLdsBuf: 0 + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: false + ExpertSchedulingMode: 0 + ExtraLatencyForLR: 0 + ExtraMiLatencyLeft: -1 + ForceDisableShadowInit: false + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 32 + GlobalReadVectorWidthB: 32 + GlobalReadVectorWidthMXSA: 4 + GlobalReadVectorWidthMXSB: 10 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + KRingShift: false + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT256x320x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT8_10_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 256 + LSCB: 256 + LSCMXSA: 8 + LSCMXSB: 8 + LSPA: 32 + LSPB: 32 + LSPMXSA: 128 + LSPMXSB: 320 + LVCA: 8 + LVCB: 8 + LVCMXSA: 2 + LVCMXSB: 1 + LVPA: 1 + LVPB: 1 + LVPMXSA: 32 + LVPMXSB: 32 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsBlockSizePerPadMXSA: 0 + LdsBlockSizePerPadMXSB: 0 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 156672 + LdsInitCVgprs: false + LdsNumBytes: 156672 + 
LdsNumElementsAlignedA: 32768 + LdsNumElementsAlignedB: 40960 + LdsNumElementsAlignedMXSA: 2048 + LdsNumElementsAlignedMXSB: 2560 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 78336 + LdsOffsetB: 37376 + LdsOffsetB_Blk: 115712 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMXSA: 32768 + LdsOffsetMXSA_Blk: 111104 + LdsOffsetMXSB: 34816 + LdsOffsetMXSB_Blk: 113152 + LdsOffsetMetadata: 37376 + LdsOffsetMetadata_Blk: 115712 + LdsPadA: 0 + LdsPadB: 0 + LdsPadMXSA: 0 + LdsPadMXSB: 0 + LdsPadMetadata: 0 + LocalReadVectorWidth: -1 + LocalReadVectorWidthA: 32 + LocalReadVectorWidthB: 32 + LocalReadVectorWidthMXSA: 1 + LocalReadVectorWidthMXSB: 1 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LocalWriteUseSgprMXSA: true + LocalWriteUseSgprMXSB: true + LoopIters: 2 + LoopUnroll: 256 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMXSA: 1 + MIInputPerThreadMXSB: 1 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [8, 10] + MIWaveTileA: 8 + MIWaveTileB: 10 + MIWaveTileMXSA: 8 + MIWaveTileMXSB: 10 + MIWaveTileMetadata: 0 + MacroTile0: 256 + MacroTile1: 320 + MacroTileA: 256 + MacroTileB: 320 + MacroTileMXSA: 256 + MacroTileMXSB: 320 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + MinGRIncPerMfma: 3 + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonDTLTailLoopMXSA: true + NonDTLTailLoopMXSB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 4 + NonTemporalC: 0 + NonTemporalD: 4 + 
NonTemporalE: 0 + NonTemporalMXSA: 0 + NonTemporalMXSB: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 320 + NumGlobalWriteVectorsPerThread: 320 + NumLdsBlk: 2 + NumLoadsA: 8 + NumLoadsB: 10 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsCoalescedMXSA: 1 + NumLoadsCoalescedMXSB: 1 + NumLoadsMXSA: 2 + NumLoadsMXSB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 10 + NumLoadsPerpendicularMXSA: 2 + NumLoadsPerpendicularMXSB: 1 + NumThreads: 256 + NumTotalPackedLoadsA: -1 + NumTotalPackedLoadsB: -1 + NumTotalPackedLoadsMXSA: -1 + NumTotalPackedLoadsMXSB: -1 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGROverBarrier: 0 + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 27 + SolutionNameMin: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT256x320x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT8_10_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM16_WGMXCC2_WGMXCCGn1 + SourceSwap: false + SpaceFillingAlgo: [] + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: true + StoreSyncOpt: 0 + StoreVectorWidth: 4 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + 
SubGroupB: 32 + SubGroupMXSA: 8 + SubGroupMXSB: 32 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: 0 + TailloopInNll: false + ThreadTile: [1, 1] + ThreadTile0: 32 + ThreadTile1: 10 + ThreadTileA: 32 + ThreadTileB: 10 + ThreadTileMXSA: 32 + ThreadTileMXSB: 10 + TransposeLDS: 1 + TransposeLDSMetadata: -1 + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMXSA: true + UnrollMajorLDSMXSB: true + Use64bShadowLimit: false + Use64bShadowLimitMX: false + UseCustomMainLoopSchedule: 0 + UseDirect32XEmulation: false + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: false + UseGeneralizedNLCOneA: false + UseGeneralizedNLCOneB: false + UseGeneralizedNLCOneMXSA: false + UseGeneralizedNLCOneMXSB: false + UseGeneralizedNLCOneMetadata: false + UseInstOffsetForGRO: 0 + UseMFMAF32XEmulation: false + UsePLRPack: 0 + UseSgprForGRO: -1 + UseSubtileImpl: true + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 1 + VectorWidthMXSA: 1 + VectorWidthMXSB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMXSA: 0 + WaveSeparateGlobalReadMXSB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 16 + WorkGroupMappingXCC: 2 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 256 + _DepthUA: 256 + _DepthUB: 256 + _DepthUMXSA: 8 + _DepthUMXSB: 8 + _DepthUMetadata: 256 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + enableLDSTrMXSA: false + enableLDSTrMXSB: false + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + tailLoopOptMXSA: false + tailLoopOptMXSB: false + - 
1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AdaptiveGemm: 0 + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1DivByMT1LowbitGT1: 0 + AssertFree1ElementMultiple: 1 + AssertKRingShiftTailWrapOnly: 0 + AssertSummationElementMultiple: 256 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BAddrInterleave: false + BaseName: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT64IqbWxZkPcZWy3vA8g9HsDHfQ7n72vJptvmQ79eUQ7kk= + BufferLoad: true + BufferStore: 1 + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 256 + DirectToLds: 1 + DirectToLdsA: true + DirectToLdsB: true + DirectToLdsMXSA: true + DirectToLdsMXSB: true + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprMXSA: false + DirectToVgprMXSB: false + DirectToVgprSparseMetadata: false + DtlPlusLdsBuf: 0 + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: false + ExpertSchedulingMode: 0 + ExtraLatencyForLR: 0 + ExtraMiLatencyLeft: -1 + ForceDisableShadowInit: false + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 32 + GlobalReadVectorWidthB: 32 + GlobalReadVectorWidthMXSA: 2 + GlobalReadVectorWidthMXSB: 14 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + KRingShift: false + Kernel: true + KernelLanguage: Assembly + KernelNameMin: 
Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT64x448x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT2_14_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 256 + LSCB: 256 + LSCMXSA: 8 + LSCMXSB: 8 + LSPA: 32 + LSPB: 32 + LSPMXSA: 64 + LSPMXSB: 448 + LVCA: 8 + LVCB: 8 + LVCMXSA: 4 + LVCMXSB: 1 + LVPA: 1 + LVPB: 1 + LVPMXSA: 32 + LVPMXSB: 32 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsBlockSizePerPadMXSA: 0 + LdsBlockSizePerPadMXSB: 0 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 139264 + LdsInitCVgprs: false + LdsNumBytes: 139264 + LdsNumElementsAlignedA: 8192 + LdsNumElementsAlignedB: 57344 + LdsNumElementsAlignedMXSA: 512 + LdsNumElementsAlignedMXSB: 3584 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 69632 + LdsOffsetB: 12288 + LdsOffsetB_Blk: 81920 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMXSA: 8192 + LdsOffsetMXSA_Blk: 77824 + LdsOffsetMXSB: 8704 + LdsOffsetMXSB_Blk: 78336 + LdsOffsetMetadata: 12288 + LdsOffsetMetadata_Blk: 81920 + LdsPadA: 0 + LdsPadB: 0 + LdsPadMXSA: 0 + LdsPadMXSB: 0 + LdsPadMetadata: 0 + LocalReadVectorWidth: -1 + LocalReadVectorWidthA: 32 + LocalReadVectorWidthB: 32 + LocalReadVectorWidthMXSA: 1 + LocalReadVectorWidthMXSB: 1 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LocalWriteUseSgprMXSA: true + LocalWriteUseSgprMXSB: true + LoopIters: 2 + LoopUnroll: 256 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + 
MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMXSA: 1 + MIInputPerThreadMXSB: 1 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [2, 14] + MIWaveTileA: 2 + MIWaveTileB: 14 + MIWaveTileMXSA: 2 + MIWaveTileMXSB: 14 + MIWaveTileMetadata: 0 + MacroTile0: 64 + MacroTile1: 448 + MacroTileA: 64 + MacroTileB: 448 + MacroTileMXSA: 64 + MacroTileMXSB: 448 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + MinGRIncPerMfma: 3 + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonDTLTailLoopMXSA: true + NonDTLTailLoopMXSB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 4 + NonTemporalC: 0 + NonTemporalD: 4 + NonTemporalE: 0 + NonTemporalMXSA: 0 + NonTemporalMXSB: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 112 + NumGlobalWriteVectorsPerThread: 112 + NumLdsBlk: 2 + NumLoadsA: 2 + NumLoadsB: 14 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsCoalescedMXSA: 1 + NumLoadsCoalescedMXSB: 1 + NumLoadsMXSA: 1 + NumLoadsMXSB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 14 + NumLoadsPerpendicularMXSA: 1 + NumLoadsPerpendicularMXSB: 1 + NumThreads: 256 + NumTotalPackedLoadsA: -1 + NumTotalPackedLoadsB: -1 + NumTotalPackedLoadsMXSA: -1 + NumTotalPackedLoadsMXSB: -1 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGROverBarrier: 0 + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 28 + 
SolutionNameMin: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT64x448x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT2_14_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM16_WGMXCC2_WGMXCCGn1 + SourceSwap: false + SpaceFillingAlgo: [] + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: true + StoreSyncOpt: 0 + StoreVectorWidth: 4 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SubGroupMXSA: 8 + SubGroupMXSB: 32 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: 0 + TailloopInNll: false + ThreadTile: [1, 1] + ThreadTile0: 8 + ThreadTile1: 14 + ThreadTileA: 8 + ThreadTileB: 14 + ThreadTileMXSA: 8 + ThreadTileMXSB: 14 + TransposeLDS: 1 + TransposeLDSMetadata: -1 + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMXSA: true + UnrollMajorLDSMXSB: true + Use64bShadowLimit: false + Use64bShadowLimitMX: false + UseCustomMainLoopSchedule: 0 + UseDirect32XEmulation: false + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: false + UseGeneralizedNLCOneA: false + UseGeneralizedNLCOneB: false + UseGeneralizedNLCOneMXSA: false + UseGeneralizedNLCOneMXSB: false + UseGeneralizedNLCOneMetadata: false + UseInstOffsetForGRO: 0 + UseMFMAF32XEmulation: false + UsePLRPack: 0 + UseSgprForGRO: -1 + UseSubtileImpl: true + Valid: true + VectorStore: -1 + VectorWidthA: 1 + 
VectorWidthB: 1 + VectorWidthMXSA: 1 + VectorWidthMXSB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMXSA: 0 + WaveSeparateGlobalReadMXSB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 16 + WorkGroupMappingXCC: 2 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 256 + _DepthUA: 256 + _DepthUB: 256 + _DepthUMXSA: 8 + _DepthUMXSB: 8 + _DepthUMetadata: 256 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + enableLDSTrMXSA: false + enableLDSTrMXSB: false + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + tailLoopOptMXSA: false + tailLoopOptMXSB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AdaptiveGemm: 0 + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1DivByMT1LowbitGT1: 0 + AssertFree1ElementMultiple: 1 + AssertKRingShiftTailWrapOnly: 0 + AssertSummationElementMultiple: 256 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BAddrInterleave: false + BaseName: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT44JIOkXLbpjhS22SFIEupmgY9bhtzcXu69kfhwBJBO5wo= + BufferLoad: true + BufferStore: 1 + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 256 + DirectToLds: 1 + DirectToLdsA: true + DirectToLdsB: true + DirectToLdsMXSA: true + DirectToLdsMXSB: true + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprMXSA: false + DirectToVgprMXSB: false + DirectToVgprSparseMetadata: false + 
DtlPlusLdsBuf: 0 + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: false + ExpertSchedulingMode: 0 + ExtraLatencyForLR: 0 + ExtraMiLatencyLeft: -1 + ForceDisableShadowInit: false + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 32 + GlobalReadVectorWidthB: 32 + GlobalReadVectorWidthMXSA: 14 + GlobalReadVectorWidthMXSB: 2 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + KRingShift: false + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT448x64x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT14_2_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 256 + LSCB: 256 + LSCMXSA: 8 + LSCMXSB: 8 + LSPA: 32 + LSPB: 32 + LSPMXSA: 448 + LSPMXSB: 64 + LVCA: 8 + LVCB: 8 + LVCMXSA: 1 + LVCMXSB: 4 + LVPA: 1 + LVPB: 1 + LVPMXSA: 32 + LVPMXSB: 32 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsBlockSizePerPadMXSA: 0 + LdsBlockSizePerPadMXSB: 0 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 139264 + LdsInitCVgprs: false + LdsNumBytes: 139264 + 
LdsNumElementsAlignedA: 57344 + LdsNumElementsAlignedB: 8192 + LdsNumElementsAlignedMXSA: 3584 + LdsNumElementsAlignedMXSB: 512 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 69632 + LdsOffsetB: 61440 + LdsOffsetB_Blk: 131072 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMXSA: 57344 + LdsOffsetMXSA_Blk: 126976 + LdsOffsetMXSB: 60928 + LdsOffsetMXSB_Blk: 130560 + LdsOffsetMetadata: 61440 + LdsOffsetMetadata_Blk: 131072 + LdsPadA: 0 + LdsPadB: 0 + LdsPadMXSA: 0 + LdsPadMXSB: 0 + LdsPadMetadata: 0 + LocalReadVectorWidth: -1 + LocalReadVectorWidthA: 32 + LocalReadVectorWidthB: 32 + LocalReadVectorWidthMXSA: 1 + LocalReadVectorWidthMXSB: 1 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LocalWriteUseSgprMXSA: true + LocalWriteUseSgprMXSB: true + LoopIters: 2 + LoopUnroll: 256 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMXSA: 1 + MIInputPerThreadMXSB: 1 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [14, 2] + MIWaveTileA: 14 + MIWaveTileB: 2 + MIWaveTileMXSA: 14 + MIWaveTileMXSB: 2 + MIWaveTileMetadata: 0 + MacroTile0: 448 + MacroTile1: 64 + MacroTileA: 448 + MacroTileB: 64 + MacroTileMXSA: 448 + MacroTileMXSB: 64 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + MinGRIncPerMfma: 3 + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonDTLTailLoopMXSA: true + NonDTLTailLoopMXSB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 4 + NonTemporalC: 0 + NonTemporalD: 4 + 
NonTemporalE: 0 + NonTemporalMXSA: 0 + NonTemporalMXSB: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 112 + NumGlobalWriteVectorsPerThread: 112 + NumLdsBlk: 2 + NumLoadsA: 14 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsCoalescedMXSA: 1 + NumLoadsCoalescedMXSB: 1 + NumLoadsMXSA: 1 + NumLoadsMXSB: 1 + NumLoadsPerpendicularA: 14 + NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularMXSA: 1 + NumLoadsPerpendicularMXSB: 1 + NumThreads: 256 + NumTotalPackedLoadsA: -1 + NumTotalPackedLoadsB: -1 + NumTotalPackedLoadsMXSA: -1 + NumTotalPackedLoadsMXSB: -1 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGROverBarrier: 0 + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 29 + SolutionNameMin: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT448x64x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT14_2_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM16_WGMXCC2_WGMXCCGn1 + SourceSwap: false + SpaceFillingAlgo: [] + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: true + StoreSyncOpt: 0 + StoreVectorWidth: 4 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 
32 + SubGroupMXSA: 8 + SubGroupMXSB: 32 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: 0 + TailloopInNll: false + ThreadTile: [1, 1] + ThreadTile0: 56 + ThreadTile1: 2 + ThreadTileA: 56 + ThreadTileB: 2 + ThreadTileMXSA: 56 + ThreadTileMXSB: 2 + TransposeLDS: 1 + TransposeLDSMetadata: -1 + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMXSA: true + UnrollMajorLDSMXSB: true + Use64bShadowLimit: false + Use64bShadowLimitMX: false + UseCustomMainLoopSchedule: 0 + UseDirect32XEmulation: false + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: false + UseGeneralizedNLCOneA: false + UseGeneralizedNLCOneB: false + UseGeneralizedNLCOneMXSA: false + UseGeneralizedNLCOneMXSB: false + UseGeneralizedNLCOneMetadata: false + UseInstOffsetForGRO: 0 + UseMFMAF32XEmulation: false + UsePLRPack: 0 + UseSgprForGRO: -1 + UseSubtileImpl: true + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 1 + VectorWidthMXSA: 1 + VectorWidthMXSB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMXSA: 0 + WaveSeparateGlobalReadMXSB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 16 + WorkGroupMappingXCC: 2 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 256 + _DepthUA: 256 + _DepthUB: 256 + _DepthUMXSA: 8 + _DepthUMXSB: 8 + _DepthUMetadata: 256 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + enableLDSTrMXSA: false + enableLDSTrMXSB: false + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + tailLoopOptMXSA: false + tailLoopOptMXSB: false + - 1LDSBuffer: 0 + 
ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AdaptiveGemm: 0 + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1DivByMT1LowbitGT1: 0 + AssertFree1ElementMultiple: 1 + AssertKRingShiftTailWrapOnly: 0 + AssertSummationElementMultiple: 256 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BAddrInterleave: false + BaseName: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT12NCUb78G7QzN6ng2b3-8rphTRtJXo20S1KvmwH6k1FLQ= + BufferLoad: true + BufferStore: 1 + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 256 + DirectToLds: 1 + DirectToLdsA: true + DirectToLdsB: true + DirectToLdsMXSA: true + DirectToLdsMXSB: true + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprMXSA: false + DirectToVgprMXSB: false + DirectToVgprSparseMetadata: false + DtlPlusLdsBuf: 0 + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: false + ExpertSchedulingMode: 0 + ExtraLatencyForLR: 0 + ExtraMiLatencyLeft: -1 + ForceDisableShadowInit: false + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 32 + GlobalReadVectorWidthB: 32 + GlobalReadVectorWidthMXSA: 4 + GlobalReadVectorWidthMXSB: 14 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + KRingShift: false + Kernel: true + KernelLanguage: Assembly + KernelNameMin: 
Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT128x448x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT4_14_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 256 + LSCB: 256 + LSCMXSA: 8 + LSCMXSB: 8 + LSPA: 32 + LSPB: 32 + LSPMXSA: 128 + LSPMXSB: 448 + LVCA: 8 + LVCB: 8 + LVCMXSA: 2 + LVCMXSB: 1 + LVPA: 1 + LVPB: 1 + LVPMXSA: 32 + LVPMXSB: 32 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsBlockSizePerPadMXSA: 0 + LdsBlockSizePerPadMXSB: 0 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 156672 + LdsInitCVgprs: false + LdsNumBytes: 156672 + LdsNumElementsAlignedA: 16384 + LdsNumElementsAlignedB: 57344 + LdsNumElementsAlignedMXSA: 1024 + LdsNumElementsAlignedMXSB: 3584 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 78336 + LdsOffsetB: 20992 + LdsOffsetB_Blk: 99328 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMXSA: 16384 + LdsOffsetMXSA_Blk: 94720 + LdsOffsetMXSB: 17408 + LdsOffsetMXSB_Blk: 95744 + LdsOffsetMetadata: 20992 + LdsOffsetMetadata_Blk: 99328 + LdsPadA: 0 + LdsPadB: 0 + LdsPadMXSA: 0 + LdsPadMXSB: 0 + LdsPadMetadata: 0 + LocalReadVectorWidth: -1 + LocalReadVectorWidthA: 32 + LocalReadVectorWidthB: 32 + LocalReadVectorWidthMXSA: 1 + LocalReadVectorWidthMXSB: 1 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LocalWriteUseSgprMXSA: true + LocalWriteUseSgprMXSB: true + LoopIters: 2 + LoopUnroll: 256 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + 
MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMXSA: 1 + MIInputPerThreadMXSB: 1 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 14] + MIWaveTileA: 4 + MIWaveTileB: 14 + MIWaveTileMXSA: 4 + MIWaveTileMXSB: 14 + MIWaveTileMetadata: 0 + MacroTile0: 128 + MacroTile1: 448 + MacroTileA: 128 + MacroTileB: 448 + MacroTileMXSA: 128 + MacroTileMXSB: 448 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + MinGRIncPerMfma: 3 + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonDTLTailLoopMXSA: true + NonDTLTailLoopMXSB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 4 + NonTemporalC: 0 + NonTemporalD: 4 + NonTemporalE: 0 + NonTemporalMXSA: 0 + NonTemporalMXSB: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 224 + NumGlobalWriteVectorsPerThread: 224 + NumLdsBlk: 2 + NumLoadsA: 4 + NumLoadsB: 14 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsCoalescedMXSA: 1 + NumLoadsCoalescedMXSB: 1 + NumLoadsMXSA: 1 + NumLoadsMXSB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 14 + NumLoadsPerpendicularMXSA: 1 + NumLoadsPerpendicularMXSB: 1 + NumThreads: 256 + NumTotalPackedLoadsA: -1 + NumTotalPackedLoadsB: -1 + NumTotalPackedLoadsMXSA: -1 + NumTotalPackedLoadsMXSB: -1 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGROverBarrier: 0 + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 30 + 
SolutionNameMin: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT128x448x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT4_14_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM16_WGMXCC2_WGMXCCGn1 + SourceSwap: false + SpaceFillingAlgo: [] + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: true + StoreSyncOpt: 0 + StoreVectorWidth: 4 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SubGroupMXSA: 8 + SubGroupMXSB: 32 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: 0 + TailloopInNll: false + ThreadTile: [1, 1] + ThreadTile0: 16 + ThreadTile1: 14 + ThreadTileA: 16 + ThreadTileB: 14 + ThreadTileMXSA: 16 + ThreadTileMXSB: 14 + TransposeLDS: 1 + TransposeLDSMetadata: -1 + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMXSA: true + UnrollMajorLDSMXSB: true + Use64bShadowLimit: false + Use64bShadowLimitMX: false + UseCustomMainLoopSchedule: 0 + UseDirect32XEmulation: false + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: false + UseGeneralizedNLCOneA: false + UseGeneralizedNLCOneB: false + UseGeneralizedNLCOneMXSA: false + UseGeneralizedNLCOneMXSB: false + UseGeneralizedNLCOneMetadata: false + UseInstOffsetForGRO: 0 + UseMFMAF32XEmulation: false + UsePLRPack: 0 + UseSgprForGRO: -1 + UseSubtileImpl: true + Valid: true + VectorStore: -1 + VectorWidthA: 1 
+ VectorWidthB: 1 + VectorWidthMXSA: 1 + VectorWidthMXSB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMXSA: 0 + WaveSeparateGlobalReadMXSB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 16 + WorkGroupMappingXCC: 2 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 256 + _DepthUA: 256 + _DepthUB: 256 + _DepthUMXSA: 8 + _DepthUMXSB: 8 + _DepthUMetadata: 256 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + enableLDSTrMXSA: false + enableLDSTrMXSB: false + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + tailLoopOptMXSA: false + tailLoopOptMXSB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AdaptiveGemm: 0 + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1DivByMT1LowbitGT1: 0 + AssertFree1ElementMultiple: 1 + AssertKRingShiftTailWrapOnly: 0 + AssertSummationElementMultiple: 256 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BAddrInterleave: false + BaseName: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT44K2rcrzW2gkNghqcoHIxk3lU0xSXGNO6q0t-lzqdm54M= + BufferLoad: true + BufferStore: 1 + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 256 + DirectToLds: 1 + DirectToLdsA: true + DirectToLdsB: true + DirectToLdsMXSA: true + DirectToLdsMXSB: true + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprMXSA: false + DirectToVgprMXSB: false + DirectToVgprSparseMetadata: false 
+ DtlPlusLdsBuf: 0 + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: false + ExpertSchedulingMode: 0 + ExtraLatencyForLR: 0 + ExtraMiLatencyLeft: -1 + ForceDisableShadowInit: false + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 32 + GlobalReadVectorWidthB: 32 + GlobalReadVectorWidthMXSA: 14 + GlobalReadVectorWidthMXSB: 4 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + KRingShift: false + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT448x128x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT14_4_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 256 + LSCB: 256 + LSCMXSA: 8 + LSCMXSB: 8 + LSPA: 32 + LSPB: 32 + LSPMXSA: 448 + LSPMXSB: 128 + LVCA: 8 + LVCB: 8 + LVCMXSA: 1 + LVCMXSB: 2 + LVPA: 1 + LVPB: 1 + LVPMXSA: 32 + LVPMXSB: 32 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsBlockSizePerPadMXSA: 0 + LdsBlockSizePerPadMXSB: 0 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 156672 + LdsInitCVgprs: false + LdsNumBytes: 156672 + 
LdsNumElementsAlignedA: 57344 + LdsNumElementsAlignedB: 16384 + LdsNumElementsAlignedMXSA: 3584 + LdsNumElementsAlignedMXSB: 1024 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 78336 + LdsOffsetB: 61952 + LdsOffsetB_Blk: 140288 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMXSA: 57344 + LdsOffsetMXSA_Blk: 135680 + LdsOffsetMXSB: 60928 + LdsOffsetMXSB_Blk: 139264 + LdsOffsetMetadata: 61952 + LdsOffsetMetadata_Blk: 140288 + LdsPadA: 0 + LdsPadB: 0 + LdsPadMXSA: 0 + LdsPadMXSB: 0 + LdsPadMetadata: 0 + LocalReadVectorWidth: -1 + LocalReadVectorWidthA: 32 + LocalReadVectorWidthB: 32 + LocalReadVectorWidthMXSA: 1 + LocalReadVectorWidthMXSB: 1 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LocalWriteUseSgprMXSA: true + LocalWriteUseSgprMXSB: true + LoopIters: 2 + LoopUnroll: 256 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMXSA: 1 + MIInputPerThreadMXSB: 1 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [14, 4] + MIWaveTileA: 14 + MIWaveTileB: 4 + MIWaveTileMXSA: 14 + MIWaveTileMXSB: 4 + MIWaveTileMetadata: 0 + MacroTile0: 448 + MacroTile1: 128 + MacroTileA: 448 + MacroTileB: 128 + MacroTileMXSA: 448 + MacroTileMXSB: 128 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + MinGRIncPerMfma: 3 + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonDTLTailLoopMXSA: true + NonDTLTailLoopMXSB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 4 + NonTemporalC: 0 + NonTemporalD: 4 + 
NonTemporalE: 0 + NonTemporalMXSA: 0 + NonTemporalMXSB: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 224 + NumGlobalWriteVectorsPerThread: 224 + NumLdsBlk: 2 + NumLoadsA: 14 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsCoalescedMXSA: 1 + NumLoadsCoalescedMXSB: 1 + NumLoadsMXSA: 1 + NumLoadsMXSB: 1 + NumLoadsPerpendicularA: 14 + NumLoadsPerpendicularB: 4 + NumLoadsPerpendicularMXSA: 1 + NumLoadsPerpendicularMXSB: 1 + NumThreads: 256 + NumTotalPackedLoadsA: -1 + NumTotalPackedLoadsB: -1 + NumTotalPackedLoadsMXSA: -1 + NumTotalPackedLoadsMXSB: -1 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGROverBarrier: 0 + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 31 + SolutionNameMin: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT448x128x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT14_4_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM16_WGMXCC2_WGMXCCGn1 + SourceSwap: false + SpaceFillingAlgo: [] + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: true + StoreSyncOpt: 0 + StoreVectorWidth: 4 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + 
SubGroupB: 32 + SubGroupMXSA: 8 + SubGroupMXSB: 32 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: 0 + TailloopInNll: false + ThreadTile: [1, 1] + ThreadTile0: 56 + ThreadTile1: 4 + ThreadTileA: 56 + ThreadTileB: 4 + ThreadTileMXSA: 56 + ThreadTileMXSB: 4 + TransposeLDS: 1 + TransposeLDSMetadata: -1 + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMXSA: true + UnrollMajorLDSMXSB: true + Use64bShadowLimit: false + Use64bShadowLimitMX: false + UseCustomMainLoopSchedule: 0 + UseDirect32XEmulation: false + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: false + UseGeneralizedNLCOneA: false + UseGeneralizedNLCOneB: false + UseGeneralizedNLCOneMXSA: false + UseGeneralizedNLCOneMXSB: false + UseGeneralizedNLCOneMetadata: false + UseInstOffsetForGRO: 0 + UseMFMAF32XEmulation: false + UsePLRPack: 0 + UseSgprForGRO: -1 + UseSubtileImpl: true + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 1 + VectorWidthMXSA: 1 + VectorWidthMXSB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMXSA: 0 + WaveSeparateGlobalReadMXSB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 16 + WorkGroupMappingXCC: 2 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 256 + _DepthUA: 256 + _DepthUB: 256 + _DepthUMXSA: 8 + _DepthUMXSB: 8 + _DepthUMetadata: 256 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + enableLDSTrMXSA: false + enableLDSTrMXSB: false + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + tailLoopOptMXSA: false + tailLoopOptMXSB: false + - 1LDSBuffer: 
0 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AdaptiveGemm: 0 + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1DivByMT1LowbitGT1: 0 + AssertFree1ElementMultiple: 1 + AssertKRingShiftTailWrapOnly: 0 + AssertSummationElementMultiple: 256 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BAddrInterleave: false + BaseName: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT197rqXtfRTrgckooWIHV9QRZkAUvBYDajyR7fyXPSaOfI= + BufferLoad: true + BufferStore: 1 + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 256 + DirectToLds: 1 + DirectToLdsA: true + DirectToLdsB: true + DirectToLdsMXSA: true + DirectToLdsMXSB: true + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprMXSA: false + DirectToVgprMXSB: false + DirectToVgprSparseMetadata: false + DtlPlusLdsBuf: 0 + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: false + ExpertSchedulingMode: 0 + ExtraLatencyForLR: 0 + ExtraMiLatencyLeft: -1 + ForceDisableShadowInit: false + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 32 + GlobalReadVectorWidthB: 32 + GlobalReadVectorWidthMXSA: 6 + GlobalReadVectorWidthMXSB: 12 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + KRingShift: false + Kernel: true + KernelLanguage: Assembly + KernelNameMin: 
Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT192x384x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT6_12_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 256 + LSCB: 256 + LSCMXSA: 8 + LSCMXSB: 8 + LSPA: 32 + LSPB: 32 + LSPMXSA: 192 + LSPMXSB: 384 + LVCA: 8 + LVCB: 8 + LVCMXSA: 2 + LVCMXSB: 1 + LVPA: 1 + LVPB: 1 + LVPMXSA: 32 + LVPMXSB: 32 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsBlockSizePerPadMXSA: 0 + LdsBlockSizePerPadMXSB: 0 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 156672 + LdsInitCVgprs: false + LdsNumBytes: 156672 + LdsNumElementsAlignedA: 24576 + LdsNumElementsAlignedB: 49152 + LdsNumElementsAlignedMXSA: 1536 + LdsNumElementsAlignedMXSB: 3072 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 78336 + LdsOffsetB: 29184 + LdsOffsetB_Blk: 107520 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMXSA: 24576 + LdsOffsetMXSA_Blk: 102912 + LdsOffsetMXSB: 26112 + LdsOffsetMXSB_Blk: 104448 + LdsOffsetMetadata: 29184 + LdsOffsetMetadata_Blk: 107520 + LdsPadA: 0 + LdsPadB: 0 + LdsPadMXSA: 0 + LdsPadMXSB: 0 + LdsPadMetadata: 0 + LocalReadVectorWidth: -1 + LocalReadVectorWidthA: 32 + LocalReadVectorWidthB: 32 + LocalReadVectorWidthMXSA: 1 + LocalReadVectorWidthMXSB: 1 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LocalWriteUseSgprMXSA: true + LocalWriteUseSgprMXSB: true + LoopIters: 2 + LoopUnroll: 256 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + 
MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMXSA: 1 + MIInputPerThreadMXSB: 1 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [6, 12] + MIWaveTileA: 6 + MIWaveTileB: 12 + MIWaveTileMXSA: 6 + MIWaveTileMXSB: 12 + MIWaveTileMetadata: 0 + MacroTile0: 192 + MacroTile1: 384 + MacroTileA: 192 + MacroTileB: 384 + MacroTileMXSA: 192 + MacroTileMXSB: 384 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + MinGRIncPerMfma: 3 + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonDTLTailLoopMXSA: true + NonDTLTailLoopMXSB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 4 + NonTemporalC: 0 + NonTemporalD: 4 + NonTemporalE: 0 + NonTemporalMXSA: 0 + NonTemporalMXSB: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 288 + NumGlobalWriteVectorsPerThread: 288 + NumLdsBlk: 2 + NumLoadsA: 6 + NumLoadsB: 12 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsCoalescedMXSA: 1 + NumLoadsCoalescedMXSB: 1 + NumLoadsMXSA: 1 + NumLoadsMXSB: 1 + NumLoadsPerpendicularA: 6 + NumLoadsPerpendicularB: 12 + NumLoadsPerpendicularMXSA: 1 + NumLoadsPerpendicularMXSB: 1 + NumThreads: 256 + NumTotalPackedLoadsA: -1 + NumTotalPackedLoadsB: -1 + NumTotalPackedLoadsMXSA: -1 + NumTotalPackedLoadsMXSB: -1 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGROverBarrier: 0 + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 32 + 
SolutionNameMin: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT192x384x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT6_12_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM16_WGMXCC2_WGMXCCGn1 + SourceSwap: false + SpaceFillingAlgo: [] + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: true + StoreSyncOpt: 0 + StoreVectorWidth: 4 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SubGroupMXSA: 8 + SubGroupMXSB: 32 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: 0 + TailloopInNll: false + ThreadTile: [1, 1] + ThreadTile0: 24 + ThreadTile1: 12 + ThreadTileA: 24 + ThreadTileB: 12 + ThreadTileMXSA: 24 + ThreadTileMXSB: 12 + TransposeLDS: 1 + TransposeLDSMetadata: -1 + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMXSA: true + UnrollMajorLDSMXSB: true + Use64bShadowLimit: false + Use64bShadowLimitMX: false + UseCustomMainLoopSchedule: 0 + UseDirect32XEmulation: false + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: false + UseGeneralizedNLCOneA: false + UseGeneralizedNLCOneB: false + UseGeneralizedNLCOneMXSA: false + UseGeneralizedNLCOneMXSB: false + UseGeneralizedNLCOneMetadata: false + UseInstOffsetForGRO: 0 + UseMFMAF32XEmulation: false + UsePLRPack: 0 + UseSgprForGRO: -1 + UseSubtileImpl: true + Valid: true + VectorStore: -1 + VectorWidthA: 1 
+ VectorWidthB: 1 + VectorWidthMXSA: 1 + VectorWidthMXSB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMXSA: 0 + WaveSeparateGlobalReadMXSB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 16 + WorkGroupMappingXCC: 2 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 256 + _DepthUA: 256 + _DepthUB: 256 + _DepthUMXSA: 8 + _DepthUMXSB: 8 + _DepthUMetadata: 256 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + enableLDSTrMXSA: false + enableLDSTrMXSB: false + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + tailLoopOptMXSA: false + tailLoopOptMXSB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AdaptiveGemm: 0 + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1DivByMT1LowbitGT1: 0 + AssertFree1ElementMultiple: 1 + AssertKRingShiftTailWrapOnly: 0 + AssertSummationElementMultiple: 256 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BAddrInterleave: false + BaseName: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT38M4Hg58ewW76BBGjsIKCCfd9HK3CXsd22WqnddQrMdIo= + BufferLoad: true + BufferStore: 1 + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 256 + DirectToLds: 1 + DirectToLdsA: true + DirectToLdsB: true + DirectToLdsMXSA: true + DirectToLdsMXSB: true + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprMXSA: false + DirectToVgprMXSB: false + DirectToVgprSparseMetadata: false 
+ DtlPlusLdsBuf: 0 + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: false + ExpertSchedulingMode: 0 + ExtraLatencyForLR: 0 + ExtraMiLatencyLeft: -1 + ForceDisableShadowInit: false + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 32 + GlobalReadVectorWidthB: 32 + GlobalReadVectorWidthMXSA: 12 + GlobalReadVectorWidthMXSB: 6 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + KRingShift: false + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT384x192x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT12_6_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 256 + LSCB: 256 + LSCMXSA: 8 + LSCMXSB: 8 + LSPA: 32 + LSPB: 32 + LSPMXSA: 384 + LSPMXSB: 192 + LVCA: 8 + LVCB: 8 + LVCMXSA: 1 + LVCMXSB: 2 + LVPA: 1 + LVPB: 1 + LVPMXSA: 32 + LVPMXSB: 32 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsBlockSizePerPadMXSA: 0 + LdsBlockSizePerPadMXSB: 0 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 156672 + LdsInitCVgprs: false + LdsNumBytes: 156672 + 
LdsNumElementsAlignedA: 49152 + LdsNumElementsAlignedB: 24576 + LdsNumElementsAlignedMXSA: 3072 + LdsNumElementsAlignedMXSB: 1536 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 78336 + LdsOffsetB: 53760 + LdsOffsetB_Blk: 132096 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMXSA: 49152 + LdsOffsetMXSA_Blk: 127488 + LdsOffsetMXSB: 52224 + LdsOffsetMXSB_Blk: 130560 + LdsOffsetMetadata: 53760 + LdsOffsetMetadata_Blk: 132096 + LdsPadA: 0 + LdsPadB: 0 + LdsPadMXSA: 0 + LdsPadMXSB: 0 + LdsPadMetadata: 0 + LocalReadVectorWidth: -1 + LocalReadVectorWidthA: 32 + LocalReadVectorWidthB: 32 + LocalReadVectorWidthMXSA: 1 + LocalReadVectorWidthMXSB: 1 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LocalWriteUseSgprMXSA: true + LocalWriteUseSgprMXSB: true + LoopIters: 2 + LoopUnroll: 256 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMXSA: 1 + MIInputPerThreadMXSB: 1 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [12, 6] + MIWaveTileA: 12 + MIWaveTileB: 6 + MIWaveTileMXSA: 12 + MIWaveTileMXSB: 6 + MIWaveTileMetadata: 0 + MacroTile0: 384 + MacroTile1: 192 + MacroTileA: 384 + MacroTileB: 192 + MacroTileMXSA: 384 + MacroTileMXSB: 192 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + MinGRIncPerMfma: 3 + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonDTLTailLoopMXSA: true + NonDTLTailLoopMXSB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 4 + NonTemporalC: 0 + NonTemporalD: 4 + 
NonTemporalE: 0 + NonTemporalMXSA: 0 + NonTemporalMXSB: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 288 + NumGlobalWriteVectorsPerThread: 288 + NumLdsBlk: 2 + NumLoadsA: 12 + NumLoadsB: 6 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsCoalescedMXSA: 1 + NumLoadsCoalescedMXSB: 1 + NumLoadsMXSA: 1 + NumLoadsMXSB: 1 + NumLoadsPerpendicularA: 12 + NumLoadsPerpendicularB: 6 + NumLoadsPerpendicularMXSA: 1 + NumLoadsPerpendicularMXSB: 1 + NumThreads: 256 + NumTotalPackedLoadsA: -1 + NumTotalPackedLoadsB: -1 + NumTotalPackedLoadsMXSA: -1 + NumTotalPackedLoadsMXSB: -1 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGROverBarrier: 0 + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 33 + SolutionNameMin: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT384x192x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT12_6_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM16_WGMXCC2_WGMXCCGn1 + SourceSwap: false + SpaceFillingAlgo: [] + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: true + StoreSyncOpt: 0 + StoreVectorWidth: 4 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + 
SubGroupB: 32 + SubGroupMXSA: 8 + SubGroupMXSB: 32 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: 0 + TailloopInNll: false + ThreadTile: [1, 1] + ThreadTile0: 48 + ThreadTile1: 6 + ThreadTileA: 48 + ThreadTileB: 6 + ThreadTileMXSA: 48 + ThreadTileMXSB: 6 + TransposeLDS: 1 + TransposeLDSMetadata: -1 + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMXSA: true + UnrollMajorLDSMXSB: true + Use64bShadowLimit: false + Use64bShadowLimitMX: false + UseCustomMainLoopSchedule: 0 + UseDirect32XEmulation: false + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: false + UseGeneralizedNLCOneA: false + UseGeneralizedNLCOneB: false + UseGeneralizedNLCOneMXSA: false + UseGeneralizedNLCOneMXSB: false + UseGeneralizedNLCOneMetadata: false + UseInstOffsetForGRO: 0 + UseMFMAF32XEmulation: false + UsePLRPack: 0 + UseSgprForGRO: -1 + UseSubtileImpl: true + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 1 + VectorWidthMXSA: 1 + VectorWidthMXSB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMXSA: 0 + WaveSeparateGlobalReadMXSB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 16 + WorkGroupMappingXCC: 2 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 256 + _DepthUA: 256 + _DepthUB: 256 + _DepthUMXSA: 8 + _DepthUMXSB: 8 + _DepthUMetadata: 256 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + enableLDSTrMXSA: false + enableLDSTrMXSB: false + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + tailLoopOptMXSA: false + tailLoopOptMXSB: false + - 1LDSBuffer: 
0 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AdaptiveGemm: 0 + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1DivByMT1LowbitGT1: 0 + AssertFree1ElementMultiple: 1 + AssertKRingShiftTailWrapOnly: 0 + AssertSummationElementMultiple: 256 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BAddrInterleave: false + BaseName: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT12pSBqfd5dzj78eX1t5EQJzYZ5aHFVtQ9uJHS361sw3qY= + BufferLoad: true + BufferStore: 1 + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 256 + DirectToLds: 1 + DirectToLdsA: true + DirectToLdsB: true + DirectToLdsMXSA: true + DirectToLdsMXSB: true + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprMXSA: false + DirectToVgprMXSB: false + DirectToVgprSparseMetadata: false + DtlPlusLdsBuf: 0 + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: false + ExpertSchedulingMode: 0 + ExtraLatencyForLR: 0 + ExtraMiLatencyLeft: -1 + ForceDisableShadowInit: false + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 32 + GlobalReadVectorWidthB: 32 + GlobalReadVectorWidthMXSA: 4 + GlobalReadVectorWidthMXSB: 1 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + KRingShift: false + Kernel: true + KernelLanguage: Assembly + KernelNameMin: 
Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT128x32x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT2_2_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1 + LDSTrInst: false + LSCA: 256 + LSCB: 256 + LSCMXSA: 8 + LSCMXSB: 8 + LSPA: 32 + LSPB: 32 + LSPMXSA: 128 + LSPMXSB: 32 + LVCA: 8 + LVCB: 8 + LVCMXSA: 2 + LVCMXSB: 8 + LVPA: 1 + LVPB: 1 + LVPMXSA: 32 + LVPMXSB: 32 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsBlockSizePerPadMXSA: 0 + LdsBlockSizePerPadMXSB: 0 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 54528 + LdsInitCVgprs: false + LdsNumBytes: 54528 + LdsNumElementsAlignedA: 16384 + LdsNumElementsAlignedB: 4096 + LdsNumElementsAlignedMXSA: 1024 + LdsNumElementsAlignedMXSB: 256 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 32768 + LdsOffsetB: 17664 + LdsOffsetB_Blk: 50432 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMXSA: 16384 + LdsOffsetMXSA_Blk: 49152 + LdsOffsetMXSB: 17408 + LdsOffsetMXSB_Blk: 50176 + LdsOffsetMetadata: 17664 + LdsOffsetMetadata_Blk: 50432 + LdsPadA: 0 + LdsPadB: 0 + LdsPadMXSA: 0 + LdsPadMXSB: 0 + LdsPadMetadata: 0 + LocalReadVectorWidth: -1 + LocalReadVectorWidthA: 32 + LocalReadVectorWidthB: 32 + LocalReadVectorWidthMXSA: 1 + LocalReadVectorWidthMXSB: 1 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LocalWriteUseSgprMXSA: true + LocalWriteUseSgprMXSB: true + LoopIters: 2 + LoopUnroll: 256 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + 
MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMXSA: 1 + MIInputPerThreadMXSB: 1 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [4, 1] + MIWaveTile: [2, 2] + MIWaveTileA: 2 + MIWaveTileB: 2 + MIWaveTileMXSA: 2 + MIWaveTileMXSB: 2 + MIWaveTileMetadata: 0 + MacroTile0: 128 + MacroTile1: 32 + MacroTileA: 128 + MacroTileB: 32 + MacroTileMXSA: 128 + MacroTileMXSB: 32 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + MinGRIncPerMfma: 3 + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonDTLTailLoopMXSA: true + NonDTLTailLoopMXSB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 4 + NonTemporalC: 0 + NonTemporalD: 4 + NonTemporalE: 0 + NonTemporalMXSA: 0 + NonTemporalMXSB: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 16 + NumLdsBlk: 2 + NumLoadsA: 4 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsCoalescedMXSA: 1 + NumLoadsCoalescedMXSB: 1 + NumLoadsMXSA: 1 + NumLoadsMXSB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularMXSA: 1 + NumLoadsPerpendicularMXSB: 1 + NumThreads: 256 + NumTotalPackedLoadsA: -1 + NumTotalPackedLoadsB: -1 + NumTotalPackedLoadsMXSA: -1 + NumTotalPackedLoadsMXSB: -1 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGROverBarrier: 0 + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 34 + SolutionNameMin: 
Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT128x32x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT2_2_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM16_WGMXCC2_WGMXCCGn1 + SourceSwap: false + SpaceFillingAlgo: [] + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 4 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SubGroupMXSA: 16 + SubGroupMXSB: 16 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: 0 + TailloopInNll: false + ThreadTile: [1, 1] + ThreadTile0: 8 + ThreadTile1: 2 + ThreadTileA: 8 + ThreadTileB: 2 + ThreadTileMXSA: 8 + ThreadTileMXSB: 2 + TransposeLDS: 1 + TransposeLDSMetadata: -1 + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMXSA: true + UnrollMajorLDSMXSB: true + Use64bShadowLimit: false + Use64bShadowLimitMX: false + UseCustomMainLoopSchedule: 0 + UseDirect32XEmulation: false + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: false + UseGeneralizedNLCOneA: false + UseGeneralizedNLCOneB: false + UseGeneralizedNLCOneMXSA: false + UseGeneralizedNLCOneMXSB: false + UseGeneralizedNLCOneMetadata: false + UseInstOffsetForGRO: 0 + UseMFMAF32XEmulation: false + UsePLRPack: 0 + UseSgprForGRO: -1 + UseSubtileImpl: true + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 1 + 
VectorWidthMXSA: 1 + VectorWidthMXSB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMXSA: 0 + WaveSeparateGlobalReadMXSB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [64, 4, 1] + WorkGroupMapping: 16 + WorkGroupMappingXCC: 2 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 256 + _DepthUA: 256 + _DepthUB: 256 + _DepthUMXSA: 8 + _DepthUMXSB: 8 + _DepthUMetadata: 256 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + enableLDSTrMXSA: false + enableLDSTrMXSB: false + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + tailLoopOptMXSA: false + tailLoopOptMXSB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AdaptiveGemm: 0 + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1DivByMT1LowbitGT1: 0 + AssertFree1ElementMultiple: 1 + AssertKRingShiftTailWrapOnly: 0 + AssertSummationElementMultiple: 256 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BAddrInterleave: false + BaseName: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT12lYJO4P9-MC57Obi2kvcZAOwe_1ysdtWgX7-VIBNgQpw= + BufferLoad: true + BufferStore: 1 + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 256 + DirectToLds: 1 + DirectToLdsA: true + DirectToLdsB: true + DirectToLdsMXSA: true + DirectToLdsMXSB: true + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprMXSA: false + DirectToVgprMXSB: false + DirectToVgprSparseMetadata: false + DtlPlusLdsBuf: 0 
+ EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: false + ExpertSchedulingMode: 0 + ExtraLatencyForLR: 0 + ExtraMiLatencyLeft: -1 + ForceDisableShadowInit: false + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 32 + GlobalReadVectorWidthB: 32 + GlobalReadVectorWidthMXSA: 4 + GlobalReadVectorWidthMXSB: 3 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + KRingShift: false + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT128x96x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT2_6_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1 + LDSTrInst: false + LSCA: 256 + LSCB: 256 + LSCMXSA: 8 + LSCMXSB: 8 + LSPA: 32 + LSPB: 32 + LSPMXSA: 128 + LSPMXSB: 96 + LVCA: 8 + LVCB: 8 + LVCMXSA: 2 + LVCMXSB: 3 + LVPA: 1 + LVPB: 1 + LVPMXSA: 32 + LVPMXSB: 32 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsBlockSizePerPadMXSA: 0 + LdsBlockSizePerPadMXSB: 0 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 63232 + LdsInitCVgprs: false + LdsNumBytes: 63232 + LdsNumElementsAlignedA: 16384 + 
LdsNumElementsAlignedB: 12288 + LdsNumElementsAlignedMXSA: 1024 + LdsNumElementsAlignedMXSB: 768 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 32768 + LdsOffsetB: 18176 + LdsOffsetB_Blk: 50944 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMXSA: 16384 + LdsOffsetMXSA_Blk: 49152 + LdsOffsetMXSB: 17408 + LdsOffsetMXSB_Blk: 50176 + LdsOffsetMetadata: 18176 + LdsOffsetMetadata_Blk: 50944 + LdsPadA: 0 + LdsPadB: 0 + LdsPadMXSA: 0 + LdsPadMXSB: 0 + LdsPadMetadata: 0 + LocalReadVectorWidth: -1 + LocalReadVectorWidthA: 32 + LocalReadVectorWidthB: 32 + LocalReadVectorWidthMXSA: 1 + LocalReadVectorWidthMXSB: 1 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LocalWriteUseSgprMXSA: true + LocalWriteUseSgprMXSB: true + LoopIters: 2 + LoopUnroll: 256 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMXSA: 1 + MIInputPerThreadMXSB: 1 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [4, 1] + MIWaveTile: [2, 6] + MIWaveTileA: 2 + MIWaveTileB: 6 + MIWaveTileMXSA: 2 + MIWaveTileMXSB: 6 + MIWaveTileMetadata: 0 + MacroTile0: 128 + MacroTile1: 96 + MacroTileA: 128 + MacroTileB: 96 + MacroTileMXSA: 128 + MacroTileMXSB: 96 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + MinGRIncPerMfma: 3 + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonDTLTailLoopMXSA: true + NonDTLTailLoopMXSB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 4 + NonTemporalC: 0 + NonTemporalD: 4 + NonTemporalE: 0 + NonTemporalMXSA: 0 + 
NonTemporalMXSB: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 48 + NumGlobalWriteVectorsPerThread: 48 + NumLdsBlk: 2 + NumLoadsA: 4 + NumLoadsB: 3 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsCoalescedMXSA: 1 + NumLoadsCoalescedMXSB: 1 + NumLoadsMXSA: 1 + NumLoadsMXSB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 3 + NumLoadsPerpendicularMXSA: 1 + NumLoadsPerpendicularMXSB: 1 + NumThreads: 256 + NumTotalPackedLoadsA: -1 + NumTotalPackedLoadsB: -1 + NumTotalPackedLoadsMXSA: -1 + NumTotalPackedLoadsMXSB: -1 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGROverBarrier: 0 + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 35 + SolutionNameMin: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT128x96x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT2_6_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM16_WGMXCC2_WGMXCCGn1 + SourceSwap: false + SpaceFillingAlgo: [] + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 4 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SubGroupMXSA: 16 + SubGroupMXSB: 16 
+ SuppressNoLoadLoop: false + SwapGlobalReadOrder: 0 + TailloopInNll: false + ThreadTile: [1, 1] + ThreadTile0: 8 + ThreadTile1: 6 + ThreadTileA: 8 + ThreadTileB: 6 + ThreadTileMXSA: 8 + ThreadTileMXSB: 6 + TransposeLDS: 1 + TransposeLDSMetadata: -1 + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMXSA: true + UnrollMajorLDSMXSB: true + Use64bShadowLimit: false + Use64bShadowLimitMX: false + UseCustomMainLoopSchedule: 0 + UseDirect32XEmulation: false + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: false + UseGeneralizedNLCOneA: false + UseGeneralizedNLCOneB: false + UseGeneralizedNLCOneMXSA: false + UseGeneralizedNLCOneMXSB: false + UseGeneralizedNLCOneMetadata: false + UseInstOffsetForGRO: 0 + UseMFMAF32XEmulation: false + UsePLRPack: 0 + UseSgprForGRO: -1 + UseSubtileImpl: true + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 1 + VectorWidthMXSA: 1 + VectorWidthMXSB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMXSA: 0 + WaveSeparateGlobalReadMXSB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [64, 4, 1] + WorkGroupMapping: 16 + WorkGroupMappingXCC: 2 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 256 + _DepthUA: 256 + _DepthUB: 256 + _DepthUMXSA: 8 + _DepthUMXSB: 8 + _DepthUMetadata: 256 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + enableLDSTrMXSA: false + enableLDSTrMXSB: false + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + tailLoopOptMXSA: false + tailLoopOptMXSB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: false + 
ActivationFused: true + AdaptiveGemm: 0 + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1DivByMT1LowbitGT1: 0 + AssertFree1ElementMultiple: 1 + AssertKRingShiftTailWrapOnly: 0 + AssertSummationElementMultiple: 256 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BAddrInterleave: false + BaseName: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT25ccxCqQ2dvGclrJYn0QqExpGJSq56GiHUJPL0jzB0qVw= + BufferLoad: true + BufferStore: 1 + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 256 + DirectToLds: 1 + DirectToLdsA: true + DirectToLdsB: true + DirectToLdsMXSA: true + DirectToLdsMXSB: true + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprMXSA: false + DirectToVgprMXSB: false + DirectToVgprSparseMetadata: false + DtlPlusLdsBuf: 0 + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: false + ExpertSchedulingMode: 0 + ExtraLatencyForLR: 0 + ExtraMiLatencyLeft: -1 + ForceDisableShadowInit: false + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 32 + GlobalReadVectorWidthB: 32 + GlobalReadVectorWidthMXSA: 4 + GlobalReadVectorWidthMXSB: 1 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + KRingShift: false + Kernel: true + KernelLanguage: Assembly + KernelNameMin: 
Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT256x32x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT4_2_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1 + LDSTrInst: false + LSCA: 256 + LSCB: 256 + LSCMXSA: 8 + LSCMXSB: 8 + LSPA: 32 + LSPB: 32 + LSPMXSA: 128 + LSPMXSB: 32 + LVCA: 8 + LVCB: 8 + LVCMXSA: 2 + LVCMXSB: 8 + LVPA: 1 + LVPB: 1 + LVPMXSA: 32 + LVPMXSB: 32 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsBlockSizePerPadMXSA: 0 + LdsBlockSizePerPadMXSB: 0 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 104704 + LdsInitCVgprs: false + LdsNumBytes: 104704 + LdsNumElementsAlignedA: 32768 + LdsNumElementsAlignedB: 4096 + LdsNumElementsAlignedMXSA: 2048 + LdsNumElementsAlignedMXSB: 256 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 35072 + LdsOffsetB_Blk: 100608 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMXSA: 32768 + LdsOffsetMXSA_Blk: 98304 + LdsOffsetMXSB: 34816 + LdsOffsetMXSB_Blk: 100352 + LdsOffsetMetadata: 35072 + LdsOffsetMetadata_Blk: 100608 + LdsPadA: 0 + LdsPadB: 0 + LdsPadMXSA: 0 + LdsPadMXSB: 0 + LdsPadMetadata: 0 + LocalReadVectorWidth: -1 + LocalReadVectorWidthA: 32 + LocalReadVectorWidthB: 32 + LocalReadVectorWidthMXSA: 1 + LocalReadVectorWidthMXSB: 1 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LocalWriteUseSgprMXSA: true + LocalWriteUseSgprMXSB: true + LoopIters: 2 + LoopUnroll: 256 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + 
MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMXSA: 1 + MIInputPerThreadMXSB: 1 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [4, 1] + MIWaveTile: [4, 2] + MIWaveTileA: 4 + MIWaveTileB: 2 + MIWaveTileMXSA: 4 + MIWaveTileMXSB: 2 + MIWaveTileMetadata: 0 + MacroTile0: 256 + MacroTile1: 32 + MacroTileA: 256 + MacroTileB: 32 + MacroTileMXSA: 256 + MacroTileMXSB: 32 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + MinGRIncPerMfma: 3 + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonDTLTailLoopMXSA: true + NonDTLTailLoopMXSB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 4 + NonTemporalC: 0 + NonTemporalD: 4 + NonTemporalE: 0 + NonTemporalMXSA: 0 + NonTemporalMXSB: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 32 + NumLdsBlk: 2 + NumLoadsA: 8 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsCoalescedMXSA: 1 + NumLoadsCoalescedMXSB: 1 + NumLoadsMXSA: 2 + NumLoadsMXSB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularMXSA: 2 + NumLoadsPerpendicularMXSB: 1 + NumThreads: 256 + NumTotalPackedLoadsA: -1 + NumTotalPackedLoadsB: -1 + NumTotalPackedLoadsMXSA: -1 + NumTotalPackedLoadsMXSB: -1 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGROverBarrier: 0 + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 36 + SolutionNameMin: 
Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT256x32x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT4_2_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM16_WGMXCC2_WGMXCCGn1 + SourceSwap: false + SpaceFillingAlgo: [] + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 4 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SubGroupMXSA: 16 + SubGroupMXSB: 16 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: 0 + TailloopInNll: false + ThreadTile: [1, 1] + ThreadTile0: 16 + ThreadTile1: 2 + ThreadTileA: 16 + ThreadTileB: 2 + ThreadTileMXSA: 16 + ThreadTileMXSB: 2 + TransposeLDS: 1 + TransposeLDSMetadata: -1 + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMXSA: true + UnrollMajorLDSMXSB: true + Use64bShadowLimit: false + Use64bShadowLimitMX: false + UseCustomMainLoopSchedule: 0 + UseDirect32XEmulation: false + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: false + UseGeneralizedNLCOneA: false + UseGeneralizedNLCOneB: false + UseGeneralizedNLCOneMXSA: false + UseGeneralizedNLCOneMXSB: false + UseGeneralizedNLCOneMetadata: false + UseInstOffsetForGRO: 0 + UseMFMAF32XEmulation: false + UsePLRPack: 0 + UseSgprForGRO: -1 + UseSubtileImpl: true + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 1 
+ VectorWidthMXSA: 1 + VectorWidthMXSB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMXSA: 0 + WaveSeparateGlobalReadMXSB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [64, 4, 1] + WorkGroupMapping: 16 + WorkGroupMappingXCC: 2 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 256 + _DepthUA: 256 + _DepthUB: 256 + _DepthUMXSA: 8 + _DepthUMXSB: 8 + _DepthUMetadata: 256 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + enableLDSTrMXSA: false + enableLDSTrMXSB: false + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + tailLoopOptMXSA: false + tailLoopOptMXSB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AdaptiveGemm: 0 + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1DivByMT1LowbitGT1: 0 + AssertFree1ElementMultiple: 1 + AssertKRingShiftTailWrapOnly: 0 + AssertSummationElementMultiple: 256 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BAddrInterleave: false + BaseName: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT25LD47Rd1FCRkyTzzAeZkhMOPqncN77I3VtqjftUOo3W0= + BufferLoad: true + BufferStore: 1 + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 256 + DirectToLds: 1 + DirectToLdsA: true + DirectToLdsB: true + DirectToLdsMXSA: true + DirectToLdsMXSB: true + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprMXSA: false + DirectToVgprMXSB: false + DirectToVgprSparseMetadata: false + DtlPlusLdsBuf: 
0 + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: false + ExpertSchedulingMode: 0 + ExtraLatencyForLR: 0 + ExtraMiLatencyLeft: -1 + ForceDisableShadowInit: false + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 32 + GlobalReadVectorWidthB: 32 + GlobalReadVectorWidthMXSA: 4 + GlobalReadVectorWidthMXSB: 3 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + KRingShift: false + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT256x96x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT4_6_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1 + LDSTrInst: false + LSCA: 256 + LSCB: 256 + LSCMXSA: 8 + LSCMXSB: 8 + LSPA: 32 + LSPB: 32 + LSPMXSA: 128 + LSPMXSB: 96 + LVCA: 8 + LVCB: 8 + LVCMXSA: 2 + LVCMXSB: 3 + LVPA: 1 + LVPB: 1 + LVPMXSA: 32 + LVPMXSB: 32 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsBlockSizePerPadMXSA: 0 + LdsBlockSizePerPadMXSB: 0 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 113408 + LdsInitCVgprs: false + LdsNumBytes: 113408 + LdsNumElementsAlignedA: 32768 + 
LdsNumElementsAlignedB: 12288 + LdsNumElementsAlignedMXSA: 2048 + LdsNumElementsAlignedMXSB: 768 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 35584 + LdsOffsetB_Blk: 101120 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMXSA: 32768 + LdsOffsetMXSA_Blk: 98304 + LdsOffsetMXSB: 34816 + LdsOffsetMXSB_Blk: 100352 + LdsOffsetMetadata: 35584 + LdsOffsetMetadata_Blk: 101120 + LdsPadA: 0 + LdsPadB: 0 + LdsPadMXSA: 0 + LdsPadMXSB: 0 + LdsPadMetadata: 0 + LocalReadVectorWidth: -1 + LocalReadVectorWidthA: 32 + LocalReadVectorWidthB: 32 + LocalReadVectorWidthMXSA: 1 + LocalReadVectorWidthMXSB: 1 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LocalWriteUseSgprMXSA: true + LocalWriteUseSgprMXSB: true + LoopIters: 2 + LoopUnroll: 256 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMXSA: 1 + MIInputPerThreadMXSB: 1 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [4, 1] + MIWaveTile: [4, 6] + MIWaveTileA: 4 + MIWaveTileB: 6 + MIWaveTileMXSA: 4 + MIWaveTileMXSB: 6 + MIWaveTileMetadata: 0 + MacroTile0: 256 + MacroTile1: 96 + MacroTileA: 256 + MacroTileB: 96 + MacroTileMXSA: 256 + MacroTileMXSB: 96 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + MinGRIncPerMfma: 3 + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonDTLTailLoopMXSA: true + NonDTLTailLoopMXSB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 4 + NonTemporalC: 0 + NonTemporalD: 4 + NonTemporalE: 0 + NonTemporalMXSA: 0 + 
NonTemporalMXSB: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 96 + NumGlobalWriteVectorsPerThread: 96 + NumLdsBlk: 2 + NumLoadsA: 8 + NumLoadsB: 3 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsCoalescedMXSA: 1 + NumLoadsCoalescedMXSB: 1 + NumLoadsMXSA: 2 + NumLoadsMXSB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 3 + NumLoadsPerpendicularMXSA: 2 + NumLoadsPerpendicularMXSB: 1 + NumThreads: 256 + NumTotalPackedLoadsA: -1 + NumTotalPackedLoadsB: -1 + NumTotalPackedLoadsMXSA: -1 + NumTotalPackedLoadsMXSB: -1 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGROverBarrier: 0 + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 37 + SolutionNameMin: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT256x96x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT4_6_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM16_WGMXCC2_WGMXCCGn1 + SourceSwap: false + SpaceFillingAlgo: [] + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 4 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SubGroupMXSA: 16 + SubGroupMXSB: 16 
+ SuppressNoLoadLoop: false + SwapGlobalReadOrder: 0 + TailloopInNll: false + ThreadTile: [1, 1] + ThreadTile0: 16 + ThreadTile1: 6 + ThreadTileA: 16 + ThreadTileB: 6 + ThreadTileMXSA: 16 + ThreadTileMXSB: 6 + TransposeLDS: 1 + TransposeLDSMetadata: -1 + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMXSA: true + UnrollMajorLDSMXSB: true + Use64bShadowLimit: false + Use64bShadowLimitMX: false + UseCustomMainLoopSchedule: 0 + UseDirect32XEmulation: false + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: false + UseGeneralizedNLCOneA: false + UseGeneralizedNLCOneB: false + UseGeneralizedNLCOneMXSA: false + UseGeneralizedNLCOneMXSB: false + UseGeneralizedNLCOneMetadata: false + UseInstOffsetForGRO: 0 + UseMFMAF32XEmulation: false + UsePLRPack: 0 + UseSgprForGRO: -1 + UseSubtileImpl: true + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 1 + VectorWidthMXSA: 1 + VectorWidthMXSB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMXSA: 0 + WaveSeparateGlobalReadMXSB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [64, 4, 1] + WorkGroupMapping: 16 + WorkGroupMappingXCC: 2 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 256 + _DepthUA: 256 + _DepthUB: 256 + _DepthUMXSA: 8 + _DepthUMXSB: 8 + _DepthUMetadata: 256 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + enableLDSTrMXSA: false + enableLDSTrMXSB: false + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + tailLoopOptMXSA: false + tailLoopOptMXSB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: 
false + ActivationFused: true + AdaptiveGemm: 0 + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1DivByMT1LowbitGT1: 0 + AssertFree1ElementMultiple: 1 + AssertKRingShiftTailWrapOnly: 0 + AssertSummationElementMultiple: 256 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BAddrInterleave: false + BaseName: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT252kruXsNK48kysY3byfZ8nkfbVQgYyySEfltsnGOOr7I= + BufferLoad: true + BufferStore: 1 + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 256 + DirectToLds: 1 + DirectToLdsA: true + DirectToLdsB: true + DirectToLdsMXSA: true + DirectToLdsMXSB: true + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprMXSA: false + DirectToVgprMXSB: false + DirectToVgprSparseMetadata: false + DtlPlusLdsBuf: 0 + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: false + ExpertSchedulingMode: 0 + ExtraLatencyForLR: 0 + ExtraMiLatencyLeft: -1 + ForceDisableShadowInit: false + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 32 + GlobalReadVectorWidthB: 32 + GlobalReadVectorWidthMXSA: 4 + GlobalReadVectorWidthMXSB: 5 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + KRingShift: false + Kernel: true + KernelLanguage: Assembly + KernelNameMin: 
Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT256x160x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT4_10_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1 + LDSTrInst: false + LSCA: 256 + LSCB: 256 + LSCMXSA: 8 + LSCMXSB: 8 + LSPA: 32 + LSPB: 32 + LSPMXSA: 128 + LSPMXSB: 160 + LVCA: 8 + LVCB: 8 + LVCMXSA: 2 + LVCMXSB: 2 + LVPA: 1 + LVPB: 1 + LVPMXSA: 32 + LVPMXSB: 32 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsBlockSizePerPadMXSA: 0 + LdsBlockSizePerPadMXSB: 0 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 122112 + LdsInitCVgprs: false + LdsNumBytes: 122112 + LdsNumElementsAlignedA: 32768 + LdsNumElementsAlignedB: 20480 + LdsNumElementsAlignedMXSA: 2048 + LdsNumElementsAlignedMXSB: 1280 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 36096 + LdsOffsetB_Blk: 101632 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMXSA: 32768 + LdsOffsetMXSA_Blk: 98304 + LdsOffsetMXSB: 34816 + LdsOffsetMXSB_Blk: 100352 + LdsOffsetMetadata: 36096 + LdsOffsetMetadata_Blk: 101632 + LdsPadA: 0 + LdsPadB: 0 + LdsPadMXSA: 0 + LdsPadMXSB: 0 + LdsPadMetadata: 0 + LocalReadVectorWidth: -1 + LocalReadVectorWidthA: 32 + LocalReadVectorWidthB: 32 + LocalReadVectorWidthMXSA: 1 + LocalReadVectorWidthMXSB: 1 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LocalWriteUseSgprMXSA: true + LocalWriteUseSgprMXSB: true + LoopIters: 2 + LoopUnroll: 256 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + 
MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMXSA: 1 + MIInputPerThreadMXSB: 1 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [4, 1] + MIWaveTile: [4, 10] + MIWaveTileA: 4 + MIWaveTileB: 10 + MIWaveTileMXSA: 4 + MIWaveTileMXSB: 10 + MIWaveTileMetadata: 0 + MacroTile0: 256 + MacroTile1: 160 + MacroTileA: 256 + MacroTileB: 160 + MacroTileMXSA: 256 + MacroTileMXSB: 160 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + MinGRIncPerMfma: 3 + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonDTLTailLoopMXSA: true + NonDTLTailLoopMXSB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 4 + NonTemporalC: 0 + NonTemporalD: 4 + NonTemporalE: 0 + NonTemporalMXSA: 0 + NonTemporalMXSB: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 160 + NumGlobalWriteVectorsPerThread: 160 + NumLdsBlk: 2 + NumLoadsA: 8 + NumLoadsB: 5 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsCoalescedMXSA: 1 + NumLoadsCoalescedMXSB: 1 + NumLoadsMXSA: 2 + NumLoadsMXSB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 5 + NumLoadsPerpendicularMXSA: 2 + NumLoadsPerpendicularMXSB: 1 + NumThreads: 256 + NumTotalPackedLoadsA: -1 + NumTotalPackedLoadsB: -1 + NumTotalPackedLoadsMXSA: -1 + NumTotalPackedLoadsMXSB: -1 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGROverBarrier: 0 + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 38 + 
SolutionNameMin: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT256x160x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT4_10_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM16_WGMXCC2_WGMXCCGn1 + SourceSwap: false + SpaceFillingAlgo: [] + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 4 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SubGroupMXSA: 16 + SubGroupMXSB: 16 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: 0 + TailloopInNll: false + ThreadTile: [1, 1] + ThreadTile0: 16 + ThreadTile1: 10 + ThreadTileA: 16 + ThreadTileB: 10 + ThreadTileMXSA: 16 + ThreadTileMXSB: 10 + TransposeLDS: 1 + TransposeLDSMetadata: -1 + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMXSA: true + UnrollMajorLDSMXSB: true + Use64bShadowLimit: false + Use64bShadowLimitMX: false + UseCustomMainLoopSchedule: 0 + UseDirect32XEmulation: false + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: false + UseGeneralizedNLCOneA: false + UseGeneralizedNLCOneB: false + UseGeneralizedNLCOneMXSA: false + UseGeneralizedNLCOneMXSB: false + UseGeneralizedNLCOneMetadata: false + UseInstOffsetForGRO: 0 + UseMFMAF32XEmulation: false + UsePLRPack: 0 + UseSgprForGRO: -1 + UseSubtileImpl: true + Valid: true + VectorStore: -1 + 
VectorWidthA: 1 + VectorWidthB: 1 + VectorWidthMXSA: 1 + VectorWidthMXSB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMXSA: 0 + WaveSeparateGlobalReadMXSB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [64, 4, 1] + WorkGroupMapping: 16 + WorkGroupMappingXCC: 2 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 256 + _DepthUA: 256 + _DepthUB: 256 + _DepthUMXSA: 8 + _DepthUMXSB: 8 + _DepthUMetadata: 256 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + enableLDSTrMXSA: false + enableLDSTrMXSB: false + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + tailLoopOptMXSA: false + tailLoopOptMXSB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AdaptiveGemm: 0 + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1DivByMT1LowbitGT1: 0 + AssertFree1ElementMultiple: 1 + AssertKRingShiftTailWrapOnly: 0 + AssertSummationElementMultiple: 256 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BAddrInterleave: false + BaseName: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT25K1mXD-Z64cKGl1n64cEhRIvEdarIIYY6qzM6iXFNutU= + BufferLoad: true + BufferStore: 1 + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 256 + DirectToLds: 1 + DirectToLdsA: true + DirectToLdsB: true + DirectToLdsMXSA: true + DirectToLdsMXSB: true + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprMXSA: false + DirectToVgprMXSB: false + 
DirectToVgprSparseMetadata: false + DtlPlusLdsBuf: 0 + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: false + ExpertSchedulingMode: 0 + ExtraLatencyForLR: 0 + ExtraMiLatencyLeft: -1 + ForceDisableShadowInit: false + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 32 + GlobalReadVectorWidthB: 32 + GlobalReadVectorWidthMXSA: 4 + GlobalReadVectorWidthMXSB: 9 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + KRingShift: false + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT256x288x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT4_18_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1 + LDSTrInst: false + LSCA: 256 + LSCB: 256 + LSCMXSA: 8 + LSCMXSB: 8 + LSPA: 32 + LSPB: 32 + LSPMXSA: 128 + LSPMXSB: 288 + LVCA: 8 + LVCB: 8 + LVCMXSA: 2 + LVCMXSB: 1 + LVPA: 1 + LVPB: 1 + LVPMXSA: 32 + LVPMXSB: 32 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsBlockSizePerPadMXSA: 0 + LdsBlockSizePerPadMXSB: 0 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 147968 + LdsInitCVgprs: false + 
LdsNumBytes: 147968 + LdsNumElementsAlignedA: 32768 + LdsNumElementsAlignedB: 36864 + LdsNumElementsAlignedMXSA: 2048 + LdsNumElementsAlignedMXSB: 2304 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 73984 + LdsOffsetB: 37120 + LdsOffsetB_Blk: 111104 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMXSA: 32768 + LdsOffsetMXSA_Blk: 106752 + LdsOffsetMXSB: 34816 + LdsOffsetMXSB_Blk: 108800 + LdsOffsetMetadata: 37120 + LdsOffsetMetadata_Blk: 111104 + LdsPadA: 0 + LdsPadB: 0 + LdsPadMXSA: 0 + LdsPadMXSB: 0 + LdsPadMetadata: 0 + LocalReadVectorWidth: -1 + LocalReadVectorWidthA: 32 + LocalReadVectorWidthB: 32 + LocalReadVectorWidthMXSA: 1 + LocalReadVectorWidthMXSB: 1 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LocalWriteUseSgprMXSA: true + LocalWriteUseSgprMXSB: true + LoopIters: 2 + LoopUnroll: 256 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMXSA: 1 + MIInputPerThreadMXSB: 1 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [4, 1] + MIWaveTile: [4, 18] + MIWaveTileA: 4 + MIWaveTileB: 18 + MIWaveTileMXSA: 4 + MIWaveTileMXSB: 18 + MIWaveTileMetadata: 0 + MacroTile0: 256 + MacroTile1: 288 + MacroTileA: 256 + MacroTileB: 288 + MacroTileMXSA: 256 + MacroTileMXSB: 288 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + MinGRIncPerMfma: 3 + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonDTLTailLoopMXSA: true + NonDTLTailLoopMXSB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 4 + NonTemporalC: 0 + 
NonTemporalD: 4 + NonTemporalE: 0 + NonTemporalMXSA: 0 + NonTemporalMXSB: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 288 + NumGlobalWriteVectorsPerThread: 288 + NumLdsBlk: 2 + NumLoadsA: 8 + NumLoadsB: 9 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsCoalescedMXSA: 1 + NumLoadsCoalescedMXSB: 1 + NumLoadsMXSA: 2 + NumLoadsMXSB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 9 + NumLoadsPerpendicularMXSA: 2 + NumLoadsPerpendicularMXSB: 1 + NumThreads: 256 + NumTotalPackedLoadsA: -1 + NumTotalPackedLoadsB: -1 + NumTotalPackedLoadsMXSA: -1 + NumTotalPackedLoadsMXSB: -1 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGROverBarrier: 0 + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 39 + SolutionNameMin: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT256x288x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT4_18_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM16_WGMXCC2_WGMXCCGn1 + SourceSwap: false + SpaceFillingAlgo: [] + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: true + StoreSyncOpt: 0 + StoreVectorWidth: 4 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 16 + SubGroup1: 16 + 
SubGroupA: 16 + SubGroupB: 16 + SubGroupMXSA: 16 + SubGroupMXSB: 16 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: 0 + TailloopInNll: false + ThreadTile: [1, 1] + ThreadTile0: 16 + ThreadTile1: 18 + ThreadTileA: 16 + ThreadTileB: 18 + ThreadTileMXSA: 16 + ThreadTileMXSB: 18 + TransposeLDS: 1 + TransposeLDSMetadata: -1 + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMXSA: true + UnrollMajorLDSMXSB: true + Use64bShadowLimit: false + Use64bShadowLimitMX: false + UseCustomMainLoopSchedule: 0 + UseDirect32XEmulation: false + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: false + UseGeneralizedNLCOneA: false + UseGeneralizedNLCOneB: false + UseGeneralizedNLCOneMXSA: false + UseGeneralizedNLCOneMXSB: false + UseGeneralizedNLCOneMetadata: false + UseInstOffsetForGRO: 0 + UseMFMAF32XEmulation: false + UsePLRPack: 0 + UseSgprForGRO: -1 + UseSubtileImpl: true + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 1 + VectorWidthMXSA: 1 + VectorWidthMXSB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMXSA: 0 + WaveSeparateGlobalReadMXSB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [64, 4, 1] + WorkGroupMapping: 16 + WorkGroupMappingXCC: 2 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 256 + _DepthUA: 256 + _DepthUB: 256 + _DepthUMXSA: 8 + _DepthUMXSB: 8 + _DepthUMetadata: 256 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + enableLDSTrMXSA: false + enableLDSTrMXSB: false + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + tailLoopOptMXSA: false + tailLoopOptMXSB: 
false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AdaptiveGemm: 0 + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1DivByMT1LowbitGT1: 0 + AssertFree1ElementMultiple: 1 + AssertKRingShiftTailWrapOnly: 0 + AssertSummationElementMultiple: 256 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BAddrInterleave: false + BaseName: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT382wXw8HnHr0540PBKyCMkiG1GnHC8Qq6QTA2UY90u0Us= + BufferLoad: true + BufferStore: 1 + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 256 + DirectToLds: 1 + DirectToLdsA: true + DirectToLdsB: true + DirectToLdsMXSA: true + DirectToLdsMXSB: true + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprMXSA: false + DirectToVgprMXSB: false + DirectToVgprSparseMetadata: false + DtlPlusLdsBuf: 0 + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: false + ExpertSchedulingMode: 0 + ExtraLatencyForLR: 0 + ExtraMiLatencyLeft: -1 + ForceDisableShadowInit: false + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 32 + GlobalReadVectorWidthB: 32 + GlobalReadVectorWidthMXSA: 12 + GlobalReadVectorWidthMXSB: 1 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + KRingShift: false + Kernel: true + KernelLanguage: Assembly + KernelNameMin: 
Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT384x32x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT6_2_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1 + LDSTrInst: false + LSCA: 256 + LSCB: 256 + LSCMXSA: 8 + LSCMXSB: 8 + LSPA: 32 + LSPB: 32 + LSPMXSA: 384 + LSPMXSB: 32 + LVCA: 8 + LVCB: 8 + LVCMXSA: 1 + LVCMXSB: 8 + LVPA: 1 + LVPB: 1 + LVPMXSA: 32 + LVPMXSB: 32 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsBlockSizePerPadMXSA: 0 + LdsBlockSizePerPadMXSB: 0 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 122112 + LdsInitCVgprs: false + LdsNumBytes: 122112 + LdsNumElementsAlignedA: 49152 + LdsNumElementsAlignedB: 4096 + LdsNumElementsAlignedMXSA: 3072 + LdsNumElementsAlignedMXSB: 256 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 52480 + LdsOffsetB_Blk: 118016 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMXSA: 49152 + LdsOffsetMXSA_Blk: 114688 + LdsOffsetMXSB: 52224 + LdsOffsetMXSB_Blk: 117760 + LdsOffsetMetadata: 52480 + LdsOffsetMetadata_Blk: 118016 + LdsPadA: 0 + LdsPadB: 0 + LdsPadMXSA: 0 + LdsPadMXSB: 0 + LdsPadMetadata: 0 + LocalReadVectorWidth: -1 + LocalReadVectorWidthA: 32 + LocalReadVectorWidthB: 32 + LocalReadVectorWidthMXSA: 1 + LocalReadVectorWidthMXSB: 1 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LocalWriteUseSgprMXSA: true + LocalWriteUseSgprMXSB: true + LoopIters: 2 + LoopUnroll: 256 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + 
MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMXSA: 1 + MIInputPerThreadMXSB: 1 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [4, 1] + MIWaveTile: [6, 2] + MIWaveTileA: 6 + MIWaveTileB: 2 + MIWaveTileMXSA: 6 + MIWaveTileMXSB: 2 + MIWaveTileMetadata: 0 + MacroTile0: 384 + MacroTile1: 32 + MacroTileA: 384 + MacroTileB: 32 + MacroTileMXSA: 384 + MacroTileMXSB: 32 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + MinGRIncPerMfma: 3 + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonDTLTailLoopMXSA: true + NonDTLTailLoopMXSB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 4 + NonTemporalC: 0 + NonTemporalD: 4 + NonTemporalE: 0 + NonTemporalMXSA: 0 + NonTemporalMXSB: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 48 + NumGlobalWriteVectorsPerThread: 48 + NumLdsBlk: 2 + NumLoadsA: 12 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsCoalescedMXSA: 1 + NumLoadsCoalescedMXSB: 1 + NumLoadsMXSA: 1 + NumLoadsMXSB: 1 + NumLoadsPerpendicularA: 12 + NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularMXSA: 1 + NumLoadsPerpendicularMXSB: 1 + NumThreads: 256 + NumTotalPackedLoadsA: -1 + NumTotalPackedLoadsB: -1 + NumTotalPackedLoadsMXSA: -1 + NumTotalPackedLoadsMXSB: -1 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGROverBarrier: 0 + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 40 + SolutionNameMin: 
Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT384x32x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT6_2_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM16_WGMXCC2_WGMXCCGn1 + SourceSwap: false + SpaceFillingAlgo: [] + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 4 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SubGroupMXSA: 16 + SubGroupMXSB: 16 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: 0 + TailloopInNll: false + ThreadTile: [1, 1] + ThreadTile0: 24 + ThreadTile1: 2 + ThreadTileA: 24 + ThreadTileB: 2 + ThreadTileMXSA: 24 + ThreadTileMXSB: 2 + TransposeLDS: 1 + TransposeLDSMetadata: -1 + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMXSA: true + UnrollMajorLDSMXSB: true + Use64bShadowLimit: false + Use64bShadowLimitMX: false + UseCustomMainLoopSchedule: 0 + UseDirect32XEmulation: false + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: false + UseGeneralizedNLCOneA: false + UseGeneralizedNLCOneB: false + UseGeneralizedNLCOneMXSA: false + UseGeneralizedNLCOneMXSB: false + UseGeneralizedNLCOneMetadata: false + UseInstOffsetForGRO: 0 + UseMFMAF32XEmulation: false + UsePLRPack: 0 + UseSgprForGRO: -1 + UseSubtileImpl: true + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 1 
+ VectorWidthMXSA: 1 + VectorWidthMXSB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMXSA: 0 + WaveSeparateGlobalReadMXSB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [64, 4, 1] + WorkGroupMapping: 16 + WorkGroupMappingXCC: 2 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 256 + _DepthUA: 256 + _DepthUB: 256 + _DepthUMXSA: 8 + _DepthUMXSB: 8 + _DepthUMetadata: 256 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + enableLDSTrMXSA: false + enableLDSTrMXSB: false + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + tailLoopOptMXSA: false + tailLoopOptMXSB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AdaptiveGemm: 0 + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1DivByMT1LowbitGT1: 0 + AssertFree1ElementMultiple: 1 + AssertKRingShiftTailWrapOnly: 0 + AssertSummationElementMultiple: 256 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BAddrInterleave: false + BaseName: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT38iWtaR8v4gfASTnshP-7Zt37fJ-3bIKngSBlzEMd0qgE= + BufferLoad: true + BufferStore: 1 + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 256 + DirectToLds: 1 + DirectToLdsA: true + DirectToLdsB: true + DirectToLdsMXSA: true + DirectToLdsMXSB: true + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprMXSA: false + DirectToVgprMXSB: false + DirectToVgprSparseMetadata: false + DtlPlusLdsBuf: 
0 + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: false + ExpertSchedulingMode: 0 + ExtraLatencyForLR: 0 + ExtraMiLatencyLeft: -1 + ForceDisableShadowInit: false + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 32 + GlobalReadVectorWidthB: 32 + GlobalReadVectorWidthMXSA: 12 + GlobalReadVectorWidthMXSB: 3 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + KRingShift: false + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT384x96x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT6_6_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1 + LDSTrInst: false + LSCA: 256 + LSCB: 256 + LSCMXSA: 8 + LSCMXSB: 8 + LSPA: 32 + LSPB: 32 + LSPMXSA: 384 + LSPMXSB: 96 + LVCA: 8 + LVCB: 8 + LVCMXSA: 1 + LVCMXSB: 3 + LVPA: 1 + LVPB: 1 + LVPMXSA: 32 + LVPMXSB: 32 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsBlockSizePerPadMXSA: 0 + LdsBlockSizePerPadMXSB: 0 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 130816 + LdsInitCVgprs: false + LdsNumBytes: 130816 + LdsNumElementsAlignedA: 49152 
+ LdsNumElementsAlignedB: 12288 + LdsNumElementsAlignedMXSA: 3072 + LdsNumElementsAlignedMXSB: 768 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 52992 + LdsOffsetB_Blk: 118528 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMXSA: 49152 + LdsOffsetMXSA_Blk: 114688 + LdsOffsetMXSB: 52224 + LdsOffsetMXSB_Blk: 117760 + LdsOffsetMetadata: 52992 + LdsOffsetMetadata_Blk: 118528 + LdsPadA: 0 + LdsPadB: 0 + LdsPadMXSA: 0 + LdsPadMXSB: 0 + LdsPadMetadata: 0 + LocalReadVectorWidth: -1 + LocalReadVectorWidthA: 32 + LocalReadVectorWidthB: 32 + LocalReadVectorWidthMXSA: 1 + LocalReadVectorWidthMXSB: 1 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LocalWriteUseSgprMXSA: true + LocalWriteUseSgprMXSB: true + LoopIters: 2 + LoopUnroll: 256 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMXSA: 1 + MIInputPerThreadMXSB: 1 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [4, 1] + MIWaveTile: [6, 6] + MIWaveTileA: 6 + MIWaveTileB: 6 + MIWaveTileMXSA: 6 + MIWaveTileMXSB: 6 + MIWaveTileMetadata: 0 + MacroTile0: 384 + MacroTile1: 96 + MacroTileA: 384 + MacroTileB: 96 + MacroTileMXSA: 384 + MacroTileMXSB: 96 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + MinGRIncPerMfma: 3 + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonDTLTailLoopMXSA: true + NonDTLTailLoopMXSB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 4 + NonTemporalC: 0 + NonTemporalD: 4 + NonTemporalE: 0 + NonTemporalMXSA: 0 + 
NonTemporalMXSB: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 144 + NumGlobalWriteVectorsPerThread: 144 + NumLdsBlk: 2 + NumLoadsA: 12 + NumLoadsB: 3 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsCoalescedMXSA: 1 + NumLoadsCoalescedMXSB: 1 + NumLoadsMXSA: 1 + NumLoadsMXSB: 1 + NumLoadsPerpendicularA: 12 + NumLoadsPerpendicularB: 3 + NumLoadsPerpendicularMXSA: 1 + NumLoadsPerpendicularMXSB: 1 + NumThreads: 256 + NumTotalPackedLoadsA: -1 + NumTotalPackedLoadsB: -1 + NumTotalPackedLoadsMXSA: -1 + NumTotalPackedLoadsMXSB: -1 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGROverBarrier: 0 + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 41 + SolutionNameMin: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT384x96x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT6_6_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM16_WGMXCC2_WGMXCCGn1 + SourceSwap: false + SpaceFillingAlgo: [] + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 4 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SubGroupMXSA: 16 + 
SubGroupMXSB: 16 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: 0 + TailloopInNll: false + ThreadTile: [1, 1] + ThreadTile0: 24 + ThreadTile1: 6 + ThreadTileA: 24 + ThreadTileB: 6 + ThreadTileMXSA: 24 + ThreadTileMXSB: 6 + TransposeLDS: 1 + TransposeLDSMetadata: -1 + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMXSA: true + UnrollMajorLDSMXSB: true + Use64bShadowLimit: false + Use64bShadowLimitMX: false + UseCustomMainLoopSchedule: 0 + UseDirect32XEmulation: false + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: false + UseGeneralizedNLCOneA: false + UseGeneralizedNLCOneB: false + UseGeneralizedNLCOneMXSA: false + UseGeneralizedNLCOneMXSB: false + UseGeneralizedNLCOneMetadata: false + UseInstOffsetForGRO: 0 + UseMFMAF32XEmulation: false + UsePLRPack: 0 + UseSgprForGRO: -1 + UseSubtileImpl: true + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 1 + VectorWidthMXSA: 1 + VectorWidthMXSB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMXSA: 0 + WaveSeparateGlobalReadMXSB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [64, 4, 1] + WorkGroupMapping: 16 + WorkGroupMappingXCC: 2 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 256 + _DepthUA: 256 + _DepthUB: 256 + _DepthUMXSA: 8 + _DepthUMXSB: 8 + _DepthUMetadata: 256 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + enableLDSTrMXSA: false + enableLDSTrMXSB: false + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + tailLoopOptMXSA: false + tailLoopOptMXSB: false + - 1LDSBuffer: 0 + ActivationAlt: false + 
ActivationFuncCall: false + ActivationFused: true + AdaptiveGemm: 0 + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1DivByMT1LowbitGT1: 0 + AssertFree1ElementMultiple: 1 + AssertKRingShiftTailWrapOnly: 0 + AssertSummationElementMultiple: 256 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BAddrInterleave: false + BaseName: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT51My4B9dUWEKuVinUMAqyTn-9nVKbOgw4LZNuajpvbgno= + BufferLoad: true + BufferStore: 1 + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 256 + DirectToLds: 1 + DirectToLdsA: true + DirectToLdsB: true + DirectToLdsMXSA: true + DirectToLdsMXSB: true + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprMXSA: false + DirectToVgprMXSB: false + DirectToVgprSparseMetadata: false + DtlPlusLdsBuf: 0 + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: false + ExpertSchedulingMode: 0 + ExtraLatencyForLR: 0 + ExtraMiLatencyLeft: -1 + ForceDisableShadowInit: false + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 32 + GlobalReadVectorWidthB: 32 + GlobalReadVectorWidthMXSA: 16 + GlobalReadVectorWidthMXSB: 1 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + KRingShift: false + Kernel: true + KernelLanguage: Assembly + KernelNameMin: 
Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT512x32x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT8_2_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1 + LDSTrInst: false + LSCA: 256 + LSCB: 256 + LSCMXSA: 8 + LSCMXSB: 8 + LSPA: 32 + LSPB: 32 + LSPMXSA: 512 + LSPMXSB: 32 + LVCA: 8 + LVCB: 8 + LVCMXSA: 1 + LVCMXSB: 8 + LVPA: 1 + LVPB: 1 + LVPMXSA: 32 + LVPMXSB: 32 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsBlockSizePerPadMXSA: 0 + LdsBlockSizePerPadMXSB: 0 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 147968 + LdsInitCVgprs: false + LdsNumBytes: 147968 + LdsNumElementsAlignedA: 65536 + LdsNumElementsAlignedB: 4096 + LdsNumElementsAlignedMXSA: 4096 + LdsNumElementsAlignedMXSB: 256 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 73984 + LdsOffsetB: 69888 + LdsOffsetB_Blk: 143872 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMXSA: 65536 + LdsOffsetMXSA_Blk: 139520 + LdsOffsetMXSB: 69632 + LdsOffsetMXSB_Blk: 143616 + LdsOffsetMetadata: 69888 + LdsOffsetMetadata_Blk: 143872 + LdsPadA: 0 + LdsPadB: 0 + LdsPadMXSA: 0 + LdsPadMXSB: 0 + LdsPadMetadata: 0 + LocalReadVectorWidth: -1 + LocalReadVectorWidthA: 32 + LocalReadVectorWidthB: 32 + LocalReadVectorWidthMXSA: 1 + LocalReadVectorWidthMXSB: 1 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LocalWriteUseSgprMXSA: true + LocalWriteUseSgprMXSB: true + LoopIters: 2 + LoopUnroll: 256 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + 
MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMXSA: 1 + MIInputPerThreadMXSB: 1 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [4, 1] + MIWaveTile: [8, 2] + MIWaveTileA: 8 + MIWaveTileB: 2 + MIWaveTileMXSA: 8 + MIWaveTileMXSB: 2 + MIWaveTileMetadata: 0 + MacroTile0: 512 + MacroTile1: 32 + MacroTileA: 512 + MacroTileB: 32 + MacroTileMXSA: 512 + MacroTileMXSB: 32 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + MinGRIncPerMfma: 3 + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonDTLTailLoopMXSA: true + NonDTLTailLoopMXSB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 4 + NonTemporalC: 0 + NonTemporalD: 4 + NonTemporalE: 0 + NonTemporalMXSA: 0 + NonTemporalMXSB: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 64 + NumLdsBlk: 2 + NumLoadsA: 16 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsCoalescedMXSA: 1 + NumLoadsCoalescedMXSB: 1 + NumLoadsMXSA: 1 + NumLoadsMXSB: 1 + NumLoadsPerpendicularA: 16 + NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularMXSA: 1 + NumLoadsPerpendicularMXSB: 1 + NumThreads: 256 + NumTotalPackedLoadsA: -1 + NumTotalPackedLoadsB: -1 + NumTotalPackedLoadsMXSA: -1 + NumTotalPackedLoadsMXSB: -1 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGROverBarrier: 0 + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 42 + SolutionNameMin: 
Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT512x32x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT8_2_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM16_WGMXCC2_WGMXCCGn1 + SourceSwap: false + SpaceFillingAlgo: [] + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: true + StoreSyncOpt: 0 + StoreVectorWidth: 4 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SubGroupMXSA: 16 + SubGroupMXSB: 16 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: 0 + TailloopInNll: false + ThreadTile: [1, 1] + ThreadTile0: 32 + ThreadTile1: 2 + ThreadTileA: 32 + ThreadTileB: 2 + ThreadTileMXSA: 32 + ThreadTileMXSB: 2 + TransposeLDS: 1 + TransposeLDSMetadata: -1 + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMXSA: true + UnrollMajorLDSMXSB: true + Use64bShadowLimit: false + Use64bShadowLimitMX: false + UseCustomMainLoopSchedule: 0 + UseDirect32XEmulation: false + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: false + UseGeneralizedNLCOneA: false + UseGeneralizedNLCOneB: false + UseGeneralizedNLCOneMXSA: false + UseGeneralizedNLCOneMXSB: false + UseGeneralizedNLCOneMetadata: false + UseInstOffsetForGRO: 0 + UseMFMAF32XEmulation: false + UsePLRPack: 0 + UseSgprForGRO: -1 + UseSubtileImpl: true + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 1 
+ VectorWidthMXSA: 1 + VectorWidthMXSB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMXSA: 0 + WaveSeparateGlobalReadMXSB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [64, 4, 1] + WorkGroupMapping: 16 + WorkGroupMappingXCC: 2 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 256 + _DepthUA: 256 + _DepthUB: 256 + _DepthUMXSA: 8 + _DepthUMXSB: 8 + _DepthUMetadata: 256 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + enableLDSTrMXSA: false + enableLDSTrMXSB: false + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + tailLoopOptMXSA: false + tailLoopOptMXSB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AdaptiveGemm: 0 + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1DivByMT1LowbitGT1: 0 + AssertFree1ElementMultiple: 1 + AssertKRingShiftTailWrapOnly: 0 + AssertSummationElementMultiple: 256 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BAddrInterleave: false + BaseName: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT32dv3nH_ug3zWfzXFJEJHJUvkpjXQjAQVml90QMZvwYvs= + BufferLoad: true + BufferStore: 1 + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 256 + DirectToLds: 1 + DirectToLdsA: true + DirectToLdsB: true + DirectToLdsMXSA: true + DirectToLdsMXSB: true + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprMXSA: false + DirectToVgprMXSB: false + DirectToVgprSparseMetadata: false + DtlPlusLdsBuf: 
0 + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: false + ExpertSchedulingMode: 0 + ExtraLatencyForLR: 0 + ExtraMiLatencyLeft: -1 + ForceDisableShadowInit: false + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 32 + GlobalReadVectorWidthB: 32 + GlobalReadVectorWidthMXSA: 1 + GlobalReadVectorWidthMXSB: 4 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + KRingShift: false + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT32x128x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT2_2_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_16_1 + LDSTrInst: false + LSCA: 256 + LSCB: 256 + LSCMXSA: 8 + LSCMXSB: 8 + LSPA: 32 + LSPB: 32 + LSPMXSA: 32 + LSPMXSB: 128 + LVCA: 8 + LVCB: 8 + LVCMXSA: 8 + LVCMXSB: 2 + LVPA: 1 + LVPB: 1 + LVPMXSA: 32 + LVPMXSB: 32 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsBlockSizePerPadMXSA: 0 + LdsBlockSizePerPadMXSB: 0 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 54528 + LdsInitCVgprs: false + LdsNumBytes: 54528 + LdsNumElementsAlignedA: 4096 + 
LdsNumElementsAlignedB: 16384 + LdsNumElementsAlignedMXSA: 256 + LdsNumElementsAlignedMXSB: 1024 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 32768 + LdsOffsetB: 5376 + LdsOffsetB_Blk: 38144 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMXSA: 4096 + LdsOffsetMXSA_Blk: 36864 + LdsOffsetMXSB: 4352 + LdsOffsetMXSB_Blk: 37120 + LdsOffsetMetadata: 5376 + LdsOffsetMetadata_Blk: 38144 + LdsPadA: 0 + LdsPadB: 0 + LdsPadMXSA: 0 + LdsPadMXSB: 0 + LdsPadMetadata: 0 + LocalReadVectorWidth: -1 + LocalReadVectorWidthA: 32 + LocalReadVectorWidthB: 32 + LocalReadVectorWidthMXSA: 1 + LocalReadVectorWidthMXSB: 1 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LocalWriteUseSgprMXSA: true + LocalWriteUseSgprMXSB: true + LoopIters: 2 + LoopUnroll: 256 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMXSA: 1 + MIInputPerThreadMXSB: 1 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [1, 4] + MIWaveTile: [2, 2] + MIWaveTileA: 2 + MIWaveTileB: 2 + MIWaveTileMXSA: 2 + MIWaveTileMXSB: 2 + MIWaveTileMetadata: 0 + MacroTile0: 32 + MacroTile1: 128 + MacroTileA: 32 + MacroTileB: 128 + MacroTileMXSA: 32 + MacroTileMXSB: 128 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + MinGRIncPerMfma: 3 + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonDTLTailLoopMXSA: true + NonDTLTailLoopMXSB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 4 + NonTemporalC: 0 + NonTemporalD: 4 + NonTemporalE: 0 + NonTemporalMXSA: 0 + 
NonTemporalMXSB: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 16 + NumLdsBlk: 2 + NumLoadsA: 1 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsCoalescedMXSA: 1 + NumLoadsCoalescedMXSB: 1 + NumLoadsMXSA: 1 + NumLoadsMXSB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 4 + NumLoadsPerpendicularMXSA: 1 + NumLoadsPerpendicularMXSB: 1 + NumThreads: 256 + NumTotalPackedLoadsA: -1 + NumTotalPackedLoadsB: -1 + NumTotalPackedLoadsMXSA: -1 + NumTotalPackedLoadsMXSB: -1 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGROverBarrier: 0 + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 43 + SolutionNameMin: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT32x128x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT2_2_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM16_WGMXCC2_WGMXCCGn1 + SourceSwap: false + SpaceFillingAlgo: [] + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 4 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 4 + SubGroup1: 64 + SubGroupA: 4 + SubGroupB: 64 + SubGroupMXSA: 4 + SubGroupMXSB: 64 + 
SuppressNoLoadLoop: false + SwapGlobalReadOrder: 0 + TailloopInNll: false + ThreadTile: [1, 1] + ThreadTile0: 8 + ThreadTile1: 2 + ThreadTileA: 8 + ThreadTileB: 2 + ThreadTileMXSA: 8 + ThreadTileMXSB: 2 + TransposeLDS: 1 + TransposeLDSMetadata: -1 + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMXSA: true + UnrollMajorLDSMXSB: true + Use64bShadowLimit: false + Use64bShadowLimitMX: false + UseCustomMainLoopSchedule: 0 + UseDirect32XEmulation: false + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: false + UseGeneralizedNLCOneA: false + UseGeneralizedNLCOneB: false + UseGeneralizedNLCOneMXSA: false + UseGeneralizedNLCOneMXSB: false + UseGeneralizedNLCOneMetadata: false + UseInstOffsetForGRO: 0 + UseMFMAF32XEmulation: false + UsePLRPack: 0 + UseSgprForGRO: -1 + UseSubtileImpl: true + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 1 + VectorWidthMXSA: 1 + VectorWidthMXSB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMXSA: 0 + WaveSeparateGlobalReadMXSB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 16 + WorkGroupMappingXCC: 2 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 256 + _DepthUA: 256 + _DepthUB: 256 + _DepthUMXSA: 8 + _DepthUMXSB: 8 + _DepthUMetadata: 256 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + enableLDSTrMXSA: false + enableLDSTrMXSB: false + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + tailLoopOptMXSA: false + tailLoopOptMXSB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: false + 
ActivationFused: true + AdaptiveGemm: 0 + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1DivByMT1LowbitGT1: 0 + AssertFree1ElementMultiple: 1 + AssertKRingShiftTailWrapOnly: 0 + AssertSummationElementMultiple: 256 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BAddrInterleave: false + BaseName: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT96VdJ139l5ExTu5vrbSyr9YIDnqGsFY6rZMfGBgYGwyok= + BufferLoad: true + BufferStore: 1 + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 256 + DirectToLds: 1 + DirectToLdsA: true + DirectToLdsB: true + DirectToLdsMXSA: true + DirectToLdsMXSB: true + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprMXSA: false + DirectToVgprMXSB: false + DirectToVgprSparseMetadata: false + DtlPlusLdsBuf: 0 + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: false + ExpertSchedulingMode: 0 + ExtraLatencyForLR: 0 + ExtraMiLatencyLeft: -1 + ForceDisableShadowInit: false + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 32 + GlobalReadVectorWidthB: 32 + GlobalReadVectorWidthMXSA: 3 + GlobalReadVectorWidthMXSB: 4 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + KRingShift: false + Kernel: true + KernelLanguage: Assembly + KernelNameMin: 
Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT96x128x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT6_2_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_16_1 + LDSTrInst: false + LSCA: 256 + LSCB: 256 + LSCMXSA: 8 + LSCMXSB: 8 + LSPA: 32 + LSPB: 32 + LSPMXSA: 96 + LSPMXSB: 128 + LVCA: 8 + LVCB: 8 + LVCMXSA: 3 + LVCMXSB: 2 + LVPA: 1 + LVPB: 1 + LVPMXSA: 32 + LVPMXSB: 32 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsBlockSizePerPadMXSA: 0 + LdsBlockSizePerPadMXSB: 0 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 63232 + LdsInitCVgprs: false + LdsNumBytes: 63232 + LdsNumElementsAlignedA: 12288 + LdsNumElementsAlignedB: 16384 + LdsNumElementsAlignedMXSA: 768 + LdsNumElementsAlignedMXSB: 1024 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 32768 + LdsOffsetB: 14080 + LdsOffsetB_Blk: 46848 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMXSA: 12288 + LdsOffsetMXSA_Blk: 45056 + LdsOffsetMXSB: 13056 + LdsOffsetMXSB_Blk: 45824 + LdsOffsetMetadata: 14080 + LdsOffsetMetadata_Blk: 46848 + LdsPadA: 0 + LdsPadB: 0 + LdsPadMXSA: 0 + LdsPadMXSB: 0 + LdsPadMetadata: 0 + LocalReadVectorWidth: -1 + LocalReadVectorWidthA: 32 + LocalReadVectorWidthB: 32 + LocalReadVectorWidthMXSA: 1 + LocalReadVectorWidthMXSB: 1 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LocalWriteUseSgprMXSA: true + LocalWriteUseSgprMXSB: true + LoopIters: 2 + LoopUnroll: 256 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + 
MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMXSA: 1 + MIInputPerThreadMXSB: 1 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [1, 4] + MIWaveTile: [6, 2] + MIWaveTileA: 6 + MIWaveTileB: 2 + MIWaveTileMXSA: 6 + MIWaveTileMXSB: 2 + MIWaveTileMetadata: 0 + MacroTile0: 96 + MacroTile1: 128 + MacroTileA: 96 + MacroTileB: 128 + MacroTileMXSA: 96 + MacroTileMXSB: 128 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + MinGRIncPerMfma: 3 + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonDTLTailLoopMXSA: true + NonDTLTailLoopMXSB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 4 + NonTemporalC: 0 + NonTemporalD: 4 + NonTemporalE: 0 + NonTemporalMXSA: 0 + NonTemporalMXSB: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 48 + NumGlobalWriteVectorsPerThread: 48 + NumLdsBlk: 2 + NumLoadsA: 3 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsCoalescedMXSA: 1 + NumLoadsCoalescedMXSB: 1 + NumLoadsMXSA: 1 + NumLoadsMXSB: 1 + NumLoadsPerpendicularA: 3 + NumLoadsPerpendicularB: 4 + NumLoadsPerpendicularMXSA: 1 + NumLoadsPerpendicularMXSB: 1 + NumThreads: 256 + NumTotalPackedLoadsA: -1 + NumTotalPackedLoadsB: -1 + NumTotalPackedLoadsMXSA: -1 + NumTotalPackedLoadsMXSB: -1 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGROverBarrier: 0 + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 44 + SolutionNameMin: 
Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT96x128x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT6_2_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM16_WGMXCC2_WGMXCCGn1 + SourceSwap: false + SpaceFillingAlgo: [] + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 4 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 4 + SubGroup1: 64 + SubGroupA: 4 + SubGroupB: 64 + SubGroupMXSA: 4 + SubGroupMXSB: 64 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: 0 + TailloopInNll: false + ThreadTile: [1, 1] + ThreadTile0: 24 + ThreadTile1: 2 + ThreadTileA: 24 + ThreadTileB: 2 + ThreadTileMXSA: 24 + ThreadTileMXSB: 2 + TransposeLDS: 1 + TransposeLDSMetadata: -1 + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMXSA: true + UnrollMajorLDSMXSB: true + Use64bShadowLimit: false + Use64bShadowLimitMX: false + UseCustomMainLoopSchedule: 0 + UseDirect32XEmulation: false + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: false + UseGeneralizedNLCOneA: false + UseGeneralizedNLCOneB: false + UseGeneralizedNLCOneMXSA: false + UseGeneralizedNLCOneMXSB: false + UseGeneralizedNLCOneMetadata: false + UseInstOffsetForGRO: 0 + UseMFMAF32XEmulation: false + UsePLRPack: 0 + UseSgprForGRO: -1 + UseSubtileImpl: true + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 1 + 
VectorWidthMXSA: 1 + VectorWidthMXSB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMXSA: 0 + WaveSeparateGlobalReadMXSB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 16 + WorkGroupMappingXCC: 2 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 256 + _DepthUA: 256 + _DepthUB: 256 + _DepthUMXSA: 8 + _DepthUMXSB: 8 + _DepthUMetadata: 256 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + enableLDSTrMXSA: false + enableLDSTrMXSB: false + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + tailLoopOptMXSA: false + tailLoopOptMXSB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AdaptiveGemm: 0 + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1DivByMT1LowbitGT1: 0 + AssertFree1ElementMultiple: 1 + AssertKRingShiftTailWrapOnly: 0 + AssertSummationElementMultiple: 256 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BAddrInterleave: false + BaseName: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT32pDTSO9Ei8iprYUZVwZ8MVK8XTu8S8ugrlkQJj7C1FJQ= + BufferLoad: true + BufferStore: 1 + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 256 + DirectToLds: 1 + DirectToLdsA: true + DirectToLdsB: true + DirectToLdsMXSA: true + DirectToLdsMXSB: true + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprMXSA: false + DirectToVgprMXSB: false + DirectToVgprSparseMetadata: false + DtlPlusLdsBuf: 0 
+ EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: false + ExpertSchedulingMode: 0 + ExtraLatencyForLR: 0 + ExtraMiLatencyLeft: -1 + ForceDisableShadowInit: false + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 32 + GlobalReadVectorWidthB: 32 + GlobalReadVectorWidthMXSA: 1 + GlobalReadVectorWidthMXSB: 4 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + KRingShift: false + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT32x256x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT2_4_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_16_1 + LDSTrInst: false + LSCA: 256 + LSCB: 256 + LSCMXSA: 8 + LSCMXSB: 8 + LSPA: 32 + LSPB: 32 + LSPMXSA: 32 + LSPMXSB: 128 + LVCA: 8 + LVCB: 8 + LVCMXSA: 8 + LVCMXSB: 2 + LVPA: 1 + LVPB: 1 + LVPMXSA: 32 + LVPMXSB: 32 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsBlockSizePerPadMXSA: 0 + LdsBlockSizePerPadMXSB: 0 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 104704 + LdsInitCVgprs: false + LdsNumBytes: 104704 + LdsNumElementsAlignedA: 4096 + 
LdsNumElementsAlignedB: 32768 + LdsNumElementsAlignedMXSA: 256 + LdsNumElementsAlignedMXSB: 2048 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 6400 + LdsOffsetB_Blk: 71936 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMXSA: 4096 + LdsOffsetMXSA_Blk: 69632 + LdsOffsetMXSB: 4352 + LdsOffsetMXSB_Blk: 69888 + LdsOffsetMetadata: 6400 + LdsOffsetMetadata_Blk: 71936 + LdsPadA: 0 + LdsPadB: 0 + LdsPadMXSA: 0 + LdsPadMXSB: 0 + LdsPadMetadata: 0 + LocalReadVectorWidth: -1 + LocalReadVectorWidthA: 32 + LocalReadVectorWidthB: 32 + LocalReadVectorWidthMXSA: 1 + LocalReadVectorWidthMXSB: 1 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LocalWriteUseSgprMXSA: true + LocalWriteUseSgprMXSB: true + LoopIters: 2 + LoopUnroll: 256 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMXSA: 1 + MIInputPerThreadMXSB: 1 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [1, 4] + MIWaveTile: [2, 4] + MIWaveTileA: 2 + MIWaveTileB: 4 + MIWaveTileMXSA: 2 + MIWaveTileMXSB: 4 + MIWaveTileMetadata: 0 + MacroTile0: 32 + MacroTile1: 256 + MacroTileA: 32 + MacroTileB: 256 + MacroTileMXSA: 32 + MacroTileMXSB: 256 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + MinGRIncPerMfma: 3 + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonDTLTailLoopMXSA: true + NonDTLTailLoopMXSB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 4 + NonTemporalC: 0 + NonTemporalD: 4 + NonTemporalE: 0 + NonTemporalMXSA: 0 + 
NonTemporalMXSB: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 32 + NumLdsBlk: 2 + NumLoadsA: 1 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsCoalescedMXSA: 1 + NumLoadsCoalescedMXSB: 1 + NumLoadsMXSA: 1 + NumLoadsMXSB: 2 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 8 + NumLoadsPerpendicularMXSA: 1 + NumLoadsPerpendicularMXSB: 2 + NumThreads: 256 + NumTotalPackedLoadsA: -1 + NumTotalPackedLoadsB: -1 + NumTotalPackedLoadsMXSA: -1 + NumTotalPackedLoadsMXSB: -1 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGROverBarrier: 0 + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 45 + SolutionNameMin: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT32x256x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT2_4_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM16_WGMXCC2_WGMXCCGn1 + SourceSwap: false + SpaceFillingAlgo: [] + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 4 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 4 + SubGroup1: 64 + SubGroupA: 4 + SubGroupB: 64 + SubGroupMXSA: 4 + SubGroupMXSB: 64 + 
SuppressNoLoadLoop: false + SwapGlobalReadOrder: 0 + TailloopInNll: false + ThreadTile: [1, 1] + ThreadTile0: 8 + ThreadTile1: 4 + ThreadTileA: 8 + ThreadTileB: 4 + ThreadTileMXSA: 8 + ThreadTileMXSB: 4 + TransposeLDS: 1 + TransposeLDSMetadata: -1 + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMXSA: true + UnrollMajorLDSMXSB: true + Use64bShadowLimit: false + Use64bShadowLimitMX: false + UseCustomMainLoopSchedule: 0 + UseDirect32XEmulation: false + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: false + UseGeneralizedNLCOneA: false + UseGeneralizedNLCOneB: false + UseGeneralizedNLCOneMXSA: false + UseGeneralizedNLCOneMXSB: false + UseGeneralizedNLCOneMetadata: false + UseInstOffsetForGRO: 0 + UseMFMAF32XEmulation: false + UsePLRPack: 0 + UseSgprForGRO: -1 + UseSubtileImpl: true + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 1 + VectorWidthMXSA: 1 + VectorWidthMXSB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMXSA: 0 + WaveSeparateGlobalReadMXSB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 16 + WorkGroupMappingXCC: 2 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 256 + _DepthUA: 256 + _DepthUB: 256 + _DepthUMXSA: 8 + _DepthUMXSB: 8 + _DepthUMetadata: 256 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + enableLDSTrMXSA: false + enableLDSTrMXSB: false + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + tailLoopOptMXSA: false + tailLoopOptMXSB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: false + 
ActivationFused: true + AdaptiveGemm: 0 + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1DivByMT1LowbitGT1: 0 + AssertFree1ElementMultiple: 1 + AssertKRingShiftTailWrapOnly: 0 + AssertSummationElementMultiple: 256 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BAddrInterleave: false + BaseName: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT96Cj2Z80w1PkWDv8Q4NtOmErGoRYnlYoT1xjNw40Qn6y8= + BufferLoad: true + BufferStore: 1 + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 256 + DirectToLds: 1 + DirectToLdsA: true + DirectToLdsB: true + DirectToLdsMXSA: true + DirectToLdsMXSB: true + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprMXSA: false + DirectToVgprMXSB: false + DirectToVgprSparseMetadata: false + DtlPlusLdsBuf: 0 + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: false + ExpertSchedulingMode: 0 + ExtraLatencyForLR: 0 + ExtraMiLatencyLeft: -1 + ForceDisableShadowInit: false + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 32 + GlobalReadVectorWidthB: 32 + GlobalReadVectorWidthMXSA: 3 + GlobalReadVectorWidthMXSB: 4 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + KRingShift: false + Kernel: true + KernelLanguage: Assembly + KernelNameMin: 
Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT96x256x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT6_4_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_16_1 + LDSTrInst: false + LSCA: 256 + LSCB: 256 + LSCMXSA: 8 + LSCMXSB: 8 + LSPA: 32 + LSPB: 32 + LSPMXSA: 96 + LSPMXSB: 128 + LVCA: 8 + LVCB: 8 + LVCMXSA: 3 + LVCMXSB: 2 + LVPA: 1 + LVPB: 1 + LVPMXSA: 32 + LVPMXSB: 32 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsBlockSizePerPadMXSA: 0 + LdsBlockSizePerPadMXSB: 0 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 113408 + LdsInitCVgprs: false + LdsNumBytes: 113408 + LdsNumElementsAlignedA: 12288 + LdsNumElementsAlignedB: 32768 + LdsNumElementsAlignedMXSA: 768 + LdsNumElementsAlignedMXSB: 2048 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 15104 + LdsOffsetB_Blk: 80640 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMXSA: 12288 + LdsOffsetMXSA_Blk: 77824 + LdsOffsetMXSB: 13056 + LdsOffsetMXSB_Blk: 78592 + LdsOffsetMetadata: 15104 + LdsOffsetMetadata_Blk: 80640 + LdsPadA: 0 + LdsPadB: 0 + LdsPadMXSA: 0 + LdsPadMXSB: 0 + LdsPadMetadata: 0 + LocalReadVectorWidth: -1 + LocalReadVectorWidthA: 32 + LocalReadVectorWidthB: 32 + LocalReadVectorWidthMXSA: 1 + LocalReadVectorWidthMXSB: 1 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LocalWriteUseSgprMXSA: true + LocalWriteUseSgprMXSB: true + LoopIters: 2 + LoopUnroll: 256 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + 
MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMXSA: 1 + MIInputPerThreadMXSB: 1 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [1, 4] + MIWaveTile: [6, 4] + MIWaveTileA: 6 + MIWaveTileB: 4 + MIWaveTileMXSA: 6 + MIWaveTileMXSB: 4 + MIWaveTileMetadata: 0 + MacroTile0: 96 + MacroTile1: 256 + MacroTileA: 96 + MacroTileB: 256 + MacroTileMXSA: 96 + MacroTileMXSB: 256 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + MinGRIncPerMfma: 3 + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonDTLTailLoopMXSA: true + NonDTLTailLoopMXSB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 4 + NonTemporalC: 0 + NonTemporalD: 4 + NonTemporalE: 0 + NonTemporalMXSA: 0 + NonTemporalMXSB: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 96 + NumGlobalWriteVectorsPerThread: 96 + NumLdsBlk: 2 + NumLoadsA: 3 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsCoalescedMXSA: 1 + NumLoadsCoalescedMXSB: 1 + NumLoadsMXSA: 1 + NumLoadsMXSB: 2 + NumLoadsPerpendicularA: 3 + NumLoadsPerpendicularB: 8 + NumLoadsPerpendicularMXSA: 1 + NumLoadsPerpendicularMXSB: 2 + NumThreads: 256 + NumTotalPackedLoadsA: -1 + NumTotalPackedLoadsB: -1 + NumTotalPackedLoadsMXSA: -1 + NumTotalPackedLoadsMXSB: -1 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGROverBarrier: 0 + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 46 + SolutionNameMin: 
Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT96x256x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT6_4_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM16_WGMXCC2_WGMXCCGn1 + SourceSwap: false + SpaceFillingAlgo: [] + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 4 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 4 + SubGroup1: 64 + SubGroupA: 4 + SubGroupB: 64 + SubGroupMXSA: 4 + SubGroupMXSB: 64 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: 0 + TailloopInNll: false + ThreadTile: [1, 1] + ThreadTile0: 24 + ThreadTile1: 4 + ThreadTileA: 24 + ThreadTileB: 4 + ThreadTileMXSA: 24 + ThreadTileMXSB: 4 + TransposeLDS: 1 + TransposeLDSMetadata: -1 + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMXSA: true + UnrollMajorLDSMXSB: true + Use64bShadowLimit: false + Use64bShadowLimitMX: false + UseCustomMainLoopSchedule: 0 + UseDirect32XEmulation: false + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: false + UseGeneralizedNLCOneA: false + UseGeneralizedNLCOneB: false + UseGeneralizedNLCOneMXSA: false + UseGeneralizedNLCOneMXSB: false + UseGeneralizedNLCOneMetadata: false + UseInstOffsetForGRO: 0 + UseMFMAF32XEmulation: false + UsePLRPack: 0 + UseSgprForGRO: -1 + UseSubtileImpl: true + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 1 + 
VectorWidthMXSA: 1 + VectorWidthMXSB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMXSA: 0 + WaveSeparateGlobalReadMXSB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 16 + WorkGroupMappingXCC: 2 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 256 + _DepthUA: 256 + _DepthUB: 256 + _DepthUMXSA: 8 + _DepthUMXSB: 8 + _DepthUMetadata: 256 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + enableLDSTrMXSA: false + enableLDSTrMXSB: false + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + tailLoopOptMXSA: false + tailLoopOptMXSB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AdaptiveGemm: 0 + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1DivByMT1LowbitGT1: 0 + AssertFree1ElementMultiple: 1 + AssertKRingShiftTailWrapOnly: 0 + AssertSummationElementMultiple: 256 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BAddrInterleave: false + BaseName: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT16rkdIx0xJ9AdmxSLTSuPL_HwvXURN3d9JBsT0i2N2r4A= + BufferLoad: true + BufferStore: 1 + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 256 + DirectToLds: 1 + DirectToLdsA: true + DirectToLdsB: true + DirectToLdsMXSA: true + DirectToLdsMXSB: true + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprMXSA: false + DirectToVgprMXSB: false + DirectToVgprSparseMetadata: false + DtlPlusLdsBuf: 0 
+ EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: false + ExpertSchedulingMode: 0 + ExtraLatencyForLR: 0 + ExtraMiLatencyLeft: -1 + ForceDisableShadowInit: false + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 32 + GlobalReadVectorWidthB: 32 + GlobalReadVectorWidthMXSA: 5 + GlobalReadVectorWidthMXSB: 4 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + KRingShift: false + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT160x256x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT10_4_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_16_1 + LDSTrInst: false + LSCA: 256 + LSCB: 256 + LSCMXSA: 8 + LSCMXSB: 8 + LSPA: 32 + LSPB: 32 + LSPMXSA: 160 + LSPMXSB: 128 + LVCA: 8 + LVCB: 8 + LVCMXSA: 2 + LVCMXSB: 2 + LVPA: 1 + LVPB: 1 + LVPMXSA: 32 + LVPMXSB: 32 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsBlockSizePerPadMXSA: 0 + LdsBlockSizePerPadMXSB: 0 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 122112 + LdsInitCVgprs: false + LdsNumBytes: 122112 + LdsNumElementsAlignedA: 20480 
+ LdsNumElementsAlignedB: 32768 + LdsNumElementsAlignedMXSA: 1280 + LdsNumElementsAlignedMXSB: 2048 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 23808 + LdsOffsetB_Blk: 89344 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMXSA: 20480 + LdsOffsetMXSA_Blk: 86016 + LdsOffsetMXSB: 21760 + LdsOffsetMXSB_Blk: 87296 + LdsOffsetMetadata: 23808 + LdsOffsetMetadata_Blk: 89344 + LdsPadA: 0 + LdsPadB: 0 + LdsPadMXSA: 0 + LdsPadMXSB: 0 + LdsPadMetadata: 0 + LocalReadVectorWidth: -1 + LocalReadVectorWidthA: 32 + LocalReadVectorWidthB: 32 + LocalReadVectorWidthMXSA: 1 + LocalReadVectorWidthMXSB: 1 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LocalWriteUseSgprMXSA: true + LocalWriteUseSgprMXSB: true + LoopIters: 2 + LoopUnroll: 256 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMXSA: 1 + MIInputPerThreadMXSB: 1 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [1, 4] + MIWaveTile: [10, 4] + MIWaveTileA: 10 + MIWaveTileB: 4 + MIWaveTileMXSA: 10 + MIWaveTileMXSB: 4 + MIWaveTileMetadata: 0 + MacroTile0: 160 + MacroTile1: 256 + MacroTileA: 160 + MacroTileB: 256 + MacroTileMXSA: 160 + MacroTileMXSB: 256 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + MinGRIncPerMfma: 3 + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonDTLTailLoopMXSA: true + NonDTLTailLoopMXSB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 4 + NonTemporalC: 0 + NonTemporalD: 4 + NonTemporalE: 0 + NonTemporalMXSA: 0 + 
NonTemporalMXSB: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 160 + NumGlobalWriteVectorsPerThread: 160 + NumLdsBlk: 2 + NumLoadsA: 5 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsCoalescedMXSA: 1 + NumLoadsCoalescedMXSB: 1 + NumLoadsMXSA: 1 + NumLoadsMXSB: 2 + NumLoadsPerpendicularA: 5 + NumLoadsPerpendicularB: 8 + NumLoadsPerpendicularMXSA: 1 + NumLoadsPerpendicularMXSB: 2 + NumThreads: 256 + NumTotalPackedLoadsA: -1 + NumTotalPackedLoadsB: -1 + NumTotalPackedLoadsMXSA: -1 + NumTotalPackedLoadsMXSB: -1 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGROverBarrier: 0 + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 47 + SolutionNameMin: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT160x256x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT10_4_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM16_WGMXCC2_WGMXCCGn1 + SourceSwap: false + SpaceFillingAlgo: [] + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 4 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 4 + SubGroup1: 64 + SubGroupA: 4 + SubGroupB: 64 + SubGroupMXSA: 4 + SubGroupMXSB: 
64 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: 0 + TailloopInNll: false + ThreadTile: [1, 1] + ThreadTile0: 40 + ThreadTile1: 4 + ThreadTileA: 40 + ThreadTileB: 4 + ThreadTileMXSA: 40 + ThreadTileMXSB: 4 + TransposeLDS: 1 + TransposeLDSMetadata: -1 + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMXSA: true + UnrollMajorLDSMXSB: true + Use64bShadowLimit: false + Use64bShadowLimitMX: false + UseCustomMainLoopSchedule: 0 + UseDirect32XEmulation: false + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: false + UseGeneralizedNLCOneA: false + UseGeneralizedNLCOneB: false + UseGeneralizedNLCOneMXSA: false + UseGeneralizedNLCOneMXSB: false + UseGeneralizedNLCOneMetadata: false + UseInstOffsetForGRO: 0 + UseMFMAF32XEmulation: false + UsePLRPack: 0 + UseSgprForGRO: -1 + UseSubtileImpl: true + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 1 + VectorWidthMXSA: 1 + VectorWidthMXSB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMXSA: 0 + WaveSeparateGlobalReadMXSB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 16 + WorkGroupMappingXCC: 2 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 256 + _DepthUA: 256 + _DepthUB: 256 + _DepthUMXSA: 8 + _DepthUMXSB: 8 + _DepthUMetadata: 256 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + enableLDSTrMXSA: false + enableLDSTrMXSB: false + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + tailLoopOptMXSA: false + tailLoopOptMXSB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: 
false + ActivationFused: true + AdaptiveGemm: 0 + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1DivByMT1LowbitGT1: 0 + AssertFree1ElementMultiple: 1 + AssertKRingShiftTailWrapOnly: 0 + AssertSummationElementMultiple: 256 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BAddrInterleave: false + BaseName: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT28HX9AX04H5gx2Hnt9CFkwE4Io5BNr8Him7sxPkQ6FxSI= + BufferLoad: true + BufferStore: 1 + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 256 + DirectToLds: 1 + DirectToLdsA: true + DirectToLdsB: true + DirectToLdsMXSA: true + DirectToLdsMXSB: true + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprMXSA: false + DirectToVgprMXSB: false + DirectToVgprSparseMetadata: false + DtlPlusLdsBuf: 0 + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: false + ExpertSchedulingMode: 0 + ExtraLatencyForLR: 0 + ExtraMiLatencyLeft: -1 + ForceDisableShadowInit: false + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 32 + GlobalReadVectorWidthB: 32 + GlobalReadVectorWidthMXSA: 9 + GlobalReadVectorWidthMXSB: 4 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + KRingShift: false + Kernel: true + KernelLanguage: Assembly + KernelNameMin: 
Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT288x256x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT18_4_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_16_1 + LDSTrInst: false + LSCA: 256 + LSCB: 256 + LSCMXSA: 8 + LSCMXSB: 8 + LSPA: 32 + LSPB: 32 + LSPMXSA: 288 + LSPMXSB: 128 + LVCA: 8 + LVCB: 8 + LVCMXSA: 1 + LVCMXSB: 2 + LVPA: 1 + LVPB: 1 + LVPMXSA: 32 + LVPMXSB: 32 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsBlockSizePerPadMXSA: 0 + LdsBlockSizePerPadMXSB: 0 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 147968 + LdsInitCVgprs: false + LdsNumBytes: 147968 + LdsNumElementsAlignedA: 36864 + LdsNumElementsAlignedB: 32768 + LdsNumElementsAlignedMXSA: 2304 + LdsNumElementsAlignedMXSB: 2048 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 73984 + LdsOffsetB: 41216 + LdsOffsetB_Blk: 115200 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMXSA: 36864 + LdsOffsetMXSA_Blk: 110848 + LdsOffsetMXSB: 39168 + LdsOffsetMXSB_Blk: 113152 + LdsOffsetMetadata: 41216 + LdsOffsetMetadata_Blk: 115200 + LdsPadA: 0 + LdsPadB: 0 + LdsPadMXSA: 0 + LdsPadMXSB: 0 + LdsPadMetadata: 0 + LocalReadVectorWidth: -1 + LocalReadVectorWidthA: 32 + LocalReadVectorWidthB: 32 + LocalReadVectorWidthMXSA: 1 + LocalReadVectorWidthMXSB: 1 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LocalWriteUseSgprMXSA: true + LocalWriteUseSgprMXSB: true + LoopIters: 2 + LoopUnroll: 256 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + 
MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMXSA: 1 + MIInputPerThreadMXSB: 1 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [1, 4] + MIWaveTile: [18, 4] + MIWaveTileA: 18 + MIWaveTileB: 4 + MIWaveTileMXSA: 18 + MIWaveTileMXSB: 4 + MIWaveTileMetadata: 0 + MacroTile0: 288 + MacroTile1: 256 + MacroTileA: 288 + MacroTileB: 256 + MacroTileMXSA: 288 + MacroTileMXSB: 256 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + MinGRIncPerMfma: 3 + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonDTLTailLoopMXSA: true + NonDTLTailLoopMXSB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 4 + NonTemporalC: 0 + NonTemporalD: 4 + NonTemporalE: 0 + NonTemporalMXSA: 0 + NonTemporalMXSB: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 288 + NumGlobalWriteVectorsPerThread: 288 + NumLdsBlk: 2 + NumLoadsA: 9 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsCoalescedMXSA: 1 + NumLoadsCoalescedMXSB: 1 + NumLoadsMXSA: 1 + NumLoadsMXSB: 2 + NumLoadsPerpendicularA: 9 + NumLoadsPerpendicularB: 8 + NumLoadsPerpendicularMXSA: 1 + NumLoadsPerpendicularMXSB: 2 + NumThreads: 256 + NumTotalPackedLoadsA: -1 + NumTotalPackedLoadsB: -1 + NumTotalPackedLoadsMXSA: -1 + NumTotalPackedLoadsMXSB: -1 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGROverBarrier: 0 + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 48 + 
SolutionNameMin: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT288x256x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT18_4_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM16_WGMXCC2_WGMXCCGn1 + SourceSwap: false + SpaceFillingAlgo: [] + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: true + StoreSyncOpt: 0 + StoreVectorWidth: 4 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 4 + SubGroup1: 64 + SubGroupA: 4 + SubGroupB: 64 + SubGroupMXSA: 4 + SubGroupMXSB: 64 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: 0 + TailloopInNll: false + ThreadTile: [1, 1] + ThreadTile0: 72 + ThreadTile1: 4 + ThreadTileA: 72 + ThreadTileB: 4 + ThreadTileMXSA: 72 + ThreadTileMXSB: 4 + TransposeLDS: 1 + TransposeLDSMetadata: -1 + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMXSA: true + UnrollMajorLDSMXSB: true + Use64bShadowLimit: false + Use64bShadowLimitMX: false + UseCustomMainLoopSchedule: 0 + UseDirect32XEmulation: false + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: false + UseGeneralizedNLCOneA: false + UseGeneralizedNLCOneB: false + UseGeneralizedNLCOneMXSA: false + UseGeneralizedNLCOneMXSB: false + UseGeneralizedNLCOneMetadata: false + UseInstOffsetForGRO: 0 + UseMFMAF32XEmulation: false + UsePLRPack: 0 + UseSgprForGRO: -1 + UseSubtileImpl: true + Valid: true + VectorStore: -1 + VectorWidthA: 1 + 
VectorWidthB: 1 + VectorWidthMXSA: 1 + VectorWidthMXSB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMXSA: 0 + WaveSeparateGlobalReadMXSB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 16 + WorkGroupMappingXCC: 2 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 256 + _DepthUA: 256 + _DepthUB: 256 + _DepthUMXSA: 8 + _DepthUMXSB: 8 + _DepthUMetadata: 256 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + enableLDSTrMXSA: false + enableLDSTrMXSB: false + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + tailLoopOptMXSA: false + tailLoopOptMXSB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AdaptiveGemm: 0 + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1DivByMT1LowbitGT1: 0 + AssertFree1ElementMultiple: 1 + AssertKRingShiftTailWrapOnly: 0 + AssertSummationElementMultiple: 256 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BAddrInterleave: false + BaseName: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT323cjBEjqTj-Mu9ZF1RsqJ73WqkJOeA3kpk9e2rgOyJEQ= + BufferLoad: true + BufferStore: 1 + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 256 + DirectToLds: 1 + DirectToLdsA: true + DirectToLdsB: true + DirectToLdsMXSA: true + DirectToLdsMXSB: true + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprMXSA: false + DirectToVgprMXSB: false + DirectToVgprSparseMetadata: false 
+ DtlPlusLdsBuf: 0 + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: false + ExpertSchedulingMode: 0 + ExtraLatencyForLR: 0 + ExtraMiLatencyLeft: -1 + ForceDisableShadowInit: false + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 32 + GlobalReadVectorWidthB: 32 + GlobalReadVectorWidthMXSA: 1 + GlobalReadVectorWidthMXSB: 12 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + KRingShift: false + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT32x384x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT2_6_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_16_1 + LDSTrInst: false + LSCA: 256 + LSCB: 256 + LSCMXSA: 8 + LSCMXSB: 8 + LSPA: 32 + LSPB: 32 + LSPMXSA: 32 + LSPMXSB: 384 + LVCA: 8 + LVCB: 8 + LVCMXSA: 8 + LVCMXSB: 1 + LVPA: 1 + LVPB: 1 + LVPMXSA: 32 + LVPMXSB: 32 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsBlockSizePerPadMXSA: 0 + LdsBlockSizePerPadMXSB: 0 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 122112 + LdsInitCVgprs: false + LdsNumBytes: 122112 + 
LdsNumElementsAlignedA: 4096 + LdsNumElementsAlignedB: 49152 + LdsNumElementsAlignedMXSA: 256 + LdsNumElementsAlignedMXSB: 3072 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 7424 + LdsOffsetB_Blk: 72960 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMXSA: 4096 + LdsOffsetMXSA_Blk: 69632 + LdsOffsetMXSB: 4352 + LdsOffsetMXSB_Blk: 69888 + LdsOffsetMetadata: 7424 + LdsOffsetMetadata_Blk: 72960 + LdsPadA: 0 + LdsPadB: 0 + LdsPadMXSA: 0 + LdsPadMXSB: 0 + LdsPadMetadata: 0 + LocalReadVectorWidth: -1 + LocalReadVectorWidthA: 32 + LocalReadVectorWidthB: 32 + LocalReadVectorWidthMXSA: 1 + LocalReadVectorWidthMXSB: 1 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LocalWriteUseSgprMXSA: true + LocalWriteUseSgprMXSB: true + LoopIters: 2 + LoopUnroll: 256 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMXSA: 1 + MIInputPerThreadMXSB: 1 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [1, 4] + MIWaveTile: [2, 6] + MIWaveTileA: 2 + MIWaveTileB: 6 + MIWaveTileMXSA: 2 + MIWaveTileMXSB: 6 + MIWaveTileMetadata: 0 + MacroTile0: 32 + MacroTile1: 384 + MacroTileA: 32 + MacroTileB: 384 + MacroTileMXSA: 32 + MacroTileMXSB: 384 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + MinGRIncPerMfma: 3 + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonDTLTailLoopMXSA: true + NonDTLTailLoopMXSB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 4 + NonTemporalC: 0 + NonTemporalD: 4 + NonTemporalE: 0 + 
NonTemporalMXSA: 0 + NonTemporalMXSB: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 48 + NumGlobalWriteVectorsPerThread: 48 + NumLdsBlk: 2 + NumLoadsA: 1 + NumLoadsB: 12 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsCoalescedMXSA: 1 + NumLoadsCoalescedMXSB: 1 + NumLoadsMXSA: 1 + NumLoadsMXSB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 12 + NumLoadsPerpendicularMXSA: 1 + NumLoadsPerpendicularMXSB: 1 + NumThreads: 256 + NumTotalPackedLoadsA: -1 + NumTotalPackedLoadsB: -1 + NumTotalPackedLoadsMXSA: -1 + NumTotalPackedLoadsMXSB: -1 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGROverBarrier: 0 + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 49 + SolutionNameMin: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT32x384x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT2_6_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM16_WGMXCC2_WGMXCCGn1 + SourceSwap: false + SpaceFillingAlgo: [] + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 4 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 4 + SubGroup1: 64 + SubGroupA: 4 + SubGroupB: 64 + SubGroupMXSA: 
4 + SubGroupMXSB: 64 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: 0 + TailloopInNll: false + ThreadTile: [1, 1] + ThreadTile0: 8 + ThreadTile1: 6 + ThreadTileA: 8 + ThreadTileB: 6 + ThreadTileMXSA: 8 + ThreadTileMXSB: 6 + TransposeLDS: 1 + TransposeLDSMetadata: -1 + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMXSA: true + UnrollMajorLDSMXSB: true + Use64bShadowLimit: false + Use64bShadowLimitMX: false + UseCustomMainLoopSchedule: 0 + UseDirect32XEmulation: false + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: false + UseGeneralizedNLCOneA: false + UseGeneralizedNLCOneB: false + UseGeneralizedNLCOneMXSA: false + UseGeneralizedNLCOneMXSB: false + UseGeneralizedNLCOneMetadata: false + UseInstOffsetForGRO: 0 + UseMFMAF32XEmulation: false + UsePLRPack: 0 + UseSgprForGRO: -1 + UseSubtileImpl: true + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 1 + VectorWidthMXSA: 1 + VectorWidthMXSB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMXSA: 0 + WaveSeparateGlobalReadMXSB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 16 + WorkGroupMappingXCC: 2 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 256 + _DepthUA: 256 + _DepthUB: 256 + _DepthUMXSA: 8 + _DepthUMXSB: 8 + _DepthUMetadata: 256 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + enableLDSTrMXSA: false + enableLDSTrMXSB: false + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + tailLoopOptMXSA: false + tailLoopOptMXSB: false + - 1LDSBuffer: 0 + ActivationAlt: false + 
ActivationFuncCall: false + ActivationFused: true + AdaptiveGemm: 0 + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1DivByMT1LowbitGT1: 0 + AssertFree1ElementMultiple: 1 + AssertKRingShiftTailWrapOnly: 0 + AssertSummationElementMultiple: 256 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BAddrInterleave: false + BaseName: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT32k1L8EqrYflL4kvotLP49gssYPXLs5aDdnaskyCeVRw4= + BufferLoad: true + BufferStore: 1 + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 256 + DirectToLds: 1 + DirectToLdsA: true + DirectToLdsB: true + DirectToLdsMXSA: true + DirectToLdsMXSB: true + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprMXSA: false + DirectToVgprMXSB: false + DirectToVgprSparseMetadata: false + DtlPlusLdsBuf: 0 + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: false + ExpertSchedulingMode: 0 + ExtraLatencyForLR: 0 + ExtraMiLatencyLeft: -1 + ForceDisableShadowInit: false + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 32 + GlobalReadVectorWidthB: 32 + GlobalReadVectorWidthMXSA: 1 + GlobalReadVectorWidthMXSB: 16 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + KRingShift: false + Kernel: true + KernelLanguage: Assembly + KernelNameMin: 
Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT32x512x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT2_8_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_16_1 + LDSTrInst: false + LSCA: 256 + LSCB: 256 + LSCMXSA: 8 + LSCMXSB: 8 + LSPA: 32 + LSPB: 32 + LSPMXSA: 32 + LSPMXSB: 512 + LVCA: 8 + LVCB: 8 + LVCMXSA: 8 + LVCMXSB: 1 + LVPA: 1 + LVPB: 1 + LVPMXSA: 32 + LVPMXSB: 32 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsBlockSizePerPadMXSA: 0 + LdsBlockSizePerPadMXSB: 0 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 147968 + LdsInitCVgprs: false + LdsNumBytes: 147968 + LdsNumElementsAlignedA: 4096 + LdsNumElementsAlignedB: 65536 + LdsNumElementsAlignedMXSA: 256 + LdsNumElementsAlignedMXSB: 4096 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 73984 + LdsOffsetB: 8448 + LdsOffsetB_Blk: 82432 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMXSA: 4096 + LdsOffsetMXSA_Blk: 78080 + LdsOffsetMXSB: 4352 + LdsOffsetMXSB_Blk: 78336 + LdsOffsetMetadata: 8448 + LdsOffsetMetadata_Blk: 82432 + LdsPadA: 0 + LdsPadB: 0 + LdsPadMXSA: 0 + LdsPadMXSB: 0 + LdsPadMetadata: 0 + LocalReadVectorWidth: -1 + LocalReadVectorWidthA: 32 + LocalReadVectorWidthB: 32 + LocalReadVectorWidthMXSA: 1 + LocalReadVectorWidthMXSB: 1 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LocalWriteUseSgprMXSA: true + LocalWriteUseSgprMXSB: true + LoopIters: 2 + LoopUnroll: 256 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + 
MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMXSA: 1 + MIInputPerThreadMXSB: 1 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [1, 4] + MIWaveTile: [2, 8] + MIWaveTileA: 2 + MIWaveTileB: 8 + MIWaveTileMXSA: 2 + MIWaveTileMXSB: 8 + MIWaveTileMetadata: 0 + MacroTile0: 32 + MacroTile1: 512 + MacroTileA: 32 + MacroTileB: 512 + MacroTileMXSA: 32 + MacroTileMXSB: 512 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + MinGRIncPerMfma: 3 + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonDTLTailLoopMXSA: true + NonDTLTailLoopMXSB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 4 + NonTemporalC: 0 + NonTemporalD: 4 + NonTemporalE: 0 + NonTemporalMXSA: 0 + NonTemporalMXSB: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 64 + NumLdsBlk: 2 + NumLoadsA: 1 + NumLoadsB: 16 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsCoalescedMXSA: 1 + NumLoadsCoalescedMXSB: 1 + NumLoadsMXSA: 1 + NumLoadsMXSB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 16 + NumLoadsPerpendicularMXSA: 1 + NumLoadsPerpendicularMXSB: 1 + NumThreads: 256 + NumTotalPackedLoadsA: -1 + NumTotalPackedLoadsB: -1 + NumTotalPackedLoadsMXSA: -1 + NumTotalPackedLoadsMXSB: -1 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGROverBarrier: 0 + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 50 + SolutionNameMin: 
Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT32x512x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT2_8_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM16_WGMXCC2_WGMXCCGn1 + SourceSwap: false + SpaceFillingAlgo: [] + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: true + StoreSyncOpt: 0 + StoreVectorWidth: 4 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 4 + SubGroup1: 64 + SubGroupA: 4 + SubGroupB: 64 + SubGroupMXSA: 4 + SubGroupMXSB: 64 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: 0 + TailloopInNll: false + ThreadTile: [1, 1] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + ThreadTileMXSA: 8 + ThreadTileMXSB: 8 + TransposeLDS: 1 + TransposeLDSMetadata: -1 + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMXSA: true + UnrollMajorLDSMXSB: true + Use64bShadowLimit: false + Use64bShadowLimitMX: false + UseCustomMainLoopSchedule: 0 + UseDirect32XEmulation: false + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: false + UseGeneralizedNLCOneA: false + UseGeneralizedNLCOneB: false + UseGeneralizedNLCOneMXSA: false + UseGeneralizedNLCOneMXSB: false + UseGeneralizedNLCOneMetadata: false + UseInstOffsetForGRO: 0 + UseMFMAF32XEmulation: false + UsePLRPack: 0 + UseSgprForGRO: -1 + UseSubtileImpl: true + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 1 + 
VectorWidthMXSA: 1 + VectorWidthMXSB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMXSA: 0 + WaveSeparateGlobalReadMXSB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 16 + WorkGroupMappingXCC: 2 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 256 + _DepthUA: 256 + _DepthUB: 256 + _DepthUMXSA: 8 + _DepthUMXSB: 8 + _DepthUMetadata: 256 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + enableLDSTrMXSA: false + enableLDSTrMXSB: false + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + tailLoopOptMXSA: false + tailLoopOptMXSB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AdaptiveGemm: 0 + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1DivByMT1LowbitGT1: 0 + AssertFree1ElementMultiple: 1 + AssertKRingShiftTailWrapOnly: 0 + AssertSummationElementMultiple: 256 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BAddrInterleave: false + BaseName: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT9680-vsketBKOwATFxdutsK5i_NcnvLOonACpwOS25BUY= + BufferLoad: true + BufferStore: 1 + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 256 + DirectToLds: 1 + DirectToLdsA: true + DirectToLdsB: true + DirectToLdsMXSA: true + DirectToLdsMXSB: true + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprMXSA: false + DirectToVgprMXSB: false + DirectToVgprSparseMetadata: false + DtlPlusLdsBuf: 0 
+ EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: false + ExpertSchedulingMode: 0 + ExtraLatencyForLR: 0 + ExtraMiLatencyLeft: -1 + ForceDisableShadowInit: false + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 32 + GlobalReadVectorWidthB: 32 + GlobalReadVectorWidthMXSA: 3 + GlobalReadVectorWidthMXSB: 12 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + KRingShift: false + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT96x384x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT6_6_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_16_1 + LDSTrInst: false + LSCA: 256 + LSCB: 256 + LSCMXSA: 8 + LSCMXSB: 8 + LSPA: 32 + LSPB: 32 + LSPMXSA: 96 + LSPMXSB: 384 + LVCA: 8 + LVCB: 8 + LVCMXSA: 3 + LVCMXSB: 1 + LVPA: 1 + LVPB: 1 + LVPMXSA: 32 + LVPMXSB: 32 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsBlockSizePerPadMXSA: 0 + LdsBlockSizePerPadMXSB: 0 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 130816 + LdsInitCVgprs: false + LdsNumBytes: 130816 + LdsNumElementsAlignedA: 12288 + 
LdsNumElementsAlignedB: 49152 + LdsNumElementsAlignedMXSA: 768 + LdsNumElementsAlignedMXSB: 3072 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 16128 + LdsOffsetB_Blk: 81664 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMXSA: 12288 + LdsOffsetMXSA_Blk: 77824 + LdsOffsetMXSB: 13056 + LdsOffsetMXSB_Blk: 78592 + LdsOffsetMetadata: 16128 + LdsOffsetMetadata_Blk: 81664 + LdsPadA: 0 + LdsPadB: 0 + LdsPadMXSA: 0 + LdsPadMXSB: 0 + LdsPadMetadata: 0 + LocalReadVectorWidth: -1 + LocalReadVectorWidthA: 32 + LocalReadVectorWidthB: 32 + LocalReadVectorWidthMXSA: 1 + LocalReadVectorWidthMXSB: 1 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LocalWriteUseSgprMXSA: true + LocalWriteUseSgprMXSB: true + LoopIters: 2 + LoopUnroll: 256 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMXSA: 1 + MIInputPerThreadMXSB: 1 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [1, 4] + MIWaveTile: [6, 6] + MIWaveTileA: 6 + MIWaveTileB: 6 + MIWaveTileMXSA: 6 + MIWaveTileMXSB: 6 + MIWaveTileMetadata: 0 + MacroTile0: 96 + MacroTile1: 384 + MacroTileA: 96 + MacroTileB: 384 + MacroTileMXSA: 96 + MacroTileMXSB: 384 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + MinGRIncPerMfma: 3 + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonDTLTailLoopMXSA: true + NonDTLTailLoopMXSB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 4 + NonTemporalC: 0 + NonTemporalD: 4 + NonTemporalE: 0 + NonTemporalMXSA: 0 + 
NonTemporalMXSB: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 144 + NumGlobalWriteVectorsPerThread: 144 + NumLdsBlk: 2 + NumLoadsA: 3 + NumLoadsB: 12 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsCoalescedMXSA: 1 + NumLoadsCoalescedMXSB: 1 + NumLoadsMXSA: 1 + NumLoadsMXSB: 1 + NumLoadsPerpendicularA: 3 + NumLoadsPerpendicularB: 12 + NumLoadsPerpendicularMXSA: 1 + NumLoadsPerpendicularMXSB: 1 + NumThreads: 256 + NumTotalPackedLoadsA: -1 + NumTotalPackedLoadsB: -1 + NumTotalPackedLoadsMXSA: -1 + NumTotalPackedLoadsMXSB: -1 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGROverBarrier: 0 + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 51 + SolutionNameMin: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT96x384x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT6_6_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM16_WGMXCC2_WGMXCCGn1 + SourceSwap: false + SpaceFillingAlgo: [] + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 4 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 4 + SubGroup1: 64 + SubGroupA: 4 + SubGroupB: 64 + SubGroupMXSA: 4 + SubGroupMXSB: 
64 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: 0 + TailloopInNll: false + ThreadTile: [1, 1] + ThreadTile0: 24 + ThreadTile1: 6 + ThreadTileA: 24 + ThreadTileB: 6 + ThreadTileMXSA: 24 + ThreadTileMXSB: 6 + TransposeLDS: 1 + TransposeLDSMetadata: -1 + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMXSA: true + UnrollMajorLDSMXSB: true + Use64bShadowLimit: false + Use64bShadowLimitMX: false + UseCustomMainLoopSchedule: 0 + UseDirect32XEmulation: false + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: false + UseGeneralizedNLCOneA: false + UseGeneralizedNLCOneB: false + UseGeneralizedNLCOneMXSA: false + UseGeneralizedNLCOneMXSB: false + UseGeneralizedNLCOneMetadata: false + UseInstOffsetForGRO: 0 + UseMFMAF32XEmulation: false + UsePLRPack: 0 + UseSgprForGRO: -1 + UseSubtileImpl: true + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 1 + VectorWidthMXSA: 1 + VectorWidthMXSB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMXSA: 0 + WaveSeparateGlobalReadMXSB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 16 + WorkGroupMappingXCC: 2 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 256 + _DepthUA: 256 + _DepthUB: 256 + _DepthUMXSA: 8 + _DepthUMXSB: 8 + _DepthUMetadata: 256 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + enableLDSTrMXSA: false + enableLDSTrMXSB: false + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + tailLoopOptMXSA: false + tailLoopOptMXSB: false +- null +- null +- null +- null +- DeviceEfficiency +- 
Prediction diff --git a/projects/hipblaslt/library/src/amd_detail/rocblaslt/src/Tensile/Logic/asm_full/gfx950/gfx950/Origami/Origami_ntb4/gfx950_Cijk_Alik_Bljk_F4HS_MXA32_MXB32_BH_BiasSH_HAS_SAB_SAV_UserArgs.yaml b/projects/hipblaslt/library/src/amd_detail/rocblaslt/src/Tensile/Logic/asm_full/gfx950/gfx950/Origami/Origami_ntb4/gfx950_Cijk_Alik_Bljk_F4HS_MXA32_MXB32_BH_BiasSH_HAS_SAB_SAV_UserArgs.yaml new file mode 100644 index 00000000000..ea0f611ec12 --- /dev/null +++ b/projects/hipblaslt/library/src/amd_detail/rocblaslt/src/Tensile/Logic/asm_full/gfx950/gfx950/Origami/Origami_ntb4/gfx950_Cijk_Alik_Bljk_F4HS_MXA32_MXB32_BH_BiasSH_HAS_SAB_SAV_UserArgs.yaml @@ -0,0 +1,16950 @@ +- {MinimumRequiredVersion: 5.0.0} +- gfx950 +- gfx950 +- [Device 75a0] +- Activation: true + ActivationComputeDataType: 0 + ActivationNoGuard: false + ActivationType: hipblaslt_all + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + BetaOnlyUseBias: false + BiasDataTypeList: [0, 4] + BiasSrc: D + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 21 + DataTypeA: 21 + DataTypeAmaxD: 0 + DataTypeB: 21 + DataTypeE: 4 + DataTypeMXSA: 22 + DataTypeMXSB: 22 + DestDataType: 4 + F32XdlMathOp: 0 + Gradient: false + GroupedGemm: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index01MXSA: 0 + Index01MXSB: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexAssignmentsMXSA: [3, 0, 2] + IndexAssignmentsMXSB: [3, 1, 2] + IndexAssignmentsMetadata: [3, 0, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndexUnrollM: 0 + IndexUnrollMXSA: 0 + IndexUnrollMXSB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MXBlockA: 32 + MXBlockB: 32 + MacDataTypeA: 21 + MacDataTypeB: 21 + MirrorDimsA: [] + MirrorDimsB: [] + MirrorDimsMXSA: [] + MirrorDimsMXSB: [] + MirrorDimsMetadata: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + 
NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + OutputAmaxD: false + SetConstStrideA: [] + SetConstStrideB: [] + SetConstStrideBias: [] + SilentHighPrecisionAccumulate: false + Sparse: 0 + StochasticRounding: false + StridedBatched: true + SupportUserArgs: true + SwizzleTensorA: false + SwizzleTensorB: false + TLUA: false + TLUB: false + TLUMXSA: false + TLUMXSB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: 1 + TransposeB: 0 + UseBeta: true + UseBias: 1 + UseE: false + UseInitialStridesAB: false + UseInitialStridesCD: false + UseScaleAB: Scalar + UseScaleAlphaVec: 1 + UseScaleCD: false +- - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AdaptiveGemm: 0 + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1DivByMT1LowbitGT1: 0 + AssertFree1ElementMultiple: 1 + AssertKRingShiftTailWrapOnly: 0 + AssertSummationElementMultiple: 256 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BAddrInterleave: false + BaseName: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT64hIsnhFT2L5uw_T8ryWf3mri6PtKpwJ1E97MC3rNtxpw= + BufferLoad: true + BufferStore: 1 + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 256 + DirectToLds: 1 + DirectToLdsA: true + DirectToLdsB: true + DirectToLdsMXSA: true + DirectToLdsMXSB: true + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprMXSA: false + DirectToVgprMXSB: false + DirectToVgprSparseMetadata: false + DtlPlusLdsBuf: 0 + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: false + ExpertSchedulingMode: 0 + ExtraLatencyForLR: 0 + ExtraMiLatencyLeft: -1 + ForceDisableShadowInit: false + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + 
GlobalReadVectorWidthA: 32 + GlobalReadVectorWidthB: 32 + GlobalReadVectorWidthMXSA: 2 + GlobalReadVectorWidthMXSB: 2 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + KRingShift: false + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT64x64x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT2_2_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 256 + LSCB: 256 + LSCMXSA: 8 + LSCMXSB: 8 + LSPA: 32 + LSPB: 32 + LSPMXSA: 64 + LSPMXSB: 64 + LVCA: 8 + LVCB: 8 + LVCMXSA: 4 + LVCMXSB: 4 + LVPA: 1 + LVPB: 1 + LVPMXSA: 32 + LVPMXSB: 32 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsBlockSizePerPadMXSA: 0 + LdsBlockSizePerPadMXSB: 0 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 50176 + LdsInitCVgprs: false + LdsNumBytes: 50176 + LdsNumElementsAlignedA: 8192 + LdsNumElementsAlignedB: 8192 + LdsNumElementsAlignedMXSA: 512 + LdsNumElementsAlignedMXSB: 512 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 32768 + LdsOffsetB: 9216 + LdsOffsetB_Blk: 41984 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + 
LdsOffsetBiasNonGSU: 0 + LdsOffsetMXSA: 8192 + LdsOffsetMXSA_Blk: 40960 + LdsOffsetMXSB: 8704 + LdsOffsetMXSB_Blk: 41472 + LdsOffsetMetadata: 9216 + LdsOffsetMetadata_Blk: 41984 + LdsPadA: 0 + LdsPadB: 0 + LdsPadMXSA: 0 + LdsPadMXSB: 0 + LdsPadMetadata: 0 + LocalReadVectorWidth: -1 + LocalReadVectorWidthA: 32 + LocalReadVectorWidthB: 32 + LocalReadVectorWidthMXSA: 1 + LocalReadVectorWidthMXSB: 1 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LocalWriteUseSgprMXSA: true + LocalWriteUseSgprMXSB: true + LoopIters: 2 + LoopUnroll: 256 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMXSA: 1 + MIInputPerThreadMXSB: 1 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [2, 2] + MIWaveTileA: 2 + MIWaveTileB: 2 + MIWaveTileMXSA: 2 + MIWaveTileMXSB: 2 + MIWaveTileMetadata: 0 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileMXSA: 64 + MacroTileMXSB: 64 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + MinGRIncPerMfma: 3 + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonDTLTailLoopMXSA: true + NonDTLTailLoopMXSB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 4 + NonTemporalC: 0 + NonTemporalD: 4 + NonTemporalE: 0 + NonTemporalMXSA: 0 + NonTemporalMXSB: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 16 + NumLdsBlk: 2 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsCoalescedMXSA: 1 
+ NumLoadsCoalescedMXSB: 1 + NumLoadsMXSA: 1 + NumLoadsMXSB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularMXSA: 1 + NumLoadsPerpendicularMXSB: 1 + NumThreads: 256 + NumTotalPackedLoadsA: -1 + NumTotalPackedLoadsB: -1 + NumTotalPackedLoadsMXSA: -1 + NumTotalPackedLoadsMXSB: -1 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGROverBarrier: 0 + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 0 + SolutionNameMin: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT64x64x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT2_2_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM16_WGMXCC2_WGMXCCGn1 + SourceSwap: false + SpaceFillingAlgo: [] + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 4 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SubGroupMXSA: 8 + SubGroupMXSB: 32 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: 0 + TailloopInNll: false + ThreadTile: [1, 1] + ThreadTile0: 8 + ThreadTile1: 2 + ThreadTileA: 8 + ThreadTileB: 2 + ThreadTileMXSA: 8 + ThreadTileMXSB: 2 + TransposeLDS: 1 + TransposeLDSMetadata: -1 + ULSGRODoubleG2L: 0 + 
UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMXSA: true + UnrollMajorLDSMXSB: true + Use64bShadowLimit: false + Use64bShadowLimitMX: false + UseCustomMainLoopSchedule: 0 + UseDirect32XEmulation: false + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: false + UseGeneralizedNLCOneA: false + UseGeneralizedNLCOneB: false + UseGeneralizedNLCOneMXSA: false + UseGeneralizedNLCOneMXSB: false + UseGeneralizedNLCOneMetadata: false + UseInstOffsetForGRO: 0 + UseMFMAF32XEmulation: false + UsePLRPack: 0 + UseSgprForGRO: -1 + UseSubtileImpl: true + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 1 + VectorWidthMXSA: 1 + VectorWidthMXSB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMXSA: 0 + WaveSeparateGlobalReadMXSB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 16 + WorkGroupMappingXCC: 2 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 256 + _DepthUA: 256 + _DepthUB: 256 + _DepthUMXSA: 8 + _DepthUMXSB: 8 + _DepthUMetadata: 256 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + enableLDSTrMXSA: false + enableLDSTrMXSB: false + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + tailLoopOptMXSA: false + tailLoopOptMXSB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AdaptiveGemm: 0 + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1DivByMT1LowbitGT1: 0 + AssertFree1ElementMultiple: 1 + AssertKRingShiftTailWrapOnly: 0 + AssertSummationElementMultiple: 256 + 
AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BAddrInterleave: false + BaseName: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT128AMjoHHGnVSJ1fjvuIJeKPVtejro7lpRL6TixtWnDac= + BufferLoad: true + BufferStore: 1 + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 256 + DirectToLds: 1 + DirectToLdsA: true + DirectToLdsB: true + DirectToLdsMXSA: true + DirectToLdsMXSB: true + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprMXSA: false + DirectToVgprMXSB: false + DirectToVgprSparseMetadata: false + DtlPlusLdsBuf: 0 + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: false + ExpertSchedulingMode: 0 + ExtraLatencyForLR: 0 + ExtraMiLatencyLeft: -1 + ForceDisableShadowInit: false + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 32 + GlobalReadVectorWidthB: 32 + GlobalReadVectorWidthMXSA: 4 + GlobalReadVectorWidthMXSB: 4 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + KRingShift: false + Kernel: true + KernelLanguage: Assembly + KernelNameMin: 
Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT128x128x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT4_4_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 256 + LSCB: 256 + LSCMXSA: 8 + LSCMXSB: 8 + LSPA: 32 + LSPB: 32 + LSPMXSA: 128 + LSPMXSB: 128 + LVCA: 8 + LVCB: 8 + LVCMXSA: 2 + LVCMXSB: 2 + LVPA: 1 + LVPB: 1 + LVPMXSA: 32 + LVPMXSB: 32 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsBlockSizePerPadMXSA: 0 + LdsBlockSizePerPadMXSB: 0 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 100352 + LdsInitCVgprs: false + LdsNumBytes: 100352 + LdsNumElementsAlignedA: 16384 + LdsNumElementsAlignedB: 16384 + LdsNumElementsAlignedMXSA: 1024 + LdsNumElementsAlignedMXSB: 1024 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 18432 + LdsOffsetB_Blk: 83968 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMXSA: 16384 + LdsOffsetMXSA_Blk: 81920 + LdsOffsetMXSB: 17408 + LdsOffsetMXSB_Blk: 82944 + LdsOffsetMetadata: 18432 + LdsOffsetMetadata_Blk: 83968 + LdsPadA: 0 + LdsPadB: 0 + LdsPadMXSA: 0 + LdsPadMXSB: 0 + LdsPadMetadata: 0 + LocalReadVectorWidth: -1 + LocalReadVectorWidthA: 32 + LocalReadVectorWidthB: 32 + LocalReadVectorWidthMXSA: 1 + LocalReadVectorWidthMXSB: 1 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LocalWriteUseSgprMXSA: true + LocalWriteUseSgprMXSB: true + LoopIters: 2 + LoopUnroll: 256 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + 
MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMXSA: 1 + MIInputPerThreadMXSB: 1 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MIWaveTileMXSA: 4 + MIWaveTileMXSB: 4 + MIWaveTileMetadata: 0 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileMXSA: 128 + MacroTileMXSB: 128 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + MinGRIncPerMfma: 3 + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonDTLTailLoopMXSA: true + NonDTLTailLoopMXSB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 4 + NonTemporalC: 0 + NonTemporalD: 4 + NonTemporalE: 0 + NonTemporalMXSA: 0 + NonTemporalMXSB: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 64 + NumLdsBlk: 2 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsCoalescedMXSA: 1 + NumLoadsCoalescedMXSB: 1 + NumLoadsMXSA: 1 + NumLoadsMXSB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumLoadsPerpendicularMXSA: 1 + NumLoadsPerpendicularMXSB: 1 + NumThreads: 256 + NumTotalPackedLoadsA: -1 + NumTotalPackedLoadsB: -1 + NumTotalPackedLoadsMXSA: -1 + NumTotalPackedLoadsMXSB: -1 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGROverBarrier: 0 + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 1 + SolutionNameMin: 
Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT128x128x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT4_4_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM16_WGMXCC2_WGMXCCGn1 + SourceSwap: false + SpaceFillingAlgo: [] + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 4 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SubGroupMXSA: 8 + SubGroupMXSB: 32 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: 0 + TailloopInNll: false + ThreadTile: [1, 1] + ThreadTile0: 16 + ThreadTile1: 4 + ThreadTileA: 16 + ThreadTileB: 4 + ThreadTileMXSA: 16 + ThreadTileMXSB: 4 + TransposeLDS: 1 + TransposeLDSMetadata: -1 + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMXSA: true + UnrollMajorLDSMXSB: true + Use64bShadowLimit: false + Use64bShadowLimitMX: false + UseCustomMainLoopSchedule: 0 + UseDirect32XEmulation: false + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: false + UseGeneralizedNLCOneA: false + UseGeneralizedNLCOneB: false + UseGeneralizedNLCOneMXSA: false + UseGeneralizedNLCOneMXSB: false + UseGeneralizedNLCOneMetadata: false + UseInstOffsetForGRO: 0 + UseMFMAF32XEmulation: false + UsePLRPack: 0 + UseSgprForGRO: -1 + UseSubtileImpl: true + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 1 + 
VectorWidthMXSA: 1 + VectorWidthMXSB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMXSA: 0 + WaveSeparateGlobalReadMXSB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 16 + WorkGroupMappingXCC: 2 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 256 + _DepthUA: 256 + _DepthUB: 256 + _DepthUMXSA: 8 + _DepthUMXSB: 8 + _DepthUMetadata: 256 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + enableLDSTrMXSA: false + enableLDSTrMXSB: false + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + tailLoopOptMXSA: false + tailLoopOptMXSB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AdaptiveGemm: 0 + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1DivByMT1LowbitGT1: 0 + AssertFree1ElementMultiple: 1 + AssertKRingShiftTailWrapOnly: 0 + AssertSummationElementMultiple: 256 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BAddrInterleave: false + BaseName: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT19tVFE7Mochg2Q71HzQS4tIGctt0Td3p2SmmxHjjWSYTc= + BufferLoad: true + BufferStore: 1 + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 256 + DirectToLds: 1 + DirectToLdsA: true + DirectToLdsB: true + DirectToLdsMXSA: true + DirectToLdsMXSB: true + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprMXSA: false + DirectToVgprMXSB: false + DirectToVgprSparseMetadata: false + DtlPlusLdsBuf: 0 
+ EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: false + ExpertSchedulingMode: 0 + ExtraLatencyForLR: 0 + ExtraMiLatencyLeft: -1 + ForceDisableShadowInit: false + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 32 + GlobalReadVectorWidthB: 32 + GlobalReadVectorWidthMXSA: 6 + GlobalReadVectorWidthMXSB: 6 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + KRingShift: false + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT192x192x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT6_6_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 256 + LSCB: 256 + LSCMXSA: 8 + LSCMXSB: 8 + LSPA: 32 + LSPB: 32 + LSPMXSA: 192 + LSPMXSB: 192 + LVCA: 8 + LVCB: 8 + LVCMXSA: 2 + LVCMXSB: 2 + LVPA: 1 + LVPB: 1 + LVPMXSA: 32 + LVPMXSB: 32 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsBlockSizePerPadMXSA: 0 + LdsBlockSizePerPadMXSB: 0 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 117760 + LdsInitCVgprs: false + LdsNumBytes: 117760 + LdsNumElementsAlignedA: 24576 + 
LdsNumElementsAlignedB: 24576 + LdsNumElementsAlignedMXSA: 1536 + LdsNumElementsAlignedMXSB: 1536 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 27648 + LdsOffsetB_Blk: 93184 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMXSA: 24576 + LdsOffsetMXSA_Blk: 90112 + LdsOffsetMXSB: 26112 + LdsOffsetMXSB_Blk: 91648 + LdsOffsetMetadata: 27648 + LdsOffsetMetadata_Blk: 93184 + LdsPadA: 0 + LdsPadB: 0 + LdsPadMXSA: 0 + LdsPadMXSB: 0 + LdsPadMetadata: 0 + LocalReadVectorWidth: -1 + LocalReadVectorWidthA: 32 + LocalReadVectorWidthB: 32 + LocalReadVectorWidthMXSA: 1 + LocalReadVectorWidthMXSB: 1 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LocalWriteUseSgprMXSA: true + LocalWriteUseSgprMXSB: true + LoopIters: 2 + LoopUnroll: 256 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMXSA: 1 + MIInputPerThreadMXSB: 1 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [6, 6] + MIWaveTileA: 6 + MIWaveTileB: 6 + MIWaveTileMXSA: 6 + MIWaveTileMXSB: 6 + MIWaveTileMetadata: 0 + MacroTile0: 192 + MacroTile1: 192 + MacroTileA: 192 + MacroTileB: 192 + MacroTileMXSA: 192 + MacroTileMXSB: 192 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + MinGRIncPerMfma: 3 + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonDTLTailLoopMXSA: true + NonDTLTailLoopMXSB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 4 + NonTemporalC: 0 + NonTemporalD: 4 + NonTemporalE: 0 + NonTemporalMXSA: 0 + 
NonTemporalMXSB: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 144 + NumGlobalWriteVectorsPerThread: 144 + NumLdsBlk: 2 + NumLoadsA: 6 + NumLoadsB: 6 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsCoalescedMXSA: 1 + NumLoadsCoalescedMXSB: 1 + NumLoadsMXSA: 1 + NumLoadsMXSB: 1 + NumLoadsPerpendicularA: 6 + NumLoadsPerpendicularB: 6 + NumLoadsPerpendicularMXSA: 1 + NumLoadsPerpendicularMXSB: 1 + NumThreads: 256 + NumTotalPackedLoadsA: -1 + NumTotalPackedLoadsB: -1 + NumTotalPackedLoadsMXSA: -1 + NumTotalPackedLoadsMXSB: -1 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGROverBarrier: 0 + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 2 + SolutionNameMin: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT192x192x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT6_6_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM16_WGMXCC2_WGMXCCGn1 + SourceSwap: false + SpaceFillingAlgo: [] + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 4 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SubGroupMXSA: 8 + SubGroupMXSB: 32 
+ SuppressNoLoadLoop: false + SwapGlobalReadOrder: 0 + TailloopInNll: false + ThreadTile: [1, 1] + ThreadTile0: 24 + ThreadTile1: 6 + ThreadTileA: 24 + ThreadTileB: 6 + ThreadTileMXSA: 24 + ThreadTileMXSB: 6 + TransposeLDS: 1 + TransposeLDSMetadata: -1 + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMXSA: true + UnrollMajorLDSMXSB: true + Use64bShadowLimit: false + Use64bShadowLimitMX: false + UseCustomMainLoopSchedule: 0 + UseDirect32XEmulation: false + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: false + UseGeneralizedNLCOneA: false + UseGeneralizedNLCOneB: false + UseGeneralizedNLCOneMXSA: false + UseGeneralizedNLCOneMXSB: false + UseGeneralizedNLCOneMetadata: false + UseInstOffsetForGRO: 0 + UseMFMAF32XEmulation: false + UsePLRPack: 0 + UseSgprForGRO: -1 + UseSubtileImpl: true + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 1 + VectorWidthMXSA: 1 + VectorWidthMXSB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMXSA: 0 + WaveSeparateGlobalReadMXSB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 16 + WorkGroupMappingXCC: 2 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 256 + _DepthUA: 256 + _DepthUB: 256 + _DepthUMXSA: 8 + _DepthUMXSB: 8 + _DepthUMetadata: 256 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + enableLDSTrMXSA: false + enableLDSTrMXSB: false + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + tailLoopOptMXSA: false + tailLoopOptMXSB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: 
false + ActivationFused: true + AdaptiveGemm: 0 + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1DivByMT1LowbitGT1: 0 + AssertFree1ElementMultiple: 1 + AssertKRingShiftTailWrapOnly: 0 + AssertSummationElementMultiple: 256 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BAddrInterleave: false + BaseName: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT25YgPR4XPaMfFjkFVdEJNxrCtJ2m9IQRE3ggDN5hyNKfg= + BufferLoad: true + BufferStore: 1 + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 256 + DirectToLds: 1 + DirectToLdsA: true + DirectToLdsB: true + DirectToLdsMXSA: true + DirectToLdsMXSB: true + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprMXSA: false + DirectToVgprMXSB: false + DirectToVgprSparseMetadata: false + DtlPlusLdsBuf: 0 + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: false + ExpertSchedulingMode: 0 + ExtraLatencyForLR: 0 + ExtraMiLatencyLeft: -1 + ForceDisableShadowInit: false + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 32 + GlobalReadVectorWidthB: 32 + GlobalReadVectorWidthMXSA: 4 + GlobalReadVectorWidthMXSB: 4 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + KRingShift: false + Kernel: true + KernelLanguage: Assembly + KernelNameMin: 
Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT256x256x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT8_8_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 256 + LSCB: 256 + LSCMXSA: 8 + LSCMXSB: 8 + LSPA: 32 + LSPB: 32 + LSPMXSA: 128 + LSPMXSB: 128 + LVCA: 8 + LVCB: 8 + LVCMXSA: 2 + LVCMXSB: 2 + LVPA: 1 + LVPB: 1 + LVPMXSA: 32 + LVPMXSB: 32 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsBlockSizePerPadMXSA: 0 + LdsBlockSizePerPadMXSB: 0 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 139264 + LdsInitCVgprs: false + LdsNumBytes: 139264 + LdsNumElementsAlignedA: 32768 + LdsNumElementsAlignedB: 32768 + LdsNumElementsAlignedMXSA: 2048 + LdsNumElementsAlignedMXSB: 2048 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 69632 + LdsOffsetB: 36864 + LdsOffsetB_Blk: 106496 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMXSA: 32768 + LdsOffsetMXSA_Blk: 102400 + LdsOffsetMXSB: 34816 + LdsOffsetMXSB_Blk: 104448 + LdsOffsetMetadata: 36864 + LdsOffsetMetadata_Blk: 106496 + LdsPadA: 0 + LdsPadB: 0 + LdsPadMXSA: 0 + LdsPadMXSB: 0 + LdsPadMetadata: 0 + LocalReadVectorWidth: -1 + LocalReadVectorWidthA: 32 + LocalReadVectorWidthB: 32 + LocalReadVectorWidthMXSA: 1 + LocalReadVectorWidthMXSB: 1 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LocalWriteUseSgprMXSA: true + LocalWriteUseSgprMXSB: true + LoopIters: 2 + LoopUnroll: 256 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + 
MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMXSA: 1 + MIInputPerThreadMXSB: 1 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [8, 8] + MIWaveTileA: 8 + MIWaveTileB: 8 + MIWaveTileMXSA: 8 + MIWaveTileMXSB: 8 + MIWaveTileMetadata: 0 + MacroTile0: 256 + MacroTile1: 256 + MacroTileA: 256 + MacroTileB: 256 + MacroTileMXSA: 256 + MacroTileMXSB: 256 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + MinGRIncPerMfma: 3 + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonDTLTailLoopMXSA: true + NonDTLTailLoopMXSB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 4 + NonTemporalC: 0 + NonTemporalD: 4 + NonTemporalE: 0 + NonTemporalMXSA: 0 + NonTemporalMXSB: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 256 + NumGlobalWriteVectorsPerThread: 256 + NumLdsBlk: 2 + NumLoadsA: 8 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsCoalescedMXSA: 1 + NumLoadsCoalescedMXSB: 1 + NumLoadsMXSA: 2 + NumLoadsMXSB: 2 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 8 + NumLoadsPerpendicularMXSA: 2 + NumLoadsPerpendicularMXSB: 2 + NumThreads: 256 + NumTotalPackedLoadsA: -1 + NumTotalPackedLoadsB: -1 + NumTotalPackedLoadsMXSA: -1 + NumTotalPackedLoadsMXSB: -1 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGROverBarrier: 0 + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 3 + 
SolutionNameMin: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT256x256x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT8_8_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM16_WGMXCC2_WGMXCCGn1 + SourceSwap: false + SpaceFillingAlgo: [] + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: true + StoreSyncOpt: 0 + StoreVectorWidth: 4 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SubGroupMXSA: 8 + SubGroupMXSB: 32 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: 0 + TailloopInNll: false + ThreadTile: [1, 1] + ThreadTile0: 32 + ThreadTile1: 8 + ThreadTileA: 32 + ThreadTileB: 8 + ThreadTileMXSA: 32 + ThreadTileMXSB: 8 + TransposeLDS: 1 + TransposeLDSMetadata: -1 + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMXSA: true + UnrollMajorLDSMXSB: true + Use64bShadowLimit: false + Use64bShadowLimitMX: false + UseCustomMainLoopSchedule: 0 + UseDirect32XEmulation: false + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: false + UseGeneralizedNLCOneA: false + UseGeneralizedNLCOneB: false + UseGeneralizedNLCOneMXSA: false + UseGeneralizedNLCOneMXSB: false + UseGeneralizedNLCOneMetadata: false + UseInstOffsetForGRO: 0 + UseMFMAF32XEmulation: false + UsePLRPack: 0 + UseSgprForGRO: -1 + UseSubtileImpl: true + Valid: true + VectorStore: -1 + VectorWidthA: 1 + 
VectorWidthB: 1 + VectorWidthMXSA: 1 + VectorWidthMXSB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMXSA: 0 + WaveSeparateGlobalReadMXSB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 16 + WorkGroupMappingXCC: 2 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 256 + _DepthUA: 256 + _DepthUB: 256 + _DepthUMXSA: 8 + _DepthUMXSB: 8 + _DepthUMetadata: 256 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + enableLDSTrMXSA: false + enableLDSTrMXSB: false + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + tailLoopOptMXSA: false + tailLoopOptMXSB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AdaptiveGemm: 0 + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1DivByMT1LowbitGT1: 0 + AssertFree1ElementMultiple: 1 + AssertKRingShiftTailWrapOnly: 0 + AssertSummationElementMultiple: 256 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BAddrInterleave: false + BaseName: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT64yR3-UwlktRfU59ACgTfmpcg_Yk6f2Lb1knA4QU-BsnY= + BufferLoad: true + BufferStore: 1 + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 256 + DirectToLds: 1 + DirectToLdsA: true + DirectToLdsB: true + DirectToLdsMXSA: true + DirectToLdsMXSB: true + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprMXSA: false + DirectToVgprMXSB: false + DirectToVgprSparseMetadata: false + 
DtlPlusLdsBuf: 0 + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: false + ExpertSchedulingMode: 0 + ExtraLatencyForLR: 0 + ExtraMiLatencyLeft: -1 + ForceDisableShadowInit: false + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 32 + GlobalReadVectorWidthB: 32 + GlobalReadVectorWidthMXSA: 2 + GlobalReadVectorWidthMXSB: 4 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + KRingShift: false + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT64x128x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT2_4_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 256 + LSCB: 256 + LSCMXSA: 8 + LSCMXSB: 8 + LSPA: 32 + LSPB: 32 + LSPMXSA: 64 + LSPMXSB: 128 + LVCA: 8 + LVCB: 8 + LVCMXSA: 4 + LVCMXSB: 2 + LVPA: 1 + LVPB: 1 + LVPMXSA: 32 + LVPMXSB: 32 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsBlockSizePerPadMXSA: 0 + LdsBlockSizePerPadMXSB: 0 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 58880 + LdsInitCVgprs: false + LdsNumBytes: 58880 + 
LdsNumElementsAlignedA: 8192 + LdsNumElementsAlignedB: 16384 + LdsNumElementsAlignedMXSA: 512 + LdsNumElementsAlignedMXSB: 1024 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 32768 + LdsOffsetB: 9728 + LdsOffsetB_Blk: 42496 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMXSA: 8192 + LdsOffsetMXSA_Blk: 40960 + LdsOffsetMXSB: 8704 + LdsOffsetMXSB_Blk: 41472 + LdsOffsetMetadata: 9728 + LdsOffsetMetadata_Blk: 42496 + LdsPadA: 0 + LdsPadB: 0 + LdsPadMXSA: 0 + LdsPadMXSB: 0 + LdsPadMetadata: 0 + LocalReadVectorWidth: -1 + LocalReadVectorWidthA: 32 + LocalReadVectorWidthB: 32 + LocalReadVectorWidthMXSA: 1 + LocalReadVectorWidthMXSB: 1 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LocalWriteUseSgprMXSA: true + LocalWriteUseSgprMXSB: true + LoopIters: 2 + LoopUnroll: 256 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMXSA: 1 + MIInputPerThreadMXSB: 1 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [2, 4] + MIWaveTileA: 2 + MIWaveTileB: 4 + MIWaveTileMXSA: 2 + MIWaveTileMXSB: 4 + MIWaveTileMetadata: 0 + MacroTile0: 64 + MacroTile1: 128 + MacroTileA: 64 + MacroTileB: 128 + MacroTileMXSA: 64 + MacroTileMXSB: 128 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + MinGRIncPerMfma: 3 + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonDTLTailLoopMXSA: true + NonDTLTailLoopMXSB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 4 + NonTemporalC: 0 + NonTemporalD: 4 + NonTemporalE: 0 + 
NonTemporalMXSA: 0 + NonTemporalMXSB: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 32 + NumLdsBlk: 2 + NumLoadsA: 2 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsCoalescedMXSA: 1 + NumLoadsCoalescedMXSB: 1 + NumLoadsMXSA: 1 + NumLoadsMXSB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 4 + NumLoadsPerpendicularMXSA: 1 + NumLoadsPerpendicularMXSB: 1 + NumThreads: 256 + NumTotalPackedLoadsA: -1 + NumTotalPackedLoadsB: -1 + NumTotalPackedLoadsMXSA: -1 + NumTotalPackedLoadsMXSB: -1 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGROverBarrier: 0 + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 4 + SolutionNameMin: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT64x128x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT2_4_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM16_WGMXCC2_WGMXCCGn1 + SourceSwap: false + SpaceFillingAlgo: [] + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 4 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SubGroupMXSA: 8 + 
SubGroupMXSB: 32 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: 0 + TailloopInNll: false + ThreadTile: [1, 1] + ThreadTile0: 8 + ThreadTile1: 4 + ThreadTileA: 8 + ThreadTileB: 4 + ThreadTileMXSA: 8 + ThreadTileMXSB: 4 + TransposeLDS: 1 + TransposeLDSMetadata: -1 + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMXSA: true + UnrollMajorLDSMXSB: true + Use64bShadowLimit: false + Use64bShadowLimitMX: false + UseCustomMainLoopSchedule: 0 + UseDirect32XEmulation: false + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: false + UseGeneralizedNLCOneA: false + UseGeneralizedNLCOneB: false + UseGeneralizedNLCOneMXSA: false + UseGeneralizedNLCOneMXSB: false + UseGeneralizedNLCOneMetadata: false + UseInstOffsetForGRO: 0 + UseMFMAF32XEmulation: false + UsePLRPack: 0 + UseSgprForGRO: -1 + UseSubtileImpl: true + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 1 + VectorWidthMXSA: 1 + VectorWidthMXSB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMXSA: 0 + WaveSeparateGlobalReadMXSB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 16 + WorkGroupMappingXCC: 2 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 256 + _DepthUA: 256 + _DepthUB: 256 + _DepthUMXSA: 8 + _DepthUMXSB: 8 + _DepthUMetadata: 256 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + enableLDSTrMXSA: false + enableLDSTrMXSB: false + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + tailLoopOptMXSA: false + tailLoopOptMXSB: false + - 1LDSBuffer: 0 + ActivationAlt: false + 
ActivationFuncCall: false + ActivationFused: true + AdaptiveGemm: 0 + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1DivByMT1LowbitGT1: 0 + AssertFree1ElementMultiple: 1 + AssertKRingShiftTailWrapOnly: 0 + AssertSummationElementMultiple: 256 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BAddrInterleave: false + BaseName: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT12ZclQEHdRRO1sBJL3TwidM33yIt-WYnDtDgY9P9JYRX4= + BufferLoad: true + BufferStore: 1 + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 256 + DirectToLds: 1 + DirectToLdsA: true + DirectToLdsB: true + DirectToLdsMXSA: true + DirectToLdsMXSB: true + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprMXSA: false + DirectToVgprMXSB: false + DirectToVgprSparseMetadata: false + DtlPlusLdsBuf: 0 + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: false + ExpertSchedulingMode: 0 + ExtraLatencyForLR: 0 + ExtraMiLatencyLeft: -1 + ForceDisableShadowInit: false + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 32 + GlobalReadVectorWidthB: 32 + GlobalReadVectorWidthMXSA: 4 + GlobalReadVectorWidthMXSB: 2 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + KRingShift: false + Kernel: true + KernelLanguage: Assembly + KernelNameMin: 
Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT128x64x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT4_2_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 256 + LSCB: 256 + LSCMXSA: 8 + LSCMXSB: 8 + LSPA: 32 + LSPB: 32 + LSPMXSA: 128 + LSPMXSB: 64 + LVCA: 8 + LVCB: 8 + LVCMXSA: 2 + LVCMXSB: 4 + LVPA: 1 + LVPB: 1 + LVPMXSA: 32 + LVPMXSB: 32 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsBlockSizePerPadMXSA: 0 + LdsBlockSizePerPadMXSB: 0 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 58880 + LdsInitCVgprs: false + LdsNumBytes: 58880 + LdsNumElementsAlignedA: 16384 + LdsNumElementsAlignedB: 8192 + LdsNumElementsAlignedMXSA: 1024 + LdsNumElementsAlignedMXSB: 512 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 32768 + LdsOffsetB: 17920 + LdsOffsetB_Blk: 50688 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMXSA: 16384 + LdsOffsetMXSA_Blk: 49152 + LdsOffsetMXSB: 17408 + LdsOffsetMXSB_Blk: 50176 + LdsOffsetMetadata: 17920 + LdsOffsetMetadata_Blk: 50688 + LdsPadA: 0 + LdsPadB: 0 + LdsPadMXSA: 0 + LdsPadMXSB: 0 + LdsPadMetadata: 0 + LocalReadVectorWidth: -1 + LocalReadVectorWidthA: 32 + LocalReadVectorWidthB: 32 + LocalReadVectorWidthMXSA: 1 + LocalReadVectorWidthMXSB: 1 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LocalWriteUseSgprMXSA: true + LocalWriteUseSgprMXSB: true + LoopIters: 2 + LoopUnroll: 256 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + 
MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMXSA: 1 + MIInputPerThreadMXSB: 1 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 2] + MIWaveTileA: 4 + MIWaveTileB: 2 + MIWaveTileMXSA: 4 + MIWaveTileMXSB: 2 + MIWaveTileMetadata: 0 + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileMXSA: 128 + MacroTileMXSB: 64 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + MinGRIncPerMfma: 3 + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonDTLTailLoopMXSA: true + NonDTLTailLoopMXSB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 4 + NonTemporalC: 0 + NonTemporalD: 4 + NonTemporalE: 0 + NonTemporalMXSA: 0 + NonTemporalMXSB: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 32 + NumLdsBlk: 2 + NumLoadsA: 4 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsCoalescedMXSA: 1 + NumLoadsCoalescedMXSB: 1 + NumLoadsMXSA: 1 + NumLoadsMXSB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularMXSA: 1 + NumLoadsPerpendicularMXSB: 1 + NumThreads: 256 + NumTotalPackedLoadsA: -1 + NumTotalPackedLoadsB: -1 + NumTotalPackedLoadsMXSA: -1 + NumTotalPackedLoadsMXSB: -1 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGROverBarrier: 0 + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 5 + SolutionNameMin: 
Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT128x64x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT4_2_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM16_WGMXCC2_WGMXCCGn1 + SourceSwap: false + SpaceFillingAlgo: [] + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 4 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SubGroupMXSA: 8 + SubGroupMXSB: 32 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: 0 + TailloopInNll: false + ThreadTile: [1, 1] + ThreadTile0: 16 + ThreadTile1: 2 + ThreadTileA: 16 + ThreadTileB: 2 + ThreadTileMXSA: 16 + ThreadTileMXSB: 2 + TransposeLDS: 1 + TransposeLDSMetadata: -1 + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMXSA: true + UnrollMajorLDSMXSB: true + Use64bShadowLimit: false + Use64bShadowLimitMX: false + UseCustomMainLoopSchedule: 0 + UseDirect32XEmulation: false + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: false + UseGeneralizedNLCOneA: false + UseGeneralizedNLCOneB: false + UseGeneralizedNLCOneMXSA: false + UseGeneralizedNLCOneMXSB: false + UseGeneralizedNLCOneMetadata: false + UseInstOffsetForGRO: 0 + UseMFMAF32XEmulation: false + UsePLRPack: 0 + UseSgprForGRO: -1 + UseSubtileImpl: true + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 1 + 
VectorWidthMXSA: 1 + VectorWidthMXSB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMXSA: 0 + WaveSeparateGlobalReadMXSB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 16 + WorkGroupMappingXCC: 2 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 256 + _DepthUA: 256 + _DepthUB: 256 + _DepthUMXSA: 8 + _DepthUMXSB: 8 + _DepthUMetadata: 256 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + enableLDSTrMXSA: false + enableLDSTrMXSB: false + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + tailLoopOptMXSA: false + tailLoopOptMXSB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AdaptiveGemm: 0 + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1DivByMT1LowbitGT1: 0 + AssertFree1ElementMultiple: 1 + AssertKRingShiftTailWrapOnly: 0 + AssertSummationElementMultiple: 256 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BAddrInterleave: false + BaseName: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT64hJRTC6QzyytWkHaKB_tyuSMxvHfrHLHmDLGhNVVcILE= + BufferLoad: true + BufferStore: 1 + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 256 + DirectToLds: 1 + DirectToLdsA: true + DirectToLdsB: true + DirectToLdsMXSA: true + DirectToLdsMXSB: true + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprMXSA: false + DirectToVgprMXSB: false + DirectToVgprSparseMetadata: false + DtlPlusLdsBuf: 0 
+ EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: false + ExpertSchedulingMode: 0 + ExtraLatencyForLR: 0 + ExtraMiLatencyLeft: -1 + ForceDisableShadowInit: false + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 32 + GlobalReadVectorWidthB: 32 + GlobalReadVectorWidthMXSA: 2 + GlobalReadVectorWidthMXSB: 6 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + KRingShift: false + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT64x192x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT2_6_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 256 + LSCB: 256 + LSCMXSA: 8 + LSCMXSB: 8 + LSPA: 32 + LSPB: 32 + LSPMXSA: 64 + LSPMXSB: 192 + LVCA: 8 + LVCB: 8 + LVCMXSA: 4 + LVCMXSB: 2 + LVPA: 1 + LVPB: 1 + LVPMXSA: 32 + LVPMXSB: 32 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsBlockSizePerPadMXSA: 0 + LdsBlockSizePerPadMXSB: 0 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 100352 + LdsInitCVgprs: false + LdsNumBytes: 100352 + LdsNumElementsAlignedA: 8192 + 
LdsNumElementsAlignedB: 24576 + LdsNumElementsAlignedMXSA: 512 + LdsNumElementsAlignedMXSB: 1536 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 10240 + LdsOffsetB_Blk: 75776 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMXSA: 8192 + LdsOffsetMXSA_Blk: 73728 + LdsOffsetMXSB: 8704 + LdsOffsetMXSB_Blk: 74240 + LdsOffsetMetadata: 10240 + LdsOffsetMetadata_Blk: 75776 + LdsPadA: 0 + LdsPadB: 0 + LdsPadMXSA: 0 + LdsPadMXSB: 0 + LdsPadMetadata: 0 + LocalReadVectorWidth: -1 + LocalReadVectorWidthA: 32 + LocalReadVectorWidthB: 32 + LocalReadVectorWidthMXSA: 1 + LocalReadVectorWidthMXSB: 1 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LocalWriteUseSgprMXSA: true + LocalWriteUseSgprMXSB: true + LoopIters: 2 + LoopUnroll: 256 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMXSA: 1 + MIInputPerThreadMXSB: 1 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [2, 6] + MIWaveTileA: 2 + MIWaveTileB: 6 + MIWaveTileMXSA: 2 + MIWaveTileMXSB: 6 + MIWaveTileMetadata: 0 + MacroTile0: 64 + MacroTile1: 192 + MacroTileA: 64 + MacroTileB: 192 + MacroTileMXSA: 64 + MacroTileMXSB: 192 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + MinGRIncPerMfma: 3 + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonDTLTailLoopMXSA: true + NonDTLTailLoopMXSB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 4 + NonTemporalC: 0 + NonTemporalD: 4 + NonTemporalE: 0 + NonTemporalMXSA: 0 + 
NonTemporalMXSB: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 48 + NumGlobalWriteVectorsPerThread: 48 + NumLdsBlk: 2 + NumLoadsA: 2 + NumLoadsB: 6 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsCoalescedMXSA: 1 + NumLoadsCoalescedMXSB: 1 + NumLoadsMXSA: 1 + NumLoadsMXSB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 6 + NumLoadsPerpendicularMXSA: 1 + NumLoadsPerpendicularMXSB: 1 + NumThreads: 256 + NumTotalPackedLoadsA: -1 + NumTotalPackedLoadsB: -1 + NumTotalPackedLoadsMXSA: -1 + NumTotalPackedLoadsMXSB: -1 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGROverBarrier: 0 + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 6 + SolutionNameMin: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT64x192x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT2_6_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM16_WGMXCC2_WGMXCCGn1 + SourceSwap: false + SpaceFillingAlgo: [] + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 4 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SubGroupMXSA: 8 + SubGroupMXSB: 32 + 
SuppressNoLoadLoop: false + SwapGlobalReadOrder: 0 + TailloopInNll: false + ThreadTile: [1, 1] + ThreadTile0: 8 + ThreadTile1: 6 + ThreadTileA: 8 + ThreadTileB: 6 + ThreadTileMXSA: 8 + ThreadTileMXSB: 6 + TransposeLDS: 1 + TransposeLDSMetadata: -1 + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMXSA: true + UnrollMajorLDSMXSB: true + Use64bShadowLimit: false + Use64bShadowLimitMX: false + UseCustomMainLoopSchedule: 0 + UseDirect32XEmulation: false + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: false + UseGeneralizedNLCOneA: false + UseGeneralizedNLCOneB: false + UseGeneralizedNLCOneMXSA: false + UseGeneralizedNLCOneMXSB: false + UseGeneralizedNLCOneMetadata: false + UseInstOffsetForGRO: 0 + UseMFMAF32XEmulation: false + UsePLRPack: 0 + UseSgprForGRO: -1 + UseSubtileImpl: true + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 1 + VectorWidthMXSA: 1 + VectorWidthMXSB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMXSA: 0 + WaveSeparateGlobalReadMXSB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 16 + WorkGroupMappingXCC: 2 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 256 + _DepthUA: 256 + _DepthUB: 256 + _DepthUMXSA: 8 + _DepthUMXSB: 8 + _DepthUMetadata: 256 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + enableLDSTrMXSA: false + enableLDSTrMXSB: false + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + tailLoopOptMXSA: false + tailLoopOptMXSB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: false + 
ActivationFused: true + AdaptiveGemm: 0 + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1DivByMT1LowbitGT1: 0 + AssertFree1ElementMultiple: 1 + AssertKRingShiftTailWrapOnly: 0 + AssertSummationElementMultiple: 256 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BAddrInterleave: false + BaseName: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT19UCUD2vak2YLHvH_Ztg5zXPDJUlX3NzBJWRuGhvKVzu8= + BufferLoad: true + BufferStore: 1 + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 256 + DirectToLds: 1 + DirectToLdsA: true + DirectToLdsB: true + DirectToLdsMXSA: true + DirectToLdsMXSB: true + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprMXSA: false + DirectToVgprMXSB: false + DirectToVgprSparseMetadata: false + DtlPlusLdsBuf: 0 + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: false + ExpertSchedulingMode: 0 + ExtraLatencyForLR: 0 + ExtraMiLatencyLeft: -1 + ForceDisableShadowInit: false + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 32 + GlobalReadVectorWidthB: 32 + GlobalReadVectorWidthMXSA: 6 + GlobalReadVectorWidthMXSB: 2 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + KRingShift: false + Kernel: true + KernelLanguage: Assembly + KernelNameMin: 
Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT192x64x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT6_2_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 256 + LSCB: 256 + LSCMXSA: 8 + LSCMXSB: 8 + LSPA: 32 + LSPB: 32 + LSPMXSA: 192 + LSPMXSB: 64 + LVCA: 8 + LVCB: 8 + LVCMXSA: 2 + LVCMXSB: 4 + LVPA: 1 + LVPB: 1 + LVPMXSA: 32 + LVPMXSB: 32 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsBlockSizePerPadMXSA: 0 + LdsBlockSizePerPadMXSB: 0 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 100352 + LdsInitCVgprs: false + LdsNumBytes: 100352 + LdsNumElementsAlignedA: 24576 + LdsNumElementsAlignedB: 8192 + LdsNumElementsAlignedMXSA: 1536 + LdsNumElementsAlignedMXSB: 512 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 26624 + LdsOffsetB_Blk: 92160 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMXSA: 24576 + LdsOffsetMXSA_Blk: 90112 + LdsOffsetMXSB: 26112 + LdsOffsetMXSB_Blk: 91648 + LdsOffsetMetadata: 26624 + LdsOffsetMetadata_Blk: 92160 + LdsPadA: 0 + LdsPadB: 0 + LdsPadMXSA: 0 + LdsPadMXSB: 0 + LdsPadMetadata: 0 + LocalReadVectorWidth: -1 + LocalReadVectorWidthA: 32 + LocalReadVectorWidthB: 32 + LocalReadVectorWidthMXSA: 1 + LocalReadVectorWidthMXSB: 1 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LocalWriteUseSgprMXSA: true + LocalWriteUseSgprMXSB: true + LoopIters: 2 + LoopUnroll: 256 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + 
MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMXSA: 1 + MIInputPerThreadMXSB: 1 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [6, 2] + MIWaveTileA: 6 + MIWaveTileB: 2 + MIWaveTileMXSA: 6 + MIWaveTileMXSB: 2 + MIWaveTileMetadata: 0 + MacroTile0: 192 + MacroTile1: 64 + MacroTileA: 192 + MacroTileB: 64 + MacroTileMXSA: 192 + MacroTileMXSB: 64 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + MinGRIncPerMfma: 3 + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonDTLTailLoopMXSA: true + NonDTLTailLoopMXSB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 4 + NonTemporalC: 0 + NonTemporalD: 4 + NonTemporalE: 0 + NonTemporalMXSA: 0 + NonTemporalMXSB: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 48 + NumGlobalWriteVectorsPerThread: 48 + NumLdsBlk: 2 + NumLoadsA: 6 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsCoalescedMXSA: 1 + NumLoadsCoalescedMXSB: 1 + NumLoadsMXSA: 1 + NumLoadsMXSB: 1 + NumLoadsPerpendicularA: 6 + NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularMXSA: 1 + NumLoadsPerpendicularMXSB: 1 + NumThreads: 256 + NumTotalPackedLoadsA: -1 + NumTotalPackedLoadsB: -1 + NumTotalPackedLoadsMXSA: -1 + NumTotalPackedLoadsMXSB: -1 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGROverBarrier: 0 + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 7 + SolutionNameMin: 
Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT192x64x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT6_2_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM16_WGMXCC2_WGMXCCGn1 + SourceSwap: false + SpaceFillingAlgo: [] + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 4 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SubGroupMXSA: 8 + SubGroupMXSB: 32 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: 0 + TailloopInNll: false + ThreadTile: [1, 1] + ThreadTile0: 24 + ThreadTile1: 2 + ThreadTileA: 24 + ThreadTileB: 2 + ThreadTileMXSA: 24 + ThreadTileMXSB: 2 + TransposeLDS: 1 + TransposeLDSMetadata: -1 + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMXSA: true + UnrollMajorLDSMXSB: true + Use64bShadowLimit: false + Use64bShadowLimitMX: false + UseCustomMainLoopSchedule: 0 + UseDirect32XEmulation: false + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: false + UseGeneralizedNLCOneA: false + UseGeneralizedNLCOneB: false + UseGeneralizedNLCOneMXSA: false + UseGeneralizedNLCOneMXSB: false + UseGeneralizedNLCOneMetadata: false + UseInstOffsetForGRO: 0 + UseMFMAF32XEmulation: false + UsePLRPack: 0 + UseSgprForGRO: -1 + UseSubtileImpl: true + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 1 + 
VectorWidthMXSA: 1 + VectorWidthMXSB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMXSA: 0 + WaveSeparateGlobalReadMXSB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 16 + WorkGroupMappingXCC: 2 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 256 + _DepthUA: 256 + _DepthUB: 256 + _DepthUMXSA: 8 + _DepthUMXSB: 8 + _DepthUMetadata: 256 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + enableLDSTrMXSA: false + enableLDSTrMXSB: false + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + tailLoopOptMXSA: false + tailLoopOptMXSB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AdaptiveGemm: 0 + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1DivByMT1LowbitGT1: 0 + AssertFree1ElementMultiple: 1 + AssertKRingShiftTailWrapOnly: 0 + AssertSummationElementMultiple: 256 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BAddrInterleave: false + BaseName: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT64sCK37FdA-O3yvoTtVtg3QFjEpjbAv0RNmVuI_Ry968A= + BufferLoad: true + BufferStore: 1 + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 256 + DirectToLds: 1 + DirectToLdsA: true + DirectToLdsB: true + DirectToLdsMXSA: true + DirectToLdsMXSB: true + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprMXSA: false + DirectToVgprMXSB: false + DirectToVgprSparseMetadata: false + DtlPlusLdsBuf: 0 
+ EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: false + ExpertSchedulingMode: 0 + ExtraLatencyForLR: 0 + ExtraMiLatencyLeft: -1 + ForceDisableShadowInit: false + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 32 + GlobalReadVectorWidthB: 32 + GlobalReadVectorWidthMXSA: 2 + GlobalReadVectorWidthMXSB: 4 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + KRingShift: false + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT64x256x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT2_8_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 256 + LSCB: 256 + LSCMXSA: 8 + LSCMXSB: 8 + LSPA: 32 + LSPB: 32 + LSPMXSA: 64 + LSPMXSB: 128 + LVCA: 8 + LVCB: 8 + LVCMXSA: 4 + LVCMXSB: 2 + LVPA: 1 + LVPB: 1 + LVPMXSA: 32 + LVPMXSB: 32 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsBlockSizePerPadMXSA: 0 + LdsBlockSizePerPadMXSB: 0 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 109056 + LdsInitCVgprs: false + LdsNumBytes: 109056 + LdsNumElementsAlignedA: 8192 + 
LdsNumElementsAlignedB: 32768 + LdsNumElementsAlignedMXSA: 512 + LdsNumElementsAlignedMXSB: 2048 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 10752 + LdsOffsetB_Blk: 76288 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMXSA: 8192 + LdsOffsetMXSA_Blk: 73728 + LdsOffsetMXSB: 8704 + LdsOffsetMXSB_Blk: 74240 + LdsOffsetMetadata: 10752 + LdsOffsetMetadata_Blk: 76288 + LdsPadA: 0 + LdsPadB: 0 + LdsPadMXSA: 0 + LdsPadMXSB: 0 + LdsPadMetadata: 0 + LocalReadVectorWidth: -1 + LocalReadVectorWidthA: 32 + LocalReadVectorWidthB: 32 + LocalReadVectorWidthMXSA: 1 + LocalReadVectorWidthMXSB: 1 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LocalWriteUseSgprMXSA: true + LocalWriteUseSgprMXSB: true + LoopIters: 2 + LoopUnroll: 256 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMXSA: 1 + MIInputPerThreadMXSB: 1 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [2, 8] + MIWaveTileA: 2 + MIWaveTileB: 8 + MIWaveTileMXSA: 2 + MIWaveTileMXSB: 8 + MIWaveTileMetadata: 0 + MacroTile0: 64 + MacroTile1: 256 + MacroTileA: 64 + MacroTileB: 256 + MacroTileMXSA: 64 + MacroTileMXSB: 256 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + MinGRIncPerMfma: 3 + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonDTLTailLoopMXSA: true + NonDTLTailLoopMXSB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 4 + NonTemporalC: 0 + NonTemporalD: 4 + NonTemporalE: 0 + NonTemporalMXSA: 0 + 
NonTemporalMXSB: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 64 + NumLdsBlk: 2 + NumLoadsA: 2 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsCoalescedMXSA: 1 + NumLoadsCoalescedMXSB: 1 + NumLoadsMXSA: 1 + NumLoadsMXSB: 2 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 8 + NumLoadsPerpendicularMXSA: 1 + NumLoadsPerpendicularMXSB: 2 + NumThreads: 256 + NumTotalPackedLoadsA: -1 + NumTotalPackedLoadsB: -1 + NumTotalPackedLoadsMXSA: -1 + NumTotalPackedLoadsMXSB: -1 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGROverBarrier: 0 + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 8 + SolutionNameMin: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT64x256x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT2_8_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM16_WGMXCC2_WGMXCCGn1 + SourceSwap: false + SpaceFillingAlgo: [] + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 4 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SubGroupMXSA: 8 + SubGroupMXSB: 32 + 
SuppressNoLoadLoop: false + SwapGlobalReadOrder: 0 + TailloopInNll: false + ThreadTile: [1, 1] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + ThreadTileMXSA: 8 + ThreadTileMXSB: 8 + TransposeLDS: 1 + TransposeLDSMetadata: -1 + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMXSA: true + UnrollMajorLDSMXSB: true + Use64bShadowLimit: false + Use64bShadowLimitMX: false + UseCustomMainLoopSchedule: 0 + UseDirect32XEmulation: false + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: false + UseGeneralizedNLCOneA: false + UseGeneralizedNLCOneB: false + UseGeneralizedNLCOneMXSA: false + UseGeneralizedNLCOneMXSB: false + UseGeneralizedNLCOneMetadata: false + UseInstOffsetForGRO: 0 + UseMFMAF32XEmulation: false + UsePLRPack: 0 + UseSgprForGRO: -1 + UseSubtileImpl: true + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 1 + VectorWidthMXSA: 1 + VectorWidthMXSB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMXSA: 0 + WaveSeparateGlobalReadMXSB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 16 + WorkGroupMappingXCC: 2 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 256 + _DepthUA: 256 + _DepthUB: 256 + _DepthUMXSA: 8 + _DepthUMXSB: 8 + _DepthUMetadata: 256 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + enableLDSTrMXSA: false + enableLDSTrMXSB: false + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + tailLoopOptMXSA: false + tailLoopOptMXSB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: false + 
ActivationFused: true + AdaptiveGemm: 0 + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1DivByMT1LowbitGT1: 0 + AssertFree1ElementMultiple: 1 + AssertKRingShiftTailWrapOnly: 0 + AssertSummationElementMultiple: 256 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BAddrInterleave: false + BaseName: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT25Tmym63Cp1OVwmuklAuBYLzHtSt0oUezVbK-tmV8hqYs= + BufferLoad: true + BufferStore: 1 + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 256 + DirectToLds: 1 + DirectToLdsA: true + DirectToLdsB: true + DirectToLdsMXSA: true + DirectToLdsMXSB: true + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprMXSA: false + DirectToVgprMXSB: false + DirectToVgprSparseMetadata: false + DtlPlusLdsBuf: 0 + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: false + ExpertSchedulingMode: 0 + ExtraLatencyForLR: 0 + ExtraMiLatencyLeft: -1 + ForceDisableShadowInit: false + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 32 + GlobalReadVectorWidthB: 32 + GlobalReadVectorWidthMXSA: 4 + GlobalReadVectorWidthMXSB: 2 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + KRingShift: false + Kernel: true + KernelLanguage: Assembly + KernelNameMin: 
Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT256x64x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT8_2_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 256 + LSCB: 256 + LSCMXSA: 8 + LSCMXSB: 8 + LSPA: 32 + LSPB: 32 + LSPMXSA: 128 + LSPMXSB: 64 + LVCA: 8 + LVCB: 8 + LVCMXSA: 2 + LVCMXSB: 4 + LVPA: 1 + LVPB: 1 + LVPMXSA: 32 + LVPMXSB: 32 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsBlockSizePerPadMXSA: 0 + LdsBlockSizePerPadMXSB: 0 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 109056 + LdsInitCVgprs: false + LdsNumBytes: 109056 + LdsNumElementsAlignedA: 32768 + LdsNumElementsAlignedB: 8192 + LdsNumElementsAlignedMXSA: 2048 + LdsNumElementsAlignedMXSB: 512 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 35328 + LdsOffsetB_Blk: 100864 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMXSA: 32768 + LdsOffsetMXSA_Blk: 98304 + LdsOffsetMXSB: 34816 + LdsOffsetMXSB_Blk: 100352 + LdsOffsetMetadata: 35328 + LdsOffsetMetadata_Blk: 100864 + LdsPadA: 0 + LdsPadB: 0 + LdsPadMXSA: 0 + LdsPadMXSB: 0 + LdsPadMetadata: 0 + LocalReadVectorWidth: -1 + LocalReadVectorWidthA: 32 + LocalReadVectorWidthB: 32 + LocalReadVectorWidthMXSA: 1 + LocalReadVectorWidthMXSB: 1 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LocalWriteUseSgprMXSA: true + LocalWriteUseSgprMXSB: true + LoopIters: 2 + LoopUnroll: 256 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + 
MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMXSA: 1 + MIInputPerThreadMXSB: 1 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [8, 2] + MIWaveTileA: 8 + MIWaveTileB: 2 + MIWaveTileMXSA: 8 + MIWaveTileMXSB: 2 + MIWaveTileMetadata: 0 + MacroTile0: 256 + MacroTile1: 64 + MacroTileA: 256 + MacroTileB: 64 + MacroTileMXSA: 256 + MacroTileMXSB: 64 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + MinGRIncPerMfma: 3 + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonDTLTailLoopMXSA: true + NonDTLTailLoopMXSB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 4 + NonTemporalC: 0 + NonTemporalD: 4 + NonTemporalE: 0 + NonTemporalMXSA: 0 + NonTemporalMXSB: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 64 + NumLdsBlk: 2 + NumLoadsA: 8 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsCoalescedMXSA: 1 + NumLoadsCoalescedMXSB: 1 + NumLoadsMXSA: 2 + NumLoadsMXSB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularMXSA: 2 + NumLoadsPerpendicularMXSB: 1 + NumThreads: 256 + NumTotalPackedLoadsA: -1 + NumTotalPackedLoadsB: -1 + NumTotalPackedLoadsMXSA: -1 + NumTotalPackedLoadsMXSB: -1 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGROverBarrier: 0 + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 9 + SolutionNameMin: 
Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT256x64x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT8_2_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM16_WGMXCC2_WGMXCCGn1 + SourceSwap: false + SpaceFillingAlgo: [] + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 4 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SubGroupMXSA: 8 + SubGroupMXSB: 32 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: 0 + TailloopInNll: false + ThreadTile: [1, 1] + ThreadTile0: 32 + ThreadTile1: 2 + ThreadTileA: 32 + ThreadTileB: 2 + ThreadTileMXSA: 32 + ThreadTileMXSB: 2 + TransposeLDS: 1 + TransposeLDSMetadata: -1 + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMXSA: true + UnrollMajorLDSMXSB: true + Use64bShadowLimit: false + Use64bShadowLimitMX: false + UseCustomMainLoopSchedule: 0 + UseDirect32XEmulation: false + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: false + UseGeneralizedNLCOneA: false + UseGeneralizedNLCOneB: false + UseGeneralizedNLCOneMXSA: false + UseGeneralizedNLCOneMXSB: false + UseGeneralizedNLCOneMetadata: false + UseInstOffsetForGRO: 0 + UseMFMAF32XEmulation: false + UsePLRPack: 0 + UseSgprForGRO: -1 + UseSubtileImpl: true + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 1 + 
VectorWidthMXSA: 1 + VectorWidthMXSB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMXSA: 0 + WaveSeparateGlobalReadMXSB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 16 + WorkGroupMappingXCC: 2 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 256 + _DepthUA: 256 + _DepthUB: 256 + _DepthUMXSA: 8 + _DepthUMXSB: 8 + _DepthUMetadata: 256 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + enableLDSTrMXSA: false + enableLDSTrMXSB: false + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + tailLoopOptMXSA: false + tailLoopOptMXSB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AdaptiveGemm: 0 + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1DivByMT1LowbitGT1: 0 + AssertFree1ElementMultiple: 1 + AssertKRingShiftTailWrapOnly: 0 + AssertSummationElementMultiple: 256 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BAddrInterleave: false + BaseName: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT12Qs31lTSyFaUbDqXQHGpUViUgkCHLEWWfNifVlN3xPcA= + BufferLoad: true + BufferStore: 1 + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 256 + DirectToLds: 1 + DirectToLdsA: true + DirectToLdsB: true + DirectToLdsMXSA: true + DirectToLdsMXSB: true + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprMXSA: false + DirectToVgprMXSB: false + DirectToVgprSparseMetadata: false + DtlPlusLdsBuf: 0 
+ EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: false + ExpertSchedulingMode: 0 + ExtraLatencyForLR: 0 + ExtraMiLatencyLeft: -1 + ForceDisableShadowInit: false + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 32 + GlobalReadVectorWidthB: 32 + GlobalReadVectorWidthMXSA: 4 + GlobalReadVectorWidthMXSB: 6 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + KRingShift: false + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT128x192x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT4_6_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 256 + LSCB: 256 + LSCMXSA: 8 + LSCMXSB: 8 + LSPA: 32 + LSPB: 32 + LSPMXSA: 128 + LSPMXSB: 192 + LVCA: 8 + LVCB: 8 + LVCMXSA: 2 + LVCMXSB: 2 + LVPA: 1 + LVPB: 1 + LVPMXSA: 32 + LVPMXSB: 32 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsBlockSizePerPadMXSA: 0 + LdsBlockSizePerPadMXSB: 0 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 109056 + LdsInitCVgprs: false + LdsNumBytes: 109056 + LdsNumElementsAlignedA: 16384 + 
LdsNumElementsAlignedB: 24576 + LdsNumElementsAlignedMXSA: 1024 + LdsNumElementsAlignedMXSB: 1536 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 18944 + LdsOffsetB_Blk: 84480 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMXSA: 16384 + LdsOffsetMXSA_Blk: 81920 + LdsOffsetMXSB: 17408 + LdsOffsetMXSB_Blk: 82944 + LdsOffsetMetadata: 18944 + LdsOffsetMetadata_Blk: 84480 + LdsPadA: 0 + LdsPadB: 0 + LdsPadMXSA: 0 + LdsPadMXSB: 0 + LdsPadMetadata: 0 + LocalReadVectorWidth: -1 + LocalReadVectorWidthA: 32 + LocalReadVectorWidthB: 32 + LocalReadVectorWidthMXSA: 1 + LocalReadVectorWidthMXSB: 1 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LocalWriteUseSgprMXSA: true + LocalWriteUseSgprMXSB: true + LoopIters: 2 + LoopUnroll: 256 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMXSA: 1 + MIInputPerThreadMXSB: 1 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 6] + MIWaveTileA: 4 + MIWaveTileB: 6 + MIWaveTileMXSA: 4 + MIWaveTileMXSB: 6 + MIWaveTileMetadata: 0 + MacroTile0: 128 + MacroTile1: 192 + MacroTileA: 128 + MacroTileB: 192 + MacroTileMXSA: 128 + MacroTileMXSB: 192 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + MinGRIncPerMfma: 3 + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonDTLTailLoopMXSA: true + NonDTLTailLoopMXSB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 4 + NonTemporalC: 0 + NonTemporalD: 4 + NonTemporalE: 0 + NonTemporalMXSA: 0 + 
NonTemporalMXSB: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 96 + NumGlobalWriteVectorsPerThread: 96 + NumLdsBlk: 2 + NumLoadsA: 4 + NumLoadsB: 6 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsCoalescedMXSA: 1 + NumLoadsCoalescedMXSB: 1 + NumLoadsMXSA: 1 + NumLoadsMXSB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 6 + NumLoadsPerpendicularMXSA: 1 + NumLoadsPerpendicularMXSB: 1 + NumThreads: 256 + NumTotalPackedLoadsA: -1 + NumTotalPackedLoadsB: -1 + NumTotalPackedLoadsMXSA: -1 + NumTotalPackedLoadsMXSB: -1 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGROverBarrier: 0 + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 10 + SolutionNameMin: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT128x192x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT4_6_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM16_WGMXCC2_WGMXCCGn1 + SourceSwap: false + SpaceFillingAlgo: [] + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 4 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SubGroupMXSA: 8 + SubGroupMXSB: 32 + 
SuppressNoLoadLoop: false + SwapGlobalReadOrder: 0 + TailloopInNll: false + ThreadTile: [1, 1] + ThreadTile0: 16 + ThreadTile1: 6 + ThreadTileA: 16 + ThreadTileB: 6 + ThreadTileMXSA: 16 + ThreadTileMXSB: 6 + TransposeLDS: 1 + TransposeLDSMetadata: -1 + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMXSA: true + UnrollMajorLDSMXSB: true + Use64bShadowLimit: false + Use64bShadowLimitMX: false + UseCustomMainLoopSchedule: 0 + UseDirect32XEmulation: false + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: false + UseGeneralizedNLCOneA: false + UseGeneralizedNLCOneB: false + UseGeneralizedNLCOneMXSA: false + UseGeneralizedNLCOneMXSB: false + UseGeneralizedNLCOneMetadata: false + UseInstOffsetForGRO: 0 + UseMFMAF32XEmulation: false + UsePLRPack: 0 + UseSgprForGRO: -1 + UseSubtileImpl: true + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 1 + VectorWidthMXSA: 1 + VectorWidthMXSB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMXSA: 0 + WaveSeparateGlobalReadMXSB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 16 + WorkGroupMappingXCC: 2 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 256 + _DepthUA: 256 + _DepthUB: 256 + _DepthUMXSA: 8 + _DepthUMXSB: 8 + _DepthUMetadata: 256 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + enableLDSTrMXSA: false + enableLDSTrMXSB: false + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + tailLoopOptMXSA: false + tailLoopOptMXSB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: false 
+ ActivationFused: true + AdaptiveGemm: 0 + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1DivByMT1LowbitGT1: 0 + AssertFree1ElementMultiple: 1 + AssertKRingShiftTailWrapOnly: 0 + AssertSummationElementMultiple: 256 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BAddrInterleave: false + BaseName: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT191Q3CWl85SU-4agRfSbxDDRYJaTIDmYF_adG8G4wdfNo= + BufferLoad: true + BufferStore: 1 + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 256 + DirectToLds: 1 + DirectToLdsA: true + DirectToLdsB: true + DirectToLdsMXSA: true + DirectToLdsMXSB: true + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprMXSA: false + DirectToVgprMXSB: false + DirectToVgprSparseMetadata: false + DtlPlusLdsBuf: 0 + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: false + ExpertSchedulingMode: 0 + ExtraLatencyForLR: 0 + ExtraMiLatencyLeft: -1 + ForceDisableShadowInit: false + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 32 + GlobalReadVectorWidthB: 32 + GlobalReadVectorWidthMXSA: 6 + GlobalReadVectorWidthMXSB: 4 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + KRingShift: false + Kernel: true + KernelLanguage: Assembly + KernelNameMin: 
Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT192x128x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT6_4_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 256 + LSCB: 256 + LSCMXSA: 8 + LSCMXSB: 8 + LSPA: 32 + LSPB: 32 + LSPMXSA: 192 + LSPMXSB: 128 + LVCA: 8 + LVCB: 8 + LVCMXSA: 2 + LVCMXSB: 2 + LVPA: 1 + LVPB: 1 + LVPMXSA: 32 + LVPMXSB: 32 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsBlockSizePerPadMXSA: 0 + LdsBlockSizePerPadMXSB: 0 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 109056 + LdsInitCVgprs: false + LdsNumBytes: 109056 + LdsNumElementsAlignedA: 24576 + LdsNumElementsAlignedB: 16384 + LdsNumElementsAlignedMXSA: 1536 + LdsNumElementsAlignedMXSB: 1024 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 27136 + LdsOffsetB_Blk: 92672 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMXSA: 24576 + LdsOffsetMXSA_Blk: 90112 + LdsOffsetMXSB: 26112 + LdsOffsetMXSB_Blk: 91648 + LdsOffsetMetadata: 27136 + LdsOffsetMetadata_Blk: 92672 + LdsPadA: 0 + LdsPadB: 0 + LdsPadMXSA: 0 + LdsPadMXSB: 0 + LdsPadMetadata: 0 + LocalReadVectorWidth: -1 + LocalReadVectorWidthA: 32 + LocalReadVectorWidthB: 32 + LocalReadVectorWidthMXSA: 1 + LocalReadVectorWidthMXSB: 1 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LocalWriteUseSgprMXSA: true + LocalWriteUseSgprMXSB: true + LoopIters: 2 + LoopUnroll: 256 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + 
MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMXSA: 1 + MIInputPerThreadMXSB: 1 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [6, 4] + MIWaveTileA: 6 + MIWaveTileB: 4 + MIWaveTileMXSA: 6 + MIWaveTileMXSB: 4 + MIWaveTileMetadata: 0 + MacroTile0: 192 + MacroTile1: 128 + MacroTileA: 192 + MacroTileB: 128 + MacroTileMXSA: 192 + MacroTileMXSB: 128 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + MinGRIncPerMfma: 3 + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonDTLTailLoopMXSA: true + NonDTLTailLoopMXSB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 4 + NonTemporalC: 0 + NonTemporalD: 4 + NonTemporalE: 0 + NonTemporalMXSA: 0 + NonTemporalMXSB: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 96 + NumGlobalWriteVectorsPerThread: 96 + NumLdsBlk: 2 + NumLoadsA: 6 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsCoalescedMXSA: 1 + NumLoadsCoalescedMXSB: 1 + NumLoadsMXSA: 1 + NumLoadsMXSB: 1 + NumLoadsPerpendicularA: 6 + NumLoadsPerpendicularB: 4 + NumLoadsPerpendicularMXSA: 1 + NumLoadsPerpendicularMXSB: 1 + NumThreads: 256 + NumTotalPackedLoadsA: -1 + NumTotalPackedLoadsB: -1 + NumTotalPackedLoadsMXSA: -1 + NumTotalPackedLoadsMXSB: -1 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGROverBarrier: 0 + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 11 + 
SolutionNameMin: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT192x128x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT6_4_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM16_WGMXCC2_WGMXCCGn1 + SourceSwap: false + SpaceFillingAlgo: [] + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 4 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SubGroupMXSA: 8 + SubGroupMXSB: 32 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: 0 + TailloopInNll: false + ThreadTile: [1, 1] + ThreadTile0: 24 + ThreadTile1: 4 + ThreadTileA: 24 + ThreadTileB: 4 + ThreadTileMXSA: 24 + ThreadTileMXSB: 4 + TransposeLDS: 1 + TransposeLDSMetadata: -1 + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMXSA: true + UnrollMajorLDSMXSB: true + Use64bShadowLimit: false + Use64bShadowLimitMX: false + UseCustomMainLoopSchedule: 0 + UseDirect32XEmulation: false + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: false + UseGeneralizedNLCOneA: false + UseGeneralizedNLCOneB: false + UseGeneralizedNLCOneMXSA: false + UseGeneralizedNLCOneMXSB: false + UseGeneralizedNLCOneMetadata: false + UseInstOffsetForGRO: 0 + UseMFMAF32XEmulation: false + UsePLRPack: 0 + UseSgprForGRO: -1 + UseSubtileImpl: true + Valid: true + VectorStore: -1 + VectorWidthA: 1 + 
VectorWidthB: 1 + VectorWidthMXSA: 1 + VectorWidthMXSB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMXSA: 0 + WaveSeparateGlobalReadMXSB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 16 + WorkGroupMappingXCC: 2 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 256 + _DepthUA: 256 + _DepthUB: 256 + _DepthUMXSA: 8 + _DepthUMXSB: 8 + _DepthUMetadata: 256 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + enableLDSTrMXSA: false + enableLDSTrMXSB: false + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + tailLoopOptMXSA: false + tailLoopOptMXSB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AdaptiveGemm: 0 + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1DivByMT1LowbitGT1: 0 + AssertFree1ElementMultiple: 1 + AssertKRingShiftTailWrapOnly: 0 + AssertSummationElementMultiple: 256 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BAddrInterleave: false + BaseName: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT12kwdCHpnp9d41ufLMEwnWbNNXbmlKzUrCttuc0YSlgSY= + BufferLoad: true + BufferStore: 1 + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 256 + DirectToLds: 1 + DirectToLdsA: true + DirectToLdsB: true + DirectToLdsMXSA: true + DirectToLdsMXSB: true + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprMXSA: false + DirectToVgprMXSB: false + DirectToVgprSparseMetadata: false + 
DtlPlusLdsBuf: 0 + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: false + ExpertSchedulingMode: 0 + ExtraLatencyForLR: 0 + ExtraMiLatencyLeft: -1 + ForceDisableShadowInit: false + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 32 + GlobalReadVectorWidthB: 32 + GlobalReadVectorWidthMXSA: 4 + GlobalReadVectorWidthMXSB: 4 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + KRingShift: false + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT128x256x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT4_8_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 256 + LSCB: 256 + LSCMXSA: 8 + LSCMXSB: 8 + LSPA: 32 + LSPB: 32 + LSPMXSA: 128 + LSPMXSB: 128 + LVCA: 8 + LVCB: 8 + LVCMXSA: 2 + LVCMXSB: 2 + LVPA: 1 + LVPB: 1 + LVPMXSA: 32 + LVPMXSB: 32 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsBlockSizePerPadMXSA: 0 + LdsBlockSizePerPadMXSB: 0 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 117760 + LdsInitCVgprs: false + LdsNumBytes: 117760 + 
LdsNumElementsAlignedA: 16384 + LdsNumElementsAlignedB: 32768 + LdsNumElementsAlignedMXSA: 1024 + LdsNumElementsAlignedMXSB: 2048 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 19456 + LdsOffsetB_Blk: 84992 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMXSA: 16384 + LdsOffsetMXSA_Blk: 81920 + LdsOffsetMXSB: 17408 + LdsOffsetMXSB_Blk: 82944 + LdsOffsetMetadata: 19456 + LdsOffsetMetadata_Blk: 84992 + LdsPadA: 0 + LdsPadB: 0 + LdsPadMXSA: 0 + LdsPadMXSB: 0 + LdsPadMetadata: 0 + LocalReadVectorWidth: -1 + LocalReadVectorWidthA: 32 + LocalReadVectorWidthB: 32 + LocalReadVectorWidthMXSA: 1 + LocalReadVectorWidthMXSB: 1 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LocalWriteUseSgprMXSA: true + LocalWriteUseSgprMXSB: true + LoopIters: 2 + LoopUnroll: 256 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMXSA: 1 + MIInputPerThreadMXSB: 1 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 8] + MIWaveTileA: 4 + MIWaveTileB: 8 + MIWaveTileMXSA: 4 + MIWaveTileMXSB: 8 + MIWaveTileMetadata: 0 + MacroTile0: 128 + MacroTile1: 256 + MacroTileA: 128 + MacroTileB: 256 + MacroTileMXSA: 128 + MacroTileMXSB: 256 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + MinGRIncPerMfma: 3 + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonDTLTailLoopMXSA: true + NonDTLTailLoopMXSB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 4 + NonTemporalC: 0 + NonTemporalD: 4 + NonTemporalE: 
0 + NonTemporalMXSA: 0 + NonTemporalMXSB: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 128 + NumLdsBlk: 2 + NumLoadsA: 4 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsCoalescedMXSA: 1 + NumLoadsCoalescedMXSB: 1 + NumLoadsMXSA: 1 + NumLoadsMXSB: 2 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 8 + NumLoadsPerpendicularMXSA: 1 + NumLoadsPerpendicularMXSB: 2 + NumThreads: 256 + NumTotalPackedLoadsA: -1 + NumTotalPackedLoadsB: -1 + NumTotalPackedLoadsMXSA: -1 + NumTotalPackedLoadsMXSB: -1 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGROverBarrier: 0 + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 12 + SolutionNameMin: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT128x256x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT4_8_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM16_WGMXCC2_WGMXCCGn1 + SourceSwap: false + SpaceFillingAlgo: [] + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 4 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + 
SubGroupMXSA: 8 + SubGroupMXSB: 32 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: 0 + TailloopInNll: false + ThreadTile: [1, 1] + ThreadTile0: 16 + ThreadTile1: 8 + ThreadTileA: 16 + ThreadTileB: 8 + ThreadTileMXSA: 16 + ThreadTileMXSB: 8 + TransposeLDS: 1 + TransposeLDSMetadata: -1 + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMXSA: true + UnrollMajorLDSMXSB: true + Use64bShadowLimit: false + Use64bShadowLimitMX: false + UseCustomMainLoopSchedule: 0 + UseDirect32XEmulation: false + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: false + UseGeneralizedNLCOneA: false + UseGeneralizedNLCOneB: false + UseGeneralizedNLCOneMXSA: false + UseGeneralizedNLCOneMXSB: false + UseGeneralizedNLCOneMetadata: false + UseInstOffsetForGRO: 0 + UseMFMAF32XEmulation: false + UsePLRPack: 0 + UseSgprForGRO: -1 + UseSubtileImpl: true + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 1 + VectorWidthMXSA: 1 + VectorWidthMXSB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMXSA: 0 + WaveSeparateGlobalReadMXSB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 16 + WorkGroupMappingXCC: 2 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 256 + _DepthUA: 256 + _DepthUB: 256 + _DepthUMXSA: 8 + _DepthUMXSB: 8 + _DepthUMetadata: 256 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + enableLDSTrMXSA: false + enableLDSTrMXSB: false + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + tailLoopOptMXSA: false + tailLoopOptMXSB: false + - 1LDSBuffer: 0 + 
ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AdaptiveGemm: 0 + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1DivByMT1LowbitGT1: 0 + AssertFree1ElementMultiple: 1 + AssertKRingShiftTailWrapOnly: 0 + AssertSummationElementMultiple: 256 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BAddrInterleave: false + BaseName: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT25s2SqPqjqQTrutnl4Fjsck3utFxYLnrt2GxRGoMabbq0= + BufferLoad: true + BufferStore: 1 + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 256 + DirectToLds: 1 + DirectToLdsA: true + DirectToLdsB: true + DirectToLdsMXSA: true + DirectToLdsMXSB: true + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprMXSA: false + DirectToVgprMXSB: false + DirectToVgprSparseMetadata: false + DtlPlusLdsBuf: 0 + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: false + ExpertSchedulingMode: 0 + ExtraLatencyForLR: 0 + ExtraMiLatencyLeft: -1 + ForceDisableShadowInit: false + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 32 + GlobalReadVectorWidthB: 32 + GlobalReadVectorWidthMXSA: 4 + GlobalReadVectorWidthMXSB: 4 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + KRingShift: false + Kernel: true + KernelLanguage: Assembly + KernelNameMin: 
Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT256x128x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT8_4_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 256 + LSCB: 256 + LSCMXSA: 8 + LSCMXSB: 8 + LSPA: 32 + LSPB: 32 + LSPMXSA: 128 + LSPMXSB: 128 + LVCA: 8 + LVCB: 8 + LVCMXSA: 2 + LVCMXSB: 2 + LVPA: 1 + LVPB: 1 + LVPMXSA: 32 + LVPMXSB: 32 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsBlockSizePerPadMXSA: 0 + LdsBlockSizePerPadMXSB: 0 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 117760 + LdsInitCVgprs: false + LdsNumBytes: 117760 + LdsNumElementsAlignedA: 32768 + LdsNumElementsAlignedB: 16384 + LdsNumElementsAlignedMXSA: 2048 + LdsNumElementsAlignedMXSB: 1024 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 35840 + LdsOffsetB_Blk: 101376 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMXSA: 32768 + LdsOffsetMXSA_Blk: 98304 + LdsOffsetMXSB: 34816 + LdsOffsetMXSB_Blk: 100352 + LdsOffsetMetadata: 35840 + LdsOffsetMetadata_Blk: 101376 + LdsPadA: 0 + LdsPadB: 0 + LdsPadMXSA: 0 + LdsPadMXSB: 0 + LdsPadMetadata: 0 + LocalReadVectorWidth: -1 + LocalReadVectorWidthA: 32 + LocalReadVectorWidthB: 32 + LocalReadVectorWidthMXSA: 1 + LocalReadVectorWidthMXSB: 1 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LocalWriteUseSgprMXSA: true + LocalWriteUseSgprMXSB: true + LoopIters: 2 + LoopUnroll: 256 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + 
MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMXSA: 1 + MIInputPerThreadMXSB: 1 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [8, 4] + MIWaveTileA: 8 + MIWaveTileB: 4 + MIWaveTileMXSA: 8 + MIWaveTileMXSB: 4 + MIWaveTileMetadata: 0 + MacroTile0: 256 + MacroTile1: 128 + MacroTileA: 256 + MacroTileB: 128 + MacroTileMXSA: 256 + MacroTileMXSB: 128 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + MinGRIncPerMfma: 3 + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonDTLTailLoopMXSA: true + NonDTLTailLoopMXSB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 4 + NonTemporalC: 0 + NonTemporalD: 4 + NonTemporalE: 0 + NonTemporalMXSA: 0 + NonTemporalMXSB: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 128 + NumLdsBlk: 2 + NumLoadsA: 8 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsCoalescedMXSA: 1 + NumLoadsCoalescedMXSB: 1 + NumLoadsMXSA: 2 + NumLoadsMXSB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 4 + NumLoadsPerpendicularMXSA: 2 + NumLoadsPerpendicularMXSB: 1 + NumThreads: 256 + NumTotalPackedLoadsA: -1 + NumTotalPackedLoadsB: -1 + NumTotalPackedLoadsMXSA: -1 + NumTotalPackedLoadsMXSB: -1 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGROverBarrier: 0 + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 13 + 
SolutionNameMin: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT256x128x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT8_4_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM16_WGMXCC2_WGMXCCGn1 + SourceSwap: false + SpaceFillingAlgo: [] + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 4 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SubGroupMXSA: 8 + SubGroupMXSB: 32 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: 0 + TailloopInNll: false + ThreadTile: [1, 1] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + ThreadTileMXSA: 32 + ThreadTileMXSB: 4 + TransposeLDS: 1 + TransposeLDSMetadata: -1 + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMXSA: true + UnrollMajorLDSMXSB: true + Use64bShadowLimit: false + Use64bShadowLimitMX: false + UseCustomMainLoopSchedule: 0 + UseDirect32XEmulation: false + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: false + UseGeneralizedNLCOneA: false + UseGeneralizedNLCOneB: false + UseGeneralizedNLCOneMXSA: false + UseGeneralizedNLCOneMXSB: false + UseGeneralizedNLCOneMetadata: false + UseInstOffsetForGRO: 0 + UseMFMAF32XEmulation: false + UsePLRPack: 0 + UseSgprForGRO: -1 + UseSubtileImpl: true + Valid: true + VectorStore: -1 + VectorWidthA: 1 + 
VectorWidthB: 1 + VectorWidthMXSA: 1 + VectorWidthMXSB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMXSA: 0 + WaveSeparateGlobalReadMXSB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 16 + WorkGroupMappingXCC: 2 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 256 + _DepthUA: 256 + _DepthUB: 256 + _DepthUMXSA: 8 + _DepthUMXSB: 8 + _DepthUMetadata: 256 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + enableLDSTrMXSA: false + enableLDSTrMXSB: false + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + tailLoopOptMXSA: false + tailLoopOptMXSB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AdaptiveGemm: 0 + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1DivByMT1LowbitGT1: 0 + AssertFree1ElementMultiple: 1 + AssertKRingShiftTailWrapOnly: 0 + AssertSummationElementMultiple: 256 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BAddrInterleave: false + BaseName: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT19--ULatdGkaEyMxugPoO0m7KB92W8g0H0S_IE47lq-mE= + BufferLoad: true + BufferStore: 1 + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 256 + DirectToLds: 1 + DirectToLdsA: true + DirectToLdsB: true + DirectToLdsMXSA: true + DirectToLdsMXSB: true + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprMXSA: false + DirectToVgprMXSB: false + DirectToVgprSparseMetadata: false + 
DtlPlusLdsBuf: 0 + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: false + ExpertSchedulingMode: 0 + ExtraLatencyForLR: 0 + ExtraMiLatencyLeft: -1 + ForceDisableShadowInit: false + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 32 + GlobalReadVectorWidthB: 32 + GlobalReadVectorWidthMXSA: 6 + GlobalReadVectorWidthMXSB: 4 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + KRingShift: false + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT192x256x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT6_8_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 256 + LSCB: 256 + LSCMXSA: 8 + LSCMXSB: 8 + LSPA: 32 + LSPB: 32 + LSPMXSA: 192 + LSPMXSB: 128 + LVCA: 8 + LVCB: 8 + LVCMXSA: 2 + LVCMXSB: 2 + LVPA: 1 + LVPB: 1 + LVPMXSA: 32 + LVPMXSB: 32 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsBlockSizePerPadMXSA: 0 + LdsBlockSizePerPadMXSB: 0 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 126464 + LdsInitCVgprs: false + LdsNumBytes: 126464 + 
LdsNumElementsAlignedA: 24576 + LdsNumElementsAlignedB: 32768 + LdsNumElementsAlignedMXSA: 1536 + LdsNumElementsAlignedMXSB: 2048 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 28160 + LdsOffsetB_Blk: 93696 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMXSA: 24576 + LdsOffsetMXSA_Blk: 90112 + LdsOffsetMXSB: 26112 + LdsOffsetMXSB_Blk: 91648 + LdsOffsetMetadata: 28160 + LdsOffsetMetadata_Blk: 93696 + LdsPadA: 0 + LdsPadB: 0 + LdsPadMXSA: 0 + LdsPadMXSB: 0 + LdsPadMetadata: 0 + LocalReadVectorWidth: -1 + LocalReadVectorWidthA: 32 + LocalReadVectorWidthB: 32 + LocalReadVectorWidthMXSA: 1 + LocalReadVectorWidthMXSB: 1 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LocalWriteUseSgprMXSA: true + LocalWriteUseSgprMXSB: true + LoopIters: 2 + LoopUnroll: 256 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMXSA: 1 + MIInputPerThreadMXSB: 1 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [6, 8] + MIWaveTileA: 6 + MIWaveTileB: 8 + MIWaveTileMXSA: 6 + MIWaveTileMXSB: 8 + MIWaveTileMetadata: 0 + MacroTile0: 192 + MacroTile1: 256 + MacroTileA: 192 + MacroTileB: 256 + MacroTileMXSA: 192 + MacroTileMXSB: 256 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + MinGRIncPerMfma: 3 + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonDTLTailLoopMXSA: true + NonDTLTailLoopMXSB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 4 + NonTemporalC: 0 + NonTemporalD: 4 + NonTemporalE: 
0 + NonTemporalMXSA: 0 + NonTemporalMXSB: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 192 + NumGlobalWriteVectorsPerThread: 192 + NumLdsBlk: 2 + NumLoadsA: 6 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsCoalescedMXSA: 1 + NumLoadsCoalescedMXSB: 1 + NumLoadsMXSA: 1 + NumLoadsMXSB: 2 + NumLoadsPerpendicularA: 6 + NumLoadsPerpendicularB: 8 + NumLoadsPerpendicularMXSA: 1 + NumLoadsPerpendicularMXSB: 2 + NumThreads: 256 + NumTotalPackedLoadsA: -1 + NumTotalPackedLoadsB: -1 + NumTotalPackedLoadsMXSA: -1 + NumTotalPackedLoadsMXSB: -1 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGROverBarrier: 0 + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 14 + SolutionNameMin: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT192x256x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT6_8_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM16_WGMXCC2_WGMXCCGn1 + SourceSwap: false + SpaceFillingAlgo: [] + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 4 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + 
SubGroupMXSA: 8 + SubGroupMXSB: 32 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: 0 + TailloopInNll: false + ThreadTile: [1, 1] + ThreadTile0: 24 + ThreadTile1: 8 + ThreadTileA: 24 + ThreadTileB: 8 + ThreadTileMXSA: 24 + ThreadTileMXSB: 8 + TransposeLDS: 1 + TransposeLDSMetadata: -1 + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMXSA: true + UnrollMajorLDSMXSB: true + Use64bShadowLimit: false + Use64bShadowLimitMX: false + UseCustomMainLoopSchedule: 0 + UseDirect32XEmulation: false + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: false + UseGeneralizedNLCOneA: false + UseGeneralizedNLCOneB: false + UseGeneralizedNLCOneMXSA: false + UseGeneralizedNLCOneMXSB: false + UseGeneralizedNLCOneMetadata: false + UseInstOffsetForGRO: 0 + UseMFMAF32XEmulation: false + UsePLRPack: 0 + UseSgprForGRO: -1 + UseSubtileImpl: true + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 1 + VectorWidthMXSA: 1 + VectorWidthMXSB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMXSA: 0 + WaveSeparateGlobalReadMXSB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 16 + WorkGroupMappingXCC: 2 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 256 + _DepthUA: 256 + _DepthUB: 256 + _DepthUMXSA: 8 + _DepthUMXSB: 8 + _DepthUMetadata: 256 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + enableLDSTrMXSA: false + enableLDSTrMXSB: false + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + tailLoopOptMXSA: false + tailLoopOptMXSB: false + - 1LDSBuffer: 0 + 
ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AdaptiveGemm: 0 + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1DivByMT1LowbitGT1: 0 + AssertFree1ElementMultiple: 1 + AssertKRingShiftTailWrapOnly: 0 + AssertSummationElementMultiple: 256 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BAddrInterleave: false + BaseName: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT25H-6yQcCGxSKwn5gmGJAHA93oK-ekIRJm5JqDOya5ASk= + BufferLoad: true + BufferStore: 1 + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 256 + DirectToLds: 1 + DirectToLdsA: true + DirectToLdsB: true + DirectToLdsMXSA: true + DirectToLdsMXSB: true + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprMXSA: false + DirectToVgprMXSB: false + DirectToVgprSparseMetadata: false + DtlPlusLdsBuf: 0 + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: false + ExpertSchedulingMode: 0 + ExtraLatencyForLR: 0 + ExtraMiLatencyLeft: -1 + ForceDisableShadowInit: false + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 32 + GlobalReadVectorWidthB: 32 + GlobalReadVectorWidthMXSA: 4 + GlobalReadVectorWidthMXSB: 6 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + KRingShift: false + Kernel: true + KernelLanguage: Assembly + KernelNameMin: 
Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT256x192x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT8_6_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 256 + LSCB: 256 + LSCMXSA: 8 + LSCMXSB: 8 + LSPA: 32 + LSPB: 32 + LSPMXSA: 128 + LSPMXSB: 192 + LVCA: 8 + LVCB: 8 + LVCMXSA: 2 + LVCMXSB: 2 + LVPA: 1 + LVPB: 1 + LVPMXSA: 32 + LVPMXSB: 32 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsBlockSizePerPadMXSA: 0 + LdsBlockSizePerPadMXSB: 0 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 126464 + LdsInitCVgprs: false + LdsNumBytes: 126464 + LdsNumElementsAlignedA: 32768 + LdsNumElementsAlignedB: 24576 + LdsNumElementsAlignedMXSA: 2048 + LdsNumElementsAlignedMXSB: 1536 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 36352 + LdsOffsetB_Blk: 101888 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMXSA: 32768 + LdsOffsetMXSA_Blk: 98304 + LdsOffsetMXSB: 34816 + LdsOffsetMXSB_Blk: 100352 + LdsOffsetMetadata: 36352 + LdsOffsetMetadata_Blk: 101888 + LdsPadA: 0 + LdsPadB: 0 + LdsPadMXSA: 0 + LdsPadMXSB: 0 + LdsPadMetadata: 0 + LocalReadVectorWidth: -1 + LocalReadVectorWidthA: 32 + LocalReadVectorWidthB: 32 + LocalReadVectorWidthMXSA: 1 + LocalReadVectorWidthMXSB: 1 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LocalWriteUseSgprMXSA: true + LocalWriteUseSgprMXSB: true + LoopIters: 2 + LoopUnroll: 256 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + 
MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMXSA: 1 + MIInputPerThreadMXSB: 1 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [8, 6] + MIWaveTileA: 8 + MIWaveTileB: 6 + MIWaveTileMXSA: 8 + MIWaveTileMXSB: 6 + MIWaveTileMetadata: 0 + MacroTile0: 256 + MacroTile1: 192 + MacroTileA: 256 + MacroTileB: 192 + MacroTileMXSA: 256 + MacroTileMXSB: 192 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + MinGRIncPerMfma: 3 + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonDTLTailLoopMXSA: true + NonDTLTailLoopMXSB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 4 + NonTemporalC: 0 + NonTemporalD: 4 + NonTemporalE: 0 + NonTemporalMXSA: 0 + NonTemporalMXSB: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 192 + NumGlobalWriteVectorsPerThread: 192 + NumLdsBlk: 2 + NumLoadsA: 8 + NumLoadsB: 6 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsCoalescedMXSA: 1 + NumLoadsCoalescedMXSB: 1 + NumLoadsMXSA: 2 + NumLoadsMXSB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 6 + NumLoadsPerpendicularMXSA: 2 + NumLoadsPerpendicularMXSB: 1 + NumThreads: 256 + NumTotalPackedLoadsA: -1 + NumTotalPackedLoadsB: -1 + NumTotalPackedLoadsMXSA: -1 + NumTotalPackedLoadsMXSB: -1 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGROverBarrier: 0 + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 15 + 
SolutionNameMin: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT256x192x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT8_6_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM16_WGMXCC2_WGMXCCGn1 + SourceSwap: false + SpaceFillingAlgo: [] + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 4 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SubGroupMXSA: 8 + SubGroupMXSB: 32 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: 0 + TailloopInNll: false + ThreadTile: [1, 1] + ThreadTile0: 32 + ThreadTile1: 6 + ThreadTileA: 32 + ThreadTileB: 6 + ThreadTileMXSA: 32 + ThreadTileMXSB: 6 + TransposeLDS: 1 + TransposeLDSMetadata: -1 + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMXSA: true + UnrollMajorLDSMXSB: true + Use64bShadowLimit: false + Use64bShadowLimitMX: false + UseCustomMainLoopSchedule: 0 + UseDirect32XEmulation: false + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: false + UseGeneralizedNLCOneA: false + UseGeneralizedNLCOneB: false + UseGeneralizedNLCOneMXSA: false + UseGeneralizedNLCOneMXSB: false + UseGeneralizedNLCOneMetadata: false + UseInstOffsetForGRO: 0 + UseMFMAF32XEmulation: false + UsePLRPack: 0 + UseSgprForGRO: -1 + UseSubtileImpl: true + Valid: true + VectorStore: -1 + VectorWidthA: 1 + 
VectorWidthB: 1 + VectorWidthMXSA: 1 + VectorWidthMXSB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMXSA: 0 + WaveSeparateGlobalReadMXSB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 16 + WorkGroupMappingXCC: 2 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 256 + _DepthUA: 256 + _DepthUB: 256 + _DepthUMXSA: 8 + _DepthUMXSB: 8 + _DepthUMetadata: 256 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + enableLDSTrMXSA: false + enableLDSTrMXSB: false + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + tailLoopOptMXSA: false + tailLoopOptMXSB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AdaptiveGemm: 0 + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1DivByMT1LowbitGT1: 0 + AssertFree1ElementMultiple: 1 + AssertKRingShiftTailWrapOnly: 0 + AssertSummationElementMultiple: 256 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BAddrInterleave: false + BaseName: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT649Xx58paO0aNamHRyhpgRWxjDG0kBNJ5sAnNiMy2USyI= + BufferLoad: true + BufferStore: 1 + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 256 + DirectToLds: 1 + DirectToLdsA: true + DirectToLdsB: true + DirectToLdsMXSA: true + DirectToLdsMXSB: true + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprMXSA: false + DirectToVgprMXSB: false + DirectToVgprSparseMetadata: false + 
DtlPlusLdsBuf: 0 + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: false + ExpertSchedulingMode: 0 + ExtraLatencyForLR: 0 + ExtraMiLatencyLeft: -1 + ForceDisableShadowInit: false + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 32 + GlobalReadVectorWidthB: 32 + GlobalReadVectorWidthMXSA: 2 + GlobalReadVectorWidthMXSB: 10 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + KRingShift: false + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT64x320x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT2_10_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 256 + LSCB: 256 + LSCMXSA: 8 + LSCMXSB: 8 + LSPA: 32 + LSPB: 32 + LSPMXSA: 64 + LSPMXSB: 320 + LVCA: 8 + LVCB: 8 + LVCMXSA: 4 + LVCMXSB: 1 + LVPA: 1 + LVPB: 1 + LVPMXSA: 32 + LVPMXSB: 32 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsBlockSizePerPadMXSA: 0 + LdsBlockSizePerPadMXSB: 0 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 117760 + LdsInitCVgprs: false + LdsNumBytes: 117760 + 
LdsNumElementsAlignedA: 8192 + LdsNumElementsAlignedB: 40960 + LdsNumElementsAlignedMXSA: 512 + LdsNumElementsAlignedMXSB: 2560 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 11264 + LdsOffsetB_Blk: 76800 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMXSA: 8192 + LdsOffsetMXSA_Blk: 73728 + LdsOffsetMXSB: 8704 + LdsOffsetMXSB_Blk: 74240 + LdsOffsetMetadata: 11264 + LdsOffsetMetadata_Blk: 76800 + LdsPadA: 0 + LdsPadB: 0 + LdsPadMXSA: 0 + LdsPadMXSB: 0 + LdsPadMetadata: 0 + LocalReadVectorWidth: -1 + LocalReadVectorWidthA: 32 + LocalReadVectorWidthB: 32 + LocalReadVectorWidthMXSA: 1 + LocalReadVectorWidthMXSB: 1 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LocalWriteUseSgprMXSA: true + LocalWriteUseSgprMXSB: true + LoopIters: 2 + LoopUnroll: 256 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMXSA: 1 + MIInputPerThreadMXSB: 1 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [2, 10] + MIWaveTileA: 2 + MIWaveTileB: 10 + MIWaveTileMXSA: 2 + MIWaveTileMXSB: 10 + MIWaveTileMetadata: 0 + MacroTile0: 64 + MacroTile1: 320 + MacroTileA: 64 + MacroTileB: 320 + MacroTileMXSA: 64 + MacroTileMXSB: 320 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + MinGRIncPerMfma: 3 + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonDTLTailLoopMXSA: true + NonDTLTailLoopMXSB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 4 + NonTemporalC: 0 + NonTemporalD: 4 + NonTemporalE: 0 + 
NonTemporalMXSA: 0 + NonTemporalMXSB: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 80 + NumGlobalWriteVectorsPerThread: 80 + NumLdsBlk: 2 + NumLoadsA: 2 + NumLoadsB: 10 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsCoalescedMXSA: 1 + NumLoadsCoalescedMXSB: 1 + NumLoadsMXSA: 1 + NumLoadsMXSB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 10 + NumLoadsPerpendicularMXSA: 1 + NumLoadsPerpendicularMXSB: 1 + NumThreads: 256 + NumTotalPackedLoadsA: -1 + NumTotalPackedLoadsB: -1 + NumTotalPackedLoadsMXSA: -1 + NumTotalPackedLoadsMXSB: -1 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGROverBarrier: 0 + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 16 + SolutionNameMin: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT64x320x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT2_10_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM16_WGMXCC2_WGMXCCGn1 + SourceSwap: false + SpaceFillingAlgo: [] + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 4 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SubGroupMXSA: 
8 + SubGroupMXSB: 32 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: 0 + TailloopInNll: false + ThreadTile: [1, 1] + ThreadTile0: 8 + ThreadTile1: 10 + ThreadTileA: 8 + ThreadTileB: 10 + ThreadTileMXSA: 8 + ThreadTileMXSB: 10 + TransposeLDS: 1 + TransposeLDSMetadata: -1 + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMXSA: true + UnrollMajorLDSMXSB: true + Use64bShadowLimit: false + Use64bShadowLimitMX: false + UseCustomMainLoopSchedule: 0 + UseDirect32XEmulation: false + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: false + UseGeneralizedNLCOneA: false + UseGeneralizedNLCOneB: false + UseGeneralizedNLCOneMXSA: false + UseGeneralizedNLCOneMXSB: false + UseGeneralizedNLCOneMetadata: false + UseInstOffsetForGRO: 0 + UseMFMAF32XEmulation: false + UsePLRPack: 0 + UseSgprForGRO: -1 + UseSubtileImpl: true + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 1 + VectorWidthMXSA: 1 + VectorWidthMXSB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMXSA: 0 + WaveSeparateGlobalReadMXSB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 16 + WorkGroupMappingXCC: 2 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 256 + _DepthUA: 256 + _DepthUB: 256 + _DepthUMXSA: 8 + _DepthUMXSB: 8 + _DepthUMetadata: 256 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + enableLDSTrMXSA: false + enableLDSTrMXSB: false + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + tailLoopOptMXSA: false + tailLoopOptMXSB: false + - 1LDSBuffer: 0 + ActivationAlt: false + 
ActivationFuncCall: false + ActivationFused: true + AdaptiveGemm: 0 + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1DivByMT1LowbitGT1: 0 + AssertFree1ElementMultiple: 1 + AssertKRingShiftTailWrapOnly: 0 + AssertSummationElementMultiple: 256 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BAddrInterleave: false + BaseName: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT32yn2lSZhgGG39-O0tN9XB-c6Nr5kcbdLKPNrKxqSEhVE= + BufferLoad: true + BufferStore: 1 + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 256 + DirectToLds: 1 + DirectToLdsA: true + DirectToLdsB: true + DirectToLdsMXSA: true + DirectToLdsMXSB: true + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprMXSA: false + DirectToVgprMXSB: false + DirectToVgprSparseMetadata: false + DtlPlusLdsBuf: 0 + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: false + ExpertSchedulingMode: 0 + ExtraLatencyForLR: 0 + ExtraMiLatencyLeft: -1 + ForceDisableShadowInit: false + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 32 + GlobalReadVectorWidthB: 32 + GlobalReadVectorWidthMXSA: 10 + GlobalReadVectorWidthMXSB: 2 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + KRingShift: false + Kernel: true + KernelLanguage: Assembly + KernelNameMin: 
Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT320x64x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT10_2_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 256 + LSCB: 256 + LSCMXSA: 8 + LSCMXSB: 8 + LSPA: 32 + LSPB: 32 + LSPMXSA: 320 + LSPMXSB: 64 + LVCA: 8 + LVCB: 8 + LVCMXSA: 1 + LVCMXSB: 4 + LVPA: 1 + LVPB: 1 + LVPMXSA: 32 + LVPMXSB: 32 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsBlockSizePerPadMXSA: 0 + LdsBlockSizePerPadMXSB: 0 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 117760 + LdsInitCVgprs: false + LdsNumBytes: 117760 + LdsNumElementsAlignedA: 40960 + LdsNumElementsAlignedB: 8192 + LdsNumElementsAlignedMXSA: 2560 + LdsNumElementsAlignedMXSB: 512 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 44032 + LdsOffsetB_Blk: 109568 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMXSA: 40960 + LdsOffsetMXSA_Blk: 106496 + LdsOffsetMXSB: 43520 + LdsOffsetMXSB_Blk: 109056 + LdsOffsetMetadata: 44032 + LdsOffsetMetadata_Blk: 109568 + LdsPadA: 0 + LdsPadB: 0 + LdsPadMXSA: 0 + LdsPadMXSB: 0 + LdsPadMetadata: 0 + LocalReadVectorWidth: -1 + LocalReadVectorWidthA: 32 + LocalReadVectorWidthB: 32 + LocalReadVectorWidthMXSA: 1 + LocalReadVectorWidthMXSB: 1 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LocalWriteUseSgprMXSA: true + LocalWriteUseSgprMXSB: true + LoopIters: 2 + LoopUnroll: 256 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + 
MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMXSA: 1 + MIInputPerThreadMXSB: 1 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [10, 2] + MIWaveTileA: 10 + MIWaveTileB: 2 + MIWaveTileMXSA: 10 + MIWaveTileMXSB: 2 + MIWaveTileMetadata: 0 + MacroTile0: 320 + MacroTile1: 64 + MacroTileA: 320 + MacroTileB: 64 + MacroTileMXSA: 320 + MacroTileMXSB: 64 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + MinGRIncPerMfma: 3 + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonDTLTailLoopMXSA: true + NonDTLTailLoopMXSB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 4 + NonTemporalC: 0 + NonTemporalD: 4 + NonTemporalE: 0 + NonTemporalMXSA: 0 + NonTemporalMXSB: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 80 + NumGlobalWriteVectorsPerThread: 80 + NumLdsBlk: 2 + NumLoadsA: 10 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsCoalescedMXSA: 1 + NumLoadsCoalescedMXSB: 1 + NumLoadsMXSA: 1 + NumLoadsMXSB: 1 + NumLoadsPerpendicularA: 10 + NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularMXSA: 1 + NumLoadsPerpendicularMXSB: 1 + NumThreads: 256 + NumTotalPackedLoadsA: -1 + NumTotalPackedLoadsB: -1 + NumTotalPackedLoadsMXSA: -1 + NumTotalPackedLoadsMXSB: -1 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGROverBarrier: 0 + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 17 + 
SolutionNameMin: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT320x64x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT10_2_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM16_WGMXCC2_WGMXCCGn1 + SourceSwap: false + SpaceFillingAlgo: [] + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 4 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SubGroupMXSA: 8 + SubGroupMXSB: 32 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: 0 + TailloopInNll: false + ThreadTile: [1, 1] + ThreadTile0: 40 + ThreadTile1: 2 + ThreadTileA: 40 + ThreadTileB: 2 + ThreadTileMXSA: 40 + ThreadTileMXSB: 2 + TransposeLDS: 1 + TransposeLDSMetadata: -1 + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMXSA: true + UnrollMajorLDSMXSB: true + Use64bShadowLimit: false + Use64bShadowLimitMX: false + UseCustomMainLoopSchedule: 0 + UseDirect32XEmulation: false + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: false + UseGeneralizedNLCOneA: false + UseGeneralizedNLCOneB: false + UseGeneralizedNLCOneMXSA: false + UseGeneralizedNLCOneMXSB: false + UseGeneralizedNLCOneMetadata: false + UseInstOffsetForGRO: 0 + UseMFMAF32XEmulation: false + UsePLRPack: 0 + UseSgprForGRO: -1 + UseSubtileImpl: true + Valid: true + VectorStore: -1 + VectorWidthA: 1 + 
VectorWidthB: 1 + VectorWidthMXSA: 1 + VectorWidthMXSB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMXSA: 0 + WaveSeparateGlobalReadMXSB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 16 + WorkGroupMappingXCC: 2 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 256 + _DepthUA: 256 + _DepthUB: 256 + _DepthUMXSA: 8 + _DepthUMXSB: 8 + _DepthUMetadata: 256 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + enableLDSTrMXSA: false + enableLDSTrMXSB: false + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + tailLoopOptMXSA: false + tailLoopOptMXSB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AdaptiveGemm: 0 + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1DivByMT1LowbitGT1: 0 + AssertFree1ElementMultiple: 1 + AssertKRingShiftTailWrapOnly: 0 + AssertSummationElementMultiple: 256 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BAddrInterleave: false + BaseName: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT64cx8jpYzIAVvV24Gqg4hoX8QdixxbIDmUlvaleHLuajs= + BufferLoad: true + BufferStore: 1 + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 256 + DirectToLds: 1 + DirectToLdsA: true + DirectToLdsB: true + DirectToLdsMXSA: true + DirectToLdsMXSB: true + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprMXSA: false + DirectToVgprMXSB: false + DirectToVgprSparseMetadata: false + 
DtlPlusLdsBuf: 0 + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: false + ExpertSchedulingMode: 0 + ExtraLatencyForLR: 0 + ExtraMiLatencyLeft: -1 + ForceDisableShadowInit: false + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 32 + GlobalReadVectorWidthB: 32 + GlobalReadVectorWidthMXSA: 2 + GlobalReadVectorWidthMXSB: 12 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + KRingShift: false + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT64x384x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT2_12_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 256 + LSCB: 256 + LSCMXSA: 8 + LSCMXSB: 8 + LSPA: 32 + LSPB: 32 + LSPMXSA: 64 + LSPMXSB: 384 + LVCA: 8 + LVCB: 8 + LVCMXSA: 4 + LVCMXSB: 1 + LVPA: 1 + LVPB: 1 + LVPMXSA: 32 + LVPMXSB: 32 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsBlockSizePerPadMXSA: 0 + LdsBlockSizePerPadMXSB: 0 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 126464 + LdsInitCVgprs: false + LdsNumBytes: 126464 + 
LdsNumElementsAlignedA: 8192 + LdsNumElementsAlignedB: 49152 + LdsNumElementsAlignedMXSA: 512 + LdsNumElementsAlignedMXSB: 3072 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 11776 + LdsOffsetB_Blk: 77312 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMXSA: 8192 + LdsOffsetMXSA_Blk: 73728 + LdsOffsetMXSB: 8704 + LdsOffsetMXSB_Blk: 74240 + LdsOffsetMetadata: 11776 + LdsOffsetMetadata_Blk: 77312 + LdsPadA: 0 + LdsPadB: 0 + LdsPadMXSA: 0 + LdsPadMXSB: 0 + LdsPadMetadata: 0 + LocalReadVectorWidth: -1 + LocalReadVectorWidthA: 32 + LocalReadVectorWidthB: 32 + LocalReadVectorWidthMXSA: 1 + LocalReadVectorWidthMXSB: 1 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LocalWriteUseSgprMXSA: true + LocalWriteUseSgprMXSB: true + LoopIters: 2 + LoopUnroll: 256 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMXSA: 1 + MIInputPerThreadMXSB: 1 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [2, 12] + MIWaveTileA: 2 + MIWaveTileB: 12 + MIWaveTileMXSA: 2 + MIWaveTileMXSB: 12 + MIWaveTileMetadata: 0 + MacroTile0: 64 + MacroTile1: 384 + MacroTileA: 64 + MacroTileB: 384 + MacroTileMXSA: 64 + MacroTileMXSB: 384 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + MinGRIncPerMfma: 3 + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonDTLTailLoopMXSA: true + NonDTLTailLoopMXSB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 4 + NonTemporalC: 0 + NonTemporalD: 4 + NonTemporalE: 0 + 
NonTemporalMXSA: 0 + NonTemporalMXSB: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 96 + NumGlobalWriteVectorsPerThread: 96 + NumLdsBlk: 2 + NumLoadsA: 2 + NumLoadsB: 12 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsCoalescedMXSA: 1 + NumLoadsCoalescedMXSB: 1 + NumLoadsMXSA: 1 + NumLoadsMXSB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 12 + NumLoadsPerpendicularMXSA: 1 + NumLoadsPerpendicularMXSB: 1 + NumThreads: 256 + NumTotalPackedLoadsA: -1 + NumTotalPackedLoadsB: -1 + NumTotalPackedLoadsMXSA: -1 + NumTotalPackedLoadsMXSB: -1 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGROverBarrier: 0 + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 18 + SolutionNameMin: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT64x384x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT2_12_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM16_WGMXCC2_WGMXCCGn1 + SourceSwap: false + SpaceFillingAlgo: [] + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 4 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SubGroupMXSA: 
8 + SubGroupMXSB: 32 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: 0 + TailloopInNll: false + ThreadTile: [1, 1] + ThreadTile0: 8 + ThreadTile1: 12 + ThreadTileA: 8 + ThreadTileB: 12 + ThreadTileMXSA: 8 + ThreadTileMXSB: 12 + TransposeLDS: 1 + TransposeLDSMetadata: -1 + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMXSA: true + UnrollMajorLDSMXSB: true + Use64bShadowLimit: false + Use64bShadowLimitMX: false + UseCustomMainLoopSchedule: 0 + UseDirect32XEmulation: false + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: false + UseGeneralizedNLCOneA: false + UseGeneralizedNLCOneB: false + UseGeneralizedNLCOneMXSA: false + UseGeneralizedNLCOneMXSB: false + UseGeneralizedNLCOneMetadata: false + UseInstOffsetForGRO: 0 + UseMFMAF32XEmulation: false + UsePLRPack: 0 + UseSgprForGRO: -1 + UseSubtileImpl: true + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 1 + VectorWidthMXSA: 1 + VectorWidthMXSB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMXSA: 0 + WaveSeparateGlobalReadMXSB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 16 + WorkGroupMappingXCC: 2 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 256 + _DepthUA: 256 + _DepthUB: 256 + _DepthUMXSA: 8 + _DepthUMXSB: 8 + _DepthUMetadata: 256 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + enableLDSTrMXSA: false + enableLDSTrMXSB: false + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + tailLoopOptMXSA: false + tailLoopOptMXSB: false + - 1LDSBuffer: 0 + ActivationAlt: false + 
ActivationFuncCall: false + ActivationFused: true + AdaptiveGemm: 0 + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1DivByMT1LowbitGT1: 0 + AssertFree1ElementMultiple: 1 + AssertKRingShiftTailWrapOnly: 0 + AssertSummationElementMultiple: 256 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BAddrInterleave: false + BaseName: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT38m0Bm4hH2y_fb6-eo4XZRKb5dkkBhjv6q0Rd-CAzTq-Q= + BufferLoad: true + BufferStore: 1 + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 256 + DirectToLds: 1 + DirectToLdsA: true + DirectToLdsB: true + DirectToLdsMXSA: true + DirectToLdsMXSB: true + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprMXSA: false + DirectToVgprMXSB: false + DirectToVgprSparseMetadata: false + DtlPlusLdsBuf: 0 + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: false + ExpertSchedulingMode: 0 + ExtraLatencyForLR: 0 + ExtraMiLatencyLeft: -1 + ForceDisableShadowInit: false + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 32 + GlobalReadVectorWidthB: 32 + GlobalReadVectorWidthMXSA: 12 + GlobalReadVectorWidthMXSB: 2 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + KRingShift: false + Kernel: true + KernelLanguage: Assembly + KernelNameMin: 
Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT384x64x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT12_2_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 256 + LSCB: 256 + LSCMXSA: 8 + LSCMXSB: 8 + LSPA: 32 + LSPB: 32 + LSPMXSA: 384 + LSPMXSB: 64 + LVCA: 8 + LVCB: 8 + LVCMXSA: 1 + LVCMXSB: 4 + LVPA: 1 + LVPB: 1 + LVPMXSA: 32 + LVPMXSB: 32 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsBlockSizePerPadMXSA: 0 + LdsBlockSizePerPadMXSB: 0 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 126464 + LdsInitCVgprs: false + LdsNumBytes: 126464 + LdsNumElementsAlignedA: 49152 + LdsNumElementsAlignedB: 8192 + LdsNumElementsAlignedMXSA: 3072 + LdsNumElementsAlignedMXSB: 512 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 52736 + LdsOffsetB_Blk: 118272 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMXSA: 49152 + LdsOffsetMXSA_Blk: 114688 + LdsOffsetMXSB: 52224 + LdsOffsetMXSB_Blk: 117760 + LdsOffsetMetadata: 52736 + LdsOffsetMetadata_Blk: 118272 + LdsPadA: 0 + LdsPadB: 0 + LdsPadMXSA: 0 + LdsPadMXSB: 0 + LdsPadMetadata: 0 + LocalReadVectorWidth: -1 + LocalReadVectorWidthA: 32 + LocalReadVectorWidthB: 32 + LocalReadVectorWidthMXSA: 1 + LocalReadVectorWidthMXSB: 1 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LocalWriteUseSgprMXSA: true + LocalWriteUseSgprMXSB: true + LoopIters: 2 + LoopUnroll: 256 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + 
MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMXSA: 1 + MIInputPerThreadMXSB: 1 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [12, 2] + MIWaveTileA: 12 + MIWaveTileB: 2 + MIWaveTileMXSA: 12 + MIWaveTileMXSB: 2 + MIWaveTileMetadata: 0 + MacroTile0: 384 + MacroTile1: 64 + MacroTileA: 384 + MacroTileB: 64 + MacroTileMXSA: 384 + MacroTileMXSB: 64 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + MinGRIncPerMfma: 3 + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonDTLTailLoopMXSA: true + NonDTLTailLoopMXSB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 4 + NonTemporalC: 0 + NonTemporalD: 4 + NonTemporalE: 0 + NonTemporalMXSA: 0 + NonTemporalMXSB: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 96 + NumGlobalWriteVectorsPerThread: 96 + NumLdsBlk: 2 + NumLoadsA: 12 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsCoalescedMXSA: 1 + NumLoadsCoalescedMXSB: 1 + NumLoadsMXSA: 1 + NumLoadsMXSB: 1 + NumLoadsPerpendicularA: 12 + NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularMXSA: 1 + NumLoadsPerpendicularMXSB: 1 + NumThreads: 256 + NumTotalPackedLoadsA: -1 + NumTotalPackedLoadsB: -1 + NumTotalPackedLoadsMXSA: -1 + NumTotalPackedLoadsMXSB: -1 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGROverBarrier: 0 + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 19 + 
SolutionNameMin: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT384x64x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT12_2_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM16_WGMXCC2_WGMXCCGn1 + SourceSwap: false + SpaceFillingAlgo: [] + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 4 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SubGroupMXSA: 8 + SubGroupMXSB: 32 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: 0 + TailloopInNll: false + ThreadTile: [1, 1] + ThreadTile0: 48 + ThreadTile1: 2 + ThreadTileA: 48 + ThreadTileB: 2 + ThreadTileMXSA: 48 + ThreadTileMXSB: 2 + TransposeLDS: 1 + TransposeLDSMetadata: -1 + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMXSA: true + UnrollMajorLDSMXSB: true + Use64bShadowLimit: false + Use64bShadowLimitMX: false + UseCustomMainLoopSchedule: 0 + UseDirect32XEmulation: false + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: false + UseGeneralizedNLCOneA: false + UseGeneralizedNLCOneB: false + UseGeneralizedNLCOneMXSA: false + UseGeneralizedNLCOneMXSB: false + UseGeneralizedNLCOneMetadata: false + UseInstOffsetForGRO: 0 + UseMFMAF32XEmulation: false + UsePLRPack: 0 + UseSgprForGRO: -1 + UseSubtileImpl: true + Valid: true + VectorStore: -1 + VectorWidthA: 1 + 
VectorWidthB: 1 + VectorWidthMXSA: 1 + VectorWidthMXSB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMXSA: 0 + WaveSeparateGlobalReadMXSB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 16 + WorkGroupMappingXCC: 2 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 256 + _DepthUA: 256 + _DepthUB: 256 + _DepthUMXSA: 8 + _DepthUMXSB: 8 + _DepthUMetadata: 256 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + enableLDSTrMXSA: false + enableLDSTrMXSB: false + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + tailLoopOptMXSA: false + tailLoopOptMXSB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AdaptiveGemm: 0 + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1DivByMT1LowbitGT1: 0 + AssertFree1ElementMultiple: 1 + AssertKRingShiftTailWrapOnly: 0 + AssertSummationElementMultiple: 256 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BAddrInterleave: false + BaseName: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT64ghBzLJyXgx9oTwgzYUUty2EUDAM9Q1qjBKNRjSDH1Qg= + BufferLoad: true + BufferStore: 1 + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 256 + DirectToLds: 1 + DirectToLdsA: true + DirectToLdsB: true + DirectToLdsMXSA: true + DirectToLdsMXSB: true + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprMXSA: false + DirectToVgprMXSB: false + DirectToVgprSparseMetadata: false + 
DtlPlusLdsBuf: 0 + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: false + ExpertSchedulingMode: 0 + ExtraLatencyForLR: 0 + ExtraMiLatencyLeft: -1 + ForceDisableShadowInit: false + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 32 + GlobalReadVectorWidthB: 32 + GlobalReadVectorWidthMXSA: 2 + GlobalReadVectorWidthMXSB: 16 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + KRingShift: false + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT64x512x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT2_16_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 256 + LSCB: 256 + LSCMXSA: 8 + LSCMXSB: 8 + LSPA: 32 + LSPB: 32 + LSPMXSA: 64 + LSPMXSB: 512 + LVCA: 8 + LVCB: 8 + LVCMXSA: 4 + LVCMXSB: 1 + LVPA: 1 + LVPB: 1 + LVPMXSA: 32 + LVPMXSB: 32 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsBlockSizePerPadMXSA: 0 + LdsBlockSizePerPadMXSB: 0 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 156672 + LdsInitCVgprs: false + LdsNumBytes: 156672 + 
LdsNumElementsAlignedA: 8192 + LdsNumElementsAlignedB: 65536 + LdsNumElementsAlignedMXSA: 512 + LdsNumElementsAlignedMXSB: 4096 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 78336 + LdsOffsetB: 12800 + LdsOffsetB_Blk: 91136 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMXSA: 8192 + LdsOffsetMXSA_Blk: 86528 + LdsOffsetMXSB: 8704 + LdsOffsetMXSB_Blk: 87040 + LdsOffsetMetadata: 12800 + LdsOffsetMetadata_Blk: 91136 + LdsPadA: 0 + LdsPadB: 0 + LdsPadMXSA: 0 + LdsPadMXSB: 0 + LdsPadMetadata: 0 + LocalReadVectorWidth: -1 + LocalReadVectorWidthA: 32 + LocalReadVectorWidthB: 32 + LocalReadVectorWidthMXSA: 1 + LocalReadVectorWidthMXSB: 1 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LocalWriteUseSgprMXSA: true + LocalWriteUseSgprMXSB: true + LoopIters: 2 + LoopUnroll: 256 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMXSA: 1 + MIInputPerThreadMXSB: 1 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [2, 16] + MIWaveTileA: 2 + MIWaveTileB: 16 + MIWaveTileMXSA: 2 + MIWaveTileMXSB: 16 + MIWaveTileMetadata: 0 + MacroTile0: 64 + MacroTile1: 512 + MacroTileA: 64 + MacroTileB: 512 + MacroTileMXSA: 64 + MacroTileMXSB: 512 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + MinGRIncPerMfma: 3 + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonDTLTailLoopMXSA: true + NonDTLTailLoopMXSB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 4 + NonTemporalC: 0 + NonTemporalD: 4 + NonTemporalE: 0 + 
NonTemporalMXSA: 0 + NonTemporalMXSB: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 128 + NumLdsBlk: 2 + NumLoadsA: 2 + NumLoadsB: 16 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsCoalescedMXSA: 1 + NumLoadsCoalescedMXSB: 1 + NumLoadsMXSA: 1 + NumLoadsMXSB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 16 + NumLoadsPerpendicularMXSA: 1 + NumLoadsPerpendicularMXSB: 1 + NumThreads: 256 + NumTotalPackedLoadsA: -1 + NumTotalPackedLoadsB: -1 + NumTotalPackedLoadsMXSA: -1 + NumTotalPackedLoadsMXSB: -1 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGROverBarrier: 0 + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 20 + SolutionNameMin: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT64x512x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT2_16_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM16_WGMXCC2_WGMXCCGn1 + SourceSwap: false + SpaceFillingAlgo: [] + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: true + StoreSyncOpt: 0 + StoreVectorWidth: 4 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + 
SubGroupMXSA: 8 + SubGroupMXSB: 32 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: 0 + TailloopInNll: false + ThreadTile: [1, 1] + ThreadTile0: 8 + ThreadTile1: 16 + ThreadTileA: 8 + ThreadTileB: 16 + ThreadTileMXSA: 8 + ThreadTileMXSB: 16 + TransposeLDS: 1 + TransposeLDSMetadata: -1 + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMXSA: true + UnrollMajorLDSMXSB: true + Use64bShadowLimit: false + Use64bShadowLimitMX: false + UseCustomMainLoopSchedule: 0 + UseDirect32XEmulation: false + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: false + UseGeneralizedNLCOneA: false + UseGeneralizedNLCOneB: false + UseGeneralizedNLCOneMXSA: false + UseGeneralizedNLCOneMXSB: false + UseGeneralizedNLCOneMetadata: false + UseInstOffsetForGRO: 0 + UseMFMAF32XEmulation: false + UsePLRPack: 0 + UseSgprForGRO: -1 + UseSubtileImpl: true + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 1 + VectorWidthMXSA: 1 + VectorWidthMXSB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMXSA: 0 + WaveSeparateGlobalReadMXSB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 16 + WorkGroupMappingXCC: 2 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 256 + _DepthUA: 256 + _DepthUB: 256 + _DepthUMXSA: 8 + _DepthUMXSB: 8 + _DepthUMetadata: 256 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + enableLDSTrMXSA: false + enableLDSTrMXSB: false + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + tailLoopOptMXSA: false + tailLoopOptMXSB: false + - 1LDSBuffer: 0 + 
ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AdaptiveGemm: 0 + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1DivByMT1LowbitGT1: 0 + AssertFree1ElementMultiple: 1 + AssertKRingShiftTailWrapOnly: 0 + AssertSummationElementMultiple: 256 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BAddrInterleave: false + BaseName: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT51N7xP8nenZXqX1PoWvPtuxa4rTpP69DwZBgRtHN8axrQ= + BufferLoad: true + BufferStore: 1 + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 256 + DirectToLds: 1 + DirectToLdsA: true + DirectToLdsB: true + DirectToLdsMXSA: true + DirectToLdsMXSB: true + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprMXSA: false + DirectToVgprMXSB: false + DirectToVgprSparseMetadata: false + DtlPlusLdsBuf: 0 + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: false + ExpertSchedulingMode: 0 + ExtraLatencyForLR: 0 + ExtraMiLatencyLeft: -1 + ForceDisableShadowInit: false + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 32 + GlobalReadVectorWidthB: 32 + GlobalReadVectorWidthMXSA: 16 + GlobalReadVectorWidthMXSB: 2 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + KRingShift: false + Kernel: true + KernelLanguage: Assembly + KernelNameMin: 
Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT512x64x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT16_2_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 256 + LSCB: 256 + LSCMXSA: 8 + LSCMXSB: 8 + LSPA: 32 + LSPB: 32 + LSPMXSA: 512 + LSPMXSB: 64 + LVCA: 8 + LVCB: 8 + LVCMXSA: 1 + LVCMXSB: 4 + LVPA: 1 + LVPB: 1 + LVPMXSA: 32 + LVPMXSB: 32 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsBlockSizePerPadMXSA: 0 + LdsBlockSizePerPadMXSB: 0 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 156672 + LdsInitCVgprs: false + LdsNumBytes: 156672 + LdsNumElementsAlignedA: 65536 + LdsNumElementsAlignedB: 8192 + LdsNumElementsAlignedMXSA: 4096 + LdsNumElementsAlignedMXSB: 512 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 78336 + LdsOffsetB: 70144 + LdsOffsetB_Blk: 148480 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMXSA: 65536 + LdsOffsetMXSA_Blk: 143872 + LdsOffsetMXSB: 69632 + LdsOffsetMXSB_Blk: 147968 + LdsOffsetMetadata: 70144 + LdsOffsetMetadata_Blk: 148480 + LdsPadA: 0 + LdsPadB: 0 + LdsPadMXSA: 0 + LdsPadMXSB: 0 + LdsPadMetadata: 0 + LocalReadVectorWidth: -1 + LocalReadVectorWidthA: 32 + LocalReadVectorWidthB: 32 + LocalReadVectorWidthMXSA: 1 + LocalReadVectorWidthMXSB: 1 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LocalWriteUseSgprMXSA: true + LocalWriteUseSgprMXSB: true + LoopIters: 2 + LoopUnroll: 256 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + 
MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMXSA: 1 + MIInputPerThreadMXSB: 1 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [16, 2] + MIWaveTileA: 16 + MIWaveTileB: 2 + MIWaveTileMXSA: 16 + MIWaveTileMXSB: 2 + MIWaveTileMetadata: 0 + MacroTile0: 512 + MacroTile1: 64 + MacroTileA: 512 + MacroTileB: 64 + MacroTileMXSA: 512 + MacroTileMXSB: 64 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + MinGRIncPerMfma: 3 + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonDTLTailLoopMXSA: true + NonDTLTailLoopMXSB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 4 + NonTemporalC: 0 + NonTemporalD: 4 + NonTemporalE: 0 + NonTemporalMXSA: 0 + NonTemporalMXSB: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 128 + NumLdsBlk: 2 + NumLoadsA: 16 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsCoalescedMXSA: 1 + NumLoadsCoalescedMXSB: 1 + NumLoadsMXSA: 1 + NumLoadsMXSB: 1 + NumLoadsPerpendicularA: 16 + NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularMXSA: 1 + NumLoadsPerpendicularMXSB: 1 + NumThreads: 256 + NumTotalPackedLoadsA: -1 + NumTotalPackedLoadsB: -1 + NumTotalPackedLoadsMXSA: -1 + NumTotalPackedLoadsMXSB: -1 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGROverBarrier: 0 + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 21 + 
SolutionNameMin: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT512x64x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT16_2_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM16_WGMXCC2_WGMXCCGn1 + SourceSwap: false + SpaceFillingAlgo: [] + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: true + StoreSyncOpt: 0 + StoreVectorWidth: 4 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SubGroupMXSA: 8 + SubGroupMXSB: 32 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: 0 + TailloopInNll: false + ThreadTile: [1, 1] + ThreadTile0: 64 + ThreadTile1: 2 + ThreadTileA: 64 + ThreadTileB: 2 + ThreadTileMXSA: 64 + ThreadTileMXSB: 2 + TransposeLDS: 1 + TransposeLDSMetadata: -1 + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMXSA: true + UnrollMajorLDSMXSB: true + Use64bShadowLimit: false + Use64bShadowLimitMX: false + UseCustomMainLoopSchedule: 0 + UseDirect32XEmulation: false + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: false + UseGeneralizedNLCOneA: false + UseGeneralizedNLCOneB: false + UseGeneralizedNLCOneMXSA: false + UseGeneralizedNLCOneMXSB: false + UseGeneralizedNLCOneMetadata: false + UseInstOffsetForGRO: 0 + UseMFMAF32XEmulation: false + UsePLRPack: 0 + UseSgprForGRO: -1 + UseSubtileImpl: true + Valid: true + VectorStore: -1 + VectorWidthA: 1 + 
VectorWidthB: 1 + VectorWidthMXSA: 1 + VectorWidthMXSB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMXSA: 0 + WaveSeparateGlobalReadMXSB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 16 + WorkGroupMappingXCC: 2 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 256 + _DepthUA: 256 + _DepthUB: 256 + _DepthUMXSA: 8 + _DepthUMXSB: 8 + _DepthUMetadata: 256 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + enableLDSTrMXSA: false + enableLDSTrMXSB: false + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + tailLoopOptMXSA: false + tailLoopOptMXSB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AdaptiveGemm: 0 + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1DivByMT1LowbitGT1: 0 + AssertFree1ElementMultiple: 1 + AssertKRingShiftTailWrapOnly: 0 + AssertSummationElementMultiple: 256 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BAddrInterleave: false + BaseName: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT12Tzx41lFmgkHO5_GDlXlBjYPnuHfWnU6iLujoAD27WHo= + BufferLoad: true + BufferStore: 1 + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 256 + DirectToLds: 1 + DirectToLdsA: true + DirectToLdsB: true + DirectToLdsMXSA: true + DirectToLdsMXSB: true + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprMXSA: false + DirectToVgprMXSB: false + DirectToVgprSparseMetadata: false + 
DtlPlusLdsBuf: 0 + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: false + ExpertSchedulingMode: 0 + ExtraLatencyForLR: 0 + ExtraMiLatencyLeft: -1 + ForceDisableShadowInit: false + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 32 + GlobalReadVectorWidthB: 32 + GlobalReadVectorWidthMXSA: 4 + GlobalReadVectorWidthMXSB: 10 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + KRingShift: false + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT128x320x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT4_10_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 256 + LSCB: 256 + LSCMXSA: 8 + LSCMXSB: 8 + LSPA: 32 + LSPB: 32 + LSPMXSA: 128 + LSPMXSB: 320 + LVCA: 8 + LVCB: 8 + LVCMXSA: 2 + LVCMXSB: 1 + LVPA: 1 + LVPB: 1 + LVPMXSA: 32 + LVPMXSB: 32 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsBlockSizePerPadMXSA: 0 + LdsBlockSizePerPadMXSB: 0 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 126464 + LdsInitCVgprs: false + LdsNumBytes: 126464 + 
LdsNumElementsAlignedA: 16384 + LdsNumElementsAlignedB: 40960 + LdsNumElementsAlignedMXSA: 1024 + LdsNumElementsAlignedMXSB: 2560 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 19968 + LdsOffsetB_Blk: 85504 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMXSA: 16384 + LdsOffsetMXSA_Blk: 81920 + LdsOffsetMXSB: 17408 + LdsOffsetMXSB_Blk: 82944 + LdsOffsetMetadata: 19968 + LdsOffsetMetadata_Blk: 85504 + LdsPadA: 0 + LdsPadB: 0 + LdsPadMXSA: 0 + LdsPadMXSB: 0 + LdsPadMetadata: 0 + LocalReadVectorWidth: -1 + LocalReadVectorWidthA: 32 + LocalReadVectorWidthB: 32 + LocalReadVectorWidthMXSA: 1 + LocalReadVectorWidthMXSB: 1 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LocalWriteUseSgprMXSA: true + LocalWriteUseSgprMXSB: true + LoopIters: 2 + LoopUnroll: 256 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMXSA: 1 + MIInputPerThreadMXSB: 1 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 10] + MIWaveTileA: 4 + MIWaveTileB: 10 + MIWaveTileMXSA: 4 + MIWaveTileMXSB: 10 + MIWaveTileMetadata: 0 + MacroTile0: 128 + MacroTile1: 320 + MacroTileA: 128 + MacroTileB: 320 + MacroTileMXSA: 128 + MacroTileMXSB: 320 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + MinGRIncPerMfma: 3 + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonDTLTailLoopMXSA: true + NonDTLTailLoopMXSB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 4 + NonTemporalC: 0 + NonTemporalD: 4 + 
NonTemporalE: 0 + NonTemporalMXSA: 0 + NonTemporalMXSB: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 160 + NumGlobalWriteVectorsPerThread: 160 + NumLdsBlk: 2 + NumLoadsA: 4 + NumLoadsB: 10 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsCoalescedMXSA: 1 + NumLoadsCoalescedMXSB: 1 + NumLoadsMXSA: 1 + NumLoadsMXSB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 10 + NumLoadsPerpendicularMXSA: 1 + NumLoadsPerpendicularMXSB: 1 + NumThreads: 256 + NumTotalPackedLoadsA: -1 + NumTotalPackedLoadsB: -1 + NumTotalPackedLoadsMXSA: -1 + NumTotalPackedLoadsMXSB: -1 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGROverBarrier: 0 + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 22 + SolutionNameMin: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT128x320x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT4_10_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM16_WGMXCC2_WGMXCCGn1 + SourceSwap: false + SpaceFillingAlgo: [] + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 4 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + 
SubGroupB: 32 + SubGroupMXSA: 8 + SubGroupMXSB: 32 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: 0 + TailloopInNll: false + ThreadTile: [1, 1] + ThreadTile0: 16 + ThreadTile1: 10 + ThreadTileA: 16 + ThreadTileB: 10 + ThreadTileMXSA: 16 + ThreadTileMXSB: 10 + TransposeLDS: 1 + TransposeLDSMetadata: -1 + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMXSA: true + UnrollMajorLDSMXSB: true + Use64bShadowLimit: false + Use64bShadowLimitMX: false + UseCustomMainLoopSchedule: 0 + UseDirect32XEmulation: false + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: false + UseGeneralizedNLCOneA: false + UseGeneralizedNLCOneB: false + UseGeneralizedNLCOneMXSA: false + UseGeneralizedNLCOneMXSB: false + UseGeneralizedNLCOneMetadata: false + UseInstOffsetForGRO: 0 + UseMFMAF32XEmulation: false + UsePLRPack: 0 + UseSgprForGRO: -1 + UseSubtileImpl: true + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 1 + VectorWidthMXSA: 1 + VectorWidthMXSB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMXSA: 0 + WaveSeparateGlobalReadMXSB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 16 + WorkGroupMappingXCC: 2 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 256 + _DepthUA: 256 + _DepthUB: 256 + _DepthUMXSA: 8 + _DepthUMXSB: 8 + _DepthUMetadata: 256 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + enableLDSTrMXSA: false + enableLDSTrMXSB: false + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + tailLoopOptMXSA: false + tailLoopOptMXSB: false + - 
1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AdaptiveGemm: 0 + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1DivByMT1LowbitGT1: 0 + AssertFree1ElementMultiple: 1 + AssertKRingShiftTailWrapOnly: 0 + AssertSummationElementMultiple: 256 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BAddrInterleave: false + BaseName: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT329TK1O9xqAG2cUdkJ_6Jkg-kbMKNtFWKGnhC0gVHbf3c= + BufferLoad: true + BufferStore: 1 + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 256 + DirectToLds: 1 + DirectToLdsA: true + DirectToLdsB: true + DirectToLdsMXSA: true + DirectToLdsMXSB: true + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprMXSA: false + DirectToVgprMXSB: false + DirectToVgprSparseMetadata: false + DtlPlusLdsBuf: 0 + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: false + ExpertSchedulingMode: 0 + ExtraLatencyForLR: 0 + ExtraMiLatencyLeft: -1 + ForceDisableShadowInit: false + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 32 + GlobalReadVectorWidthB: 32 + GlobalReadVectorWidthMXSA: 10 + GlobalReadVectorWidthMXSB: 4 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + KRingShift: false + Kernel: true + KernelLanguage: Assembly + KernelNameMin: 
Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT320x128x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT10_4_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 256 + LSCB: 256 + LSCMXSA: 8 + LSCMXSB: 8 + LSPA: 32 + LSPB: 32 + LSPMXSA: 320 + LSPMXSB: 128 + LVCA: 8 + LVCB: 8 + LVCMXSA: 1 + LVCMXSB: 2 + LVPA: 1 + LVPB: 1 + LVPMXSA: 32 + LVPMXSB: 32 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsBlockSizePerPadMXSA: 0 + LdsBlockSizePerPadMXSB: 0 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 126464 + LdsInitCVgprs: false + LdsNumBytes: 126464 + LdsNumElementsAlignedA: 40960 + LdsNumElementsAlignedB: 16384 + LdsNumElementsAlignedMXSA: 2560 + LdsNumElementsAlignedMXSB: 1024 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 44544 + LdsOffsetB_Blk: 110080 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMXSA: 40960 + LdsOffsetMXSA_Blk: 106496 + LdsOffsetMXSB: 43520 + LdsOffsetMXSB_Blk: 109056 + LdsOffsetMetadata: 44544 + LdsOffsetMetadata_Blk: 110080 + LdsPadA: 0 + LdsPadB: 0 + LdsPadMXSA: 0 + LdsPadMXSB: 0 + LdsPadMetadata: 0 + LocalReadVectorWidth: -1 + LocalReadVectorWidthA: 32 + LocalReadVectorWidthB: 32 + LocalReadVectorWidthMXSA: 1 + LocalReadVectorWidthMXSB: 1 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LocalWriteUseSgprMXSA: true + LocalWriteUseSgprMXSB: true + LoopIters: 2 + LoopUnroll: 256 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + 
MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMXSA: 1 + MIInputPerThreadMXSB: 1 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [10, 4] + MIWaveTileA: 10 + MIWaveTileB: 4 + MIWaveTileMXSA: 10 + MIWaveTileMXSB: 4 + MIWaveTileMetadata: 0 + MacroTile0: 320 + MacroTile1: 128 + MacroTileA: 320 + MacroTileB: 128 + MacroTileMXSA: 320 + MacroTileMXSB: 128 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + MinGRIncPerMfma: 3 + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonDTLTailLoopMXSA: true + NonDTLTailLoopMXSB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 4 + NonTemporalC: 0 + NonTemporalD: 4 + NonTemporalE: 0 + NonTemporalMXSA: 0 + NonTemporalMXSB: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 160 + NumGlobalWriteVectorsPerThread: 160 + NumLdsBlk: 2 + NumLoadsA: 10 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsCoalescedMXSA: 1 + NumLoadsCoalescedMXSB: 1 + NumLoadsMXSA: 1 + NumLoadsMXSB: 1 + NumLoadsPerpendicularA: 10 + NumLoadsPerpendicularB: 4 + NumLoadsPerpendicularMXSA: 1 + NumLoadsPerpendicularMXSB: 1 + NumThreads: 256 + NumTotalPackedLoadsA: -1 + NumTotalPackedLoadsB: -1 + NumTotalPackedLoadsMXSA: -1 + NumTotalPackedLoadsMXSB: -1 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGROverBarrier: 0 + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 23 + 
SolutionNameMin: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT320x128x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT10_4_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM16_WGMXCC2_WGMXCCGn1 + SourceSwap: false + SpaceFillingAlgo: [] + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 4 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SubGroupMXSA: 8 + SubGroupMXSB: 32 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: 0 + TailloopInNll: false + ThreadTile: [1, 1] + ThreadTile0: 40 + ThreadTile1: 4 + ThreadTileA: 40 + ThreadTileB: 4 + ThreadTileMXSA: 40 + ThreadTileMXSB: 4 + TransposeLDS: 1 + TransposeLDSMetadata: -1 + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMXSA: true + UnrollMajorLDSMXSB: true + Use64bShadowLimit: false + Use64bShadowLimitMX: false + UseCustomMainLoopSchedule: 0 + UseDirect32XEmulation: false + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: false + UseGeneralizedNLCOneA: false + UseGeneralizedNLCOneB: false + UseGeneralizedNLCOneMXSA: false + UseGeneralizedNLCOneMXSB: false + UseGeneralizedNLCOneMetadata: false + UseInstOffsetForGRO: 0 + UseMFMAF32XEmulation: false + UsePLRPack: 0 + UseSgprForGRO: -1 + UseSubtileImpl: true + Valid: true + VectorStore: -1 + VectorWidthA: 1 + 
VectorWidthB: 1 + VectorWidthMXSA: 1 + VectorWidthMXSB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMXSA: 0 + WaveSeparateGlobalReadMXSB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 16 + WorkGroupMappingXCC: 2 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 256 + _DepthUA: 256 + _DepthUB: 256 + _DepthUMXSA: 8 + _DepthUMXSB: 8 + _DepthUMetadata: 256 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + enableLDSTrMXSA: false + enableLDSTrMXSB: false + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + tailLoopOptMXSA: false + tailLoopOptMXSB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AdaptiveGemm: 0 + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1DivByMT1LowbitGT1: 0 + AssertFree1ElementMultiple: 1 + AssertKRingShiftTailWrapOnly: 0 + AssertSummationElementMultiple: 256 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BAddrInterleave: false + BaseName: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT12Ko14x9Bm17skxIK2J-9eSMhxZf1WQTws5W6lZ4ij_RU= + BufferLoad: true + BufferStore: 1 + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 256 + DirectToLds: 1 + DirectToLdsA: true + DirectToLdsB: true + DirectToLdsMXSA: true + DirectToLdsMXSB: true + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprMXSA: false + DirectToVgprMXSB: false + DirectToVgprSparseMetadata: false + 
DtlPlusLdsBuf: 0 + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: false + ExpertSchedulingMode: 0 + ExtraLatencyForLR: 0 + ExtraMiLatencyLeft: -1 + ForceDisableShadowInit: false + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 32 + GlobalReadVectorWidthB: 32 + GlobalReadVectorWidthMXSA: 4 + GlobalReadVectorWidthMXSB: 12 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + KRingShift: false + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT128x384x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT4_12_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 256 + LSCB: 256 + LSCMXSA: 8 + LSCMXSB: 8 + LSPA: 32 + LSPB: 32 + LSPMXSA: 128 + LSPMXSB: 384 + LVCA: 8 + LVCB: 8 + LVCMXSA: 2 + LVCMXSB: 1 + LVPA: 1 + LVPB: 1 + LVPMXSA: 32 + LVPMXSB: 32 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsBlockSizePerPadMXSA: 0 + LdsBlockSizePerPadMXSB: 0 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 139264 + LdsInitCVgprs: false + LdsNumBytes: 139264 + 
LdsNumElementsAlignedA: 16384 + LdsNumElementsAlignedB: 49152 + LdsNumElementsAlignedMXSA: 1024 + LdsNumElementsAlignedMXSB: 3072 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 69632 + LdsOffsetB: 20480 + LdsOffsetB_Blk: 90112 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMXSA: 16384 + LdsOffsetMXSA_Blk: 86016 + LdsOffsetMXSB: 17408 + LdsOffsetMXSB_Blk: 87040 + LdsOffsetMetadata: 20480 + LdsOffsetMetadata_Blk: 90112 + LdsPadA: 0 + LdsPadB: 0 + LdsPadMXSA: 0 + LdsPadMXSB: 0 + LdsPadMetadata: 0 + LocalReadVectorWidth: -1 + LocalReadVectorWidthA: 32 + LocalReadVectorWidthB: 32 + LocalReadVectorWidthMXSA: 1 + LocalReadVectorWidthMXSB: 1 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LocalWriteUseSgprMXSA: true + LocalWriteUseSgprMXSB: true + LoopIters: 2 + LoopUnroll: 256 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMXSA: 1 + MIInputPerThreadMXSB: 1 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 12] + MIWaveTileA: 4 + MIWaveTileB: 12 + MIWaveTileMXSA: 4 + MIWaveTileMXSB: 12 + MIWaveTileMetadata: 0 + MacroTile0: 128 + MacroTile1: 384 + MacroTileA: 128 + MacroTileB: 384 + MacroTileMXSA: 128 + MacroTileMXSB: 384 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + MinGRIncPerMfma: 3 + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonDTLTailLoopMXSA: true + NonDTLTailLoopMXSB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 4 + NonTemporalC: 0 + NonTemporalD: 4 + 
NonTemporalE: 0 + NonTemporalMXSA: 0 + NonTemporalMXSB: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 192 + NumGlobalWriteVectorsPerThread: 192 + NumLdsBlk: 2 + NumLoadsA: 4 + NumLoadsB: 12 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsCoalescedMXSA: 1 + NumLoadsCoalescedMXSB: 1 + NumLoadsMXSA: 1 + NumLoadsMXSB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 12 + NumLoadsPerpendicularMXSA: 1 + NumLoadsPerpendicularMXSB: 1 + NumThreads: 256 + NumTotalPackedLoadsA: -1 + NumTotalPackedLoadsB: -1 + NumTotalPackedLoadsMXSA: -1 + NumTotalPackedLoadsMXSB: -1 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGROverBarrier: 0 + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 24 + SolutionNameMin: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT128x384x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT4_12_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM16_WGMXCC2_WGMXCCGn1 + SourceSwap: false + SpaceFillingAlgo: [] + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: true + StoreSyncOpt: 0 + StoreVectorWidth: 4 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + 
SubGroupB: 32 + SubGroupMXSA: 8 + SubGroupMXSB: 32 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: 0 + TailloopInNll: false + ThreadTile: [1, 1] + ThreadTile0: 16 + ThreadTile1: 12 + ThreadTileA: 16 + ThreadTileB: 12 + ThreadTileMXSA: 16 + ThreadTileMXSB: 12 + TransposeLDS: 1 + TransposeLDSMetadata: -1 + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMXSA: true + UnrollMajorLDSMXSB: true + Use64bShadowLimit: false + Use64bShadowLimitMX: false + UseCustomMainLoopSchedule: 0 + UseDirect32XEmulation: false + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: false + UseGeneralizedNLCOneA: false + UseGeneralizedNLCOneB: false + UseGeneralizedNLCOneMXSA: false + UseGeneralizedNLCOneMXSB: false + UseGeneralizedNLCOneMetadata: false + UseInstOffsetForGRO: 0 + UseMFMAF32XEmulation: false + UsePLRPack: 0 + UseSgprForGRO: -1 + UseSubtileImpl: true + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 1 + VectorWidthMXSA: 1 + VectorWidthMXSB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMXSA: 0 + WaveSeparateGlobalReadMXSB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 16 + WorkGroupMappingXCC: 2 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 256 + _DepthUA: 256 + _DepthUB: 256 + _DepthUMXSA: 8 + _DepthUMXSB: 8 + _DepthUMetadata: 256 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + enableLDSTrMXSA: false + enableLDSTrMXSB: false + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + tailLoopOptMXSA: false + tailLoopOptMXSB: false + - 
1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AdaptiveGemm: 0 + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1DivByMT1LowbitGT1: 0 + AssertFree1ElementMultiple: 1 + AssertKRingShiftTailWrapOnly: 0 + AssertSummationElementMultiple: 256 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BAddrInterleave: false + BaseName: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT38PfCBP4cNoa67MDHKbRKQKP0baWWAor2QL1_1-ioSnxM= + BufferLoad: true + BufferStore: 1 + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 256 + DirectToLds: 1 + DirectToLdsA: true + DirectToLdsB: true + DirectToLdsMXSA: true + DirectToLdsMXSB: true + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprMXSA: false + DirectToVgprMXSB: false + DirectToVgprSparseMetadata: false + DtlPlusLdsBuf: 0 + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: false + ExpertSchedulingMode: 0 + ExtraLatencyForLR: 0 + ExtraMiLatencyLeft: -1 + ForceDisableShadowInit: false + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 32 + GlobalReadVectorWidthB: 32 + GlobalReadVectorWidthMXSA: 12 + GlobalReadVectorWidthMXSB: 4 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + KRingShift: false + Kernel: true + KernelLanguage: Assembly + KernelNameMin: 
Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT384x128x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT12_4_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 256 + LSCB: 256 + LSCMXSA: 8 + LSCMXSB: 8 + LSPA: 32 + LSPB: 32 + LSPMXSA: 384 + LSPMXSB: 128 + LVCA: 8 + LVCB: 8 + LVCMXSA: 1 + LVCMXSB: 2 + LVPA: 1 + LVPB: 1 + LVPMXSA: 32 + LVPMXSB: 32 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsBlockSizePerPadMXSA: 0 + LdsBlockSizePerPadMXSB: 0 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 139264 + LdsInitCVgprs: false + LdsNumBytes: 139264 + LdsNumElementsAlignedA: 49152 + LdsNumElementsAlignedB: 16384 + LdsNumElementsAlignedMXSA: 3072 + LdsNumElementsAlignedMXSB: 1024 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 69632 + LdsOffsetB: 53248 + LdsOffsetB_Blk: 122880 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMXSA: 49152 + LdsOffsetMXSA_Blk: 118784 + LdsOffsetMXSB: 52224 + LdsOffsetMXSB_Blk: 121856 + LdsOffsetMetadata: 53248 + LdsOffsetMetadata_Blk: 122880 + LdsPadA: 0 + LdsPadB: 0 + LdsPadMXSA: 0 + LdsPadMXSB: 0 + LdsPadMetadata: 0 + LocalReadVectorWidth: -1 + LocalReadVectorWidthA: 32 + LocalReadVectorWidthB: 32 + LocalReadVectorWidthMXSA: 1 + LocalReadVectorWidthMXSB: 1 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LocalWriteUseSgprMXSA: true + LocalWriteUseSgprMXSB: true + LoopIters: 2 + LoopUnroll: 256 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + 
MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMXSA: 1 + MIInputPerThreadMXSB: 1 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [12, 4] + MIWaveTileA: 12 + MIWaveTileB: 4 + MIWaveTileMXSA: 12 + MIWaveTileMXSB: 4 + MIWaveTileMetadata: 0 + MacroTile0: 384 + MacroTile1: 128 + MacroTileA: 384 + MacroTileB: 128 + MacroTileMXSA: 384 + MacroTileMXSB: 128 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + MinGRIncPerMfma: 3 + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonDTLTailLoopMXSA: true + NonDTLTailLoopMXSB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 4 + NonTemporalC: 0 + NonTemporalD: 4 + NonTemporalE: 0 + NonTemporalMXSA: 0 + NonTemporalMXSB: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 192 + NumGlobalWriteVectorsPerThread: 192 + NumLdsBlk: 2 + NumLoadsA: 12 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsCoalescedMXSA: 1 + NumLoadsCoalescedMXSB: 1 + NumLoadsMXSA: 1 + NumLoadsMXSB: 1 + NumLoadsPerpendicularA: 12 + NumLoadsPerpendicularB: 4 + NumLoadsPerpendicularMXSA: 1 + NumLoadsPerpendicularMXSB: 1 + NumThreads: 256 + NumTotalPackedLoadsA: -1 + NumTotalPackedLoadsB: -1 + NumTotalPackedLoadsMXSA: -1 + NumTotalPackedLoadsMXSB: -1 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGROverBarrier: 0 + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 25 + 
SolutionNameMin: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT384x128x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT12_4_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM16_WGMXCC2_WGMXCCGn1 + SourceSwap: false + SpaceFillingAlgo: [] + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: true + StoreSyncOpt: 0 + StoreVectorWidth: 4 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SubGroupMXSA: 8 + SubGroupMXSB: 32 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: 0 + TailloopInNll: false + ThreadTile: [1, 1] + ThreadTile0: 48 + ThreadTile1: 4 + ThreadTileA: 48 + ThreadTileB: 4 + ThreadTileMXSA: 48 + ThreadTileMXSB: 4 + TransposeLDS: 1 + TransposeLDSMetadata: -1 + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMXSA: true + UnrollMajorLDSMXSB: true + Use64bShadowLimit: false + Use64bShadowLimitMX: false + UseCustomMainLoopSchedule: 0 + UseDirect32XEmulation: false + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: false + UseGeneralizedNLCOneA: false + UseGeneralizedNLCOneB: false + UseGeneralizedNLCOneMXSA: false + UseGeneralizedNLCOneMXSB: false + UseGeneralizedNLCOneMetadata: false + UseInstOffsetForGRO: 0 + UseMFMAF32XEmulation: false + UsePLRPack: 0 + UseSgprForGRO: -1 + UseSubtileImpl: true + Valid: true + VectorStore: -1 + VectorWidthA: 1 + 
VectorWidthB: 1 + VectorWidthMXSA: 1 + VectorWidthMXSB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMXSA: 0 + WaveSeparateGlobalReadMXSB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 16 + WorkGroupMappingXCC: 2 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 256 + _DepthUA: 256 + _DepthUB: 256 + _DepthUMXSA: 8 + _DepthUMXSB: 8 + _DepthUMetadata: 256 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + enableLDSTrMXSA: false + enableLDSTrMXSB: false + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + tailLoopOptMXSA: false + tailLoopOptMXSB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AdaptiveGemm: 0 + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1DivByMT1LowbitGT1: 0 + AssertFree1ElementMultiple: 1 + AssertKRingShiftTailWrapOnly: 0 + AssertSummationElementMultiple: 256 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BAddrInterleave: false + BaseName: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT32GHMtx7cclUEZq8HXp3EV83TsDigTCOy4HpQzcwTdNNQ= + BufferLoad: true + BufferStore: 1 + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 256 + DirectToLds: 1 + DirectToLdsA: true + DirectToLdsB: true + DirectToLdsMXSA: true + DirectToLdsMXSB: true + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprMXSA: false + DirectToVgprMXSB: false + DirectToVgprSparseMetadata: false + 
DtlPlusLdsBuf: 0 + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: false + ExpertSchedulingMode: 0 + ExtraLatencyForLR: 0 + ExtraMiLatencyLeft: -1 + ForceDisableShadowInit: false + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 32 + GlobalReadVectorWidthB: 32 + GlobalReadVectorWidthMXSA: 10 + GlobalReadVectorWidthMXSB: 4 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + KRingShift: false + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT320x256x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT10_8_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 256 + LSCB: 256 + LSCMXSA: 8 + LSCMXSB: 8 + LSPA: 32 + LSPB: 32 + LSPMXSA: 320 + LSPMXSB: 128 + LVCA: 8 + LVCB: 8 + LVCMXSA: 1 + LVCMXSB: 2 + LVPA: 1 + LVPB: 1 + LVPMXSA: 32 + LVPMXSB: 32 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsBlockSizePerPadMXSA: 0 + LdsBlockSizePerPadMXSB: 0 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 156672 + LdsInitCVgprs: false + LdsNumBytes: 156672 + 
LdsNumElementsAlignedA: 40960 + LdsNumElementsAlignedB: 32768 + LdsNumElementsAlignedMXSA: 2560 + LdsNumElementsAlignedMXSB: 2048 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 78336 + LdsOffsetB: 45568 + LdsOffsetB_Blk: 123904 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMXSA: 40960 + LdsOffsetMXSA_Blk: 119296 + LdsOffsetMXSB: 43520 + LdsOffsetMXSB_Blk: 121856 + LdsOffsetMetadata: 45568 + LdsOffsetMetadata_Blk: 123904 + LdsPadA: 0 + LdsPadB: 0 + LdsPadMXSA: 0 + LdsPadMXSB: 0 + LdsPadMetadata: 0 + LocalReadVectorWidth: -1 + LocalReadVectorWidthA: 32 + LocalReadVectorWidthB: 32 + LocalReadVectorWidthMXSA: 1 + LocalReadVectorWidthMXSB: 1 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LocalWriteUseSgprMXSA: true + LocalWriteUseSgprMXSB: true + LoopIters: 2 + LoopUnroll: 256 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMXSA: 1 + MIInputPerThreadMXSB: 1 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [10, 8] + MIWaveTileA: 10 + MIWaveTileB: 8 + MIWaveTileMXSA: 10 + MIWaveTileMXSB: 8 + MIWaveTileMetadata: 0 + MacroTile0: 320 + MacroTile1: 256 + MacroTileA: 320 + MacroTileB: 256 + MacroTileMXSA: 320 + MacroTileMXSB: 256 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + MinGRIncPerMfma: 3 + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonDTLTailLoopMXSA: true + NonDTLTailLoopMXSB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 4 + NonTemporalC: 0 + NonTemporalD: 4 + 
NonTemporalE: 0 + NonTemporalMXSA: 0 + NonTemporalMXSB: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 320 + NumGlobalWriteVectorsPerThread: 320 + NumLdsBlk: 2 + NumLoadsA: 10 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsCoalescedMXSA: 1 + NumLoadsCoalescedMXSB: 1 + NumLoadsMXSA: 1 + NumLoadsMXSB: 2 + NumLoadsPerpendicularA: 10 + NumLoadsPerpendicularB: 8 + NumLoadsPerpendicularMXSA: 1 + NumLoadsPerpendicularMXSB: 2 + NumThreads: 256 + NumTotalPackedLoadsA: -1 + NumTotalPackedLoadsB: -1 + NumTotalPackedLoadsMXSA: -1 + NumTotalPackedLoadsMXSB: -1 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGROverBarrier: 0 + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 26 + SolutionNameMin: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT320x256x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT10_8_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM16_WGMXCC2_WGMXCCGn1 + SourceSwap: false + SpaceFillingAlgo: [] + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: true + StoreSyncOpt: 0 + StoreVectorWidth: 4 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + 
SubGroupB: 32 + SubGroupMXSA: 8 + SubGroupMXSB: 32 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: 0 + TailloopInNll: false + ThreadTile: [1, 1] + ThreadTile0: 40 + ThreadTile1: 8 + ThreadTileA: 40 + ThreadTileB: 8 + ThreadTileMXSA: 40 + ThreadTileMXSB: 8 + TransposeLDS: 1 + TransposeLDSMetadata: -1 + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMXSA: true + UnrollMajorLDSMXSB: true + Use64bShadowLimit: false + Use64bShadowLimitMX: false + UseCustomMainLoopSchedule: 0 + UseDirect32XEmulation: false + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: false + UseGeneralizedNLCOneA: false + UseGeneralizedNLCOneB: false + UseGeneralizedNLCOneMXSA: false + UseGeneralizedNLCOneMXSB: false + UseGeneralizedNLCOneMetadata: false + UseInstOffsetForGRO: 0 + UseMFMAF32XEmulation: false + UsePLRPack: 0 + UseSgprForGRO: -1 + UseSubtileImpl: true + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 1 + VectorWidthMXSA: 1 + VectorWidthMXSB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMXSA: 0 + WaveSeparateGlobalReadMXSB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 16 + WorkGroupMappingXCC: 2 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 256 + _DepthUA: 256 + _DepthUB: 256 + _DepthUMXSA: 8 + _DepthUMXSB: 8 + _DepthUMetadata: 256 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + enableLDSTrMXSA: false + enableLDSTrMXSB: false + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + tailLoopOptMXSA: false + tailLoopOptMXSB: false + - 1LDSBuffer: 
0 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AdaptiveGemm: 0 + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1DivByMT1LowbitGT1: 0 + AssertFree1ElementMultiple: 1 + AssertKRingShiftTailWrapOnly: 0 + AssertSummationElementMultiple: 256 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BAddrInterleave: false + BaseName: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT25yENsNJL4B9Vb5GlLD7ofBX9eYUJtt3JL_VzKdaPOs5g= + BufferLoad: true + BufferStore: 1 + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 256 + DirectToLds: 1 + DirectToLdsA: true + DirectToLdsB: true + DirectToLdsMXSA: true + DirectToLdsMXSB: true + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprMXSA: false + DirectToVgprMXSB: false + DirectToVgprSparseMetadata: false + DtlPlusLdsBuf: 0 + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: false + ExpertSchedulingMode: 0 + ExtraLatencyForLR: 0 + ExtraMiLatencyLeft: -1 + ForceDisableShadowInit: false + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 32 + GlobalReadVectorWidthB: 32 + GlobalReadVectorWidthMXSA: 4 + GlobalReadVectorWidthMXSB: 10 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + KRingShift: false + Kernel: true + KernelLanguage: Assembly + KernelNameMin: 
Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT256x320x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT8_10_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 256 + LSCB: 256 + LSCMXSA: 8 + LSCMXSB: 8 + LSPA: 32 + LSPB: 32 + LSPMXSA: 128 + LSPMXSB: 320 + LVCA: 8 + LVCB: 8 + LVCMXSA: 2 + LVCMXSB: 1 + LVPA: 1 + LVPB: 1 + LVPMXSA: 32 + LVPMXSB: 32 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsBlockSizePerPadMXSA: 0 + LdsBlockSizePerPadMXSB: 0 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 156672 + LdsInitCVgprs: false + LdsNumBytes: 156672 + LdsNumElementsAlignedA: 32768 + LdsNumElementsAlignedB: 40960 + LdsNumElementsAlignedMXSA: 2048 + LdsNumElementsAlignedMXSB: 2560 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 78336 + LdsOffsetB: 37376 + LdsOffsetB_Blk: 115712 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMXSA: 32768 + LdsOffsetMXSA_Blk: 111104 + LdsOffsetMXSB: 34816 + LdsOffsetMXSB_Blk: 113152 + LdsOffsetMetadata: 37376 + LdsOffsetMetadata_Blk: 115712 + LdsPadA: 0 + LdsPadB: 0 + LdsPadMXSA: 0 + LdsPadMXSB: 0 + LdsPadMetadata: 0 + LocalReadVectorWidth: -1 + LocalReadVectorWidthA: 32 + LocalReadVectorWidthB: 32 + LocalReadVectorWidthMXSA: 1 + LocalReadVectorWidthMXSB: 1 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LocalWriteUseSgprMXSA: true + LocalWriteUseSgprMXSB: true + LoopIters: 2 + LoopUnroll: 256 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + 
MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMXSA: 1 + MIInputPerThreadMXSB: 1 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [8, 10] + MIWaveTileA: 8 + MIWaveTileB: 10 + MIWaveTileMXSA: 8 + MIWaveTileMXSB: 10 + MIWaveTileMetadata: 0 + MacroTile0: 256 + MacroTile1: 320 + MacroTileA: 256 + MacroTileB: 320 + MacroTileMXSA: 256 + MacroTileMXSB: 320 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + MinGRIncPerMfma: 3 + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonDTLTailLoopMXSA: true + NonDTLTailLoopMXSB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 4 + NonTemporalC: 0 + NonTemporalD: 4 + NonTemporalE: 0 + NonTemporalMXSA: 0 + NonTemporalMXSB: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 320 + NumGlobalWriteVectorsPerThread: 320 + NumLdsBlk: 2 + NumLoadsA: 8 + NumLoadsB: 10 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsCoalescedMXSA: 1 + NumLoadsCoalescedMXSB: 1 + NumLoadsMXSA: 2 + NumLoadsMXSB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 10 + NumLoadsPerpendicularMXSA: 2 + NumLoadsPerpendicularMXSB: 1 + NumThreads: 256 + NumTotalPackedLoadsA: -1 + NumTotalPackedLoadsB: -1 + NumTotalPackedLoadsMXSA: -1 + NumTotalPackedLoadsMXSB: -1 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGROverBarrier: 0 + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 27 + 
SolutionNameMin: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT256x320x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT8_10_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM16_WGMXCC2_WGMXCCGn1 + SourceSwap: false + SpaceFillingAlgo: [] + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: true + StoreSyncOpt: 0 + StoreVectorWidth: 4 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SubGroupMXSA: 8 + SubGroupMXSB: 32 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: 0 + TailloopInNll: false + ThreadTile: [1, 1] + ThreadTile0: 32 + ThreadTile1: 10 + ThreadTileA: 32 + ThreadTileB: 10 + ThreadTileMXSA: 32 + ThreadTileMXSB: 10 + TransposeLDS: 1 + TransposeLDSMetadata: -1 + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMXSA: true + UnrollMajorLDSMXSB: true + Use64bShadowLimit: false + Use64bShadowLimitMX: false + UseCustomMainLoopSchedule: 0 + UseDirect32XEmulation: false + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: false + UseGeneralizedNLCOneA: false + UseGeneralizedNLCOneB: false + UseGeneralizedNLCOneMXSA: false + UseGeneralizedNLCOneMXSB: false + UseGeneralizedNLCOneMetadata: false + UseInstOffsetForGRO: 0 + UseMFMAF32XEmulation: false + UsePLRPack: 0 + UseSgprForGRO: -1 + UseSubtileImpl: true + Valid: true + VectorStore: -1 + VectorWidthA: 1 
+ VectorWidthB: 1 + VectorWidthMXSA: 1 + VectorWidthMXSB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMXSA: 0 + WaveSeparateGlobalReadMXSB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 16 + WorkGroupMappingXCC: 2 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 256 + _DepthUA: 256 + _DepthUB: 256 + _DepthUMXSA: 8 + _DepthUMXSB: 8 + _DepthUMetadata: 256 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + enableLDSTrMXSA: false + enableLDSTrMXSB: false + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + tailLoopOptMXSA: false + tailLoopOptMXSB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AdaptiveGemm: 0 + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1DivByMT1LowbitGT1: 0 + AssertFree1ElementMultiple: 1 + AssertKRingShiftTailWrapOnly: 0 + AssertSummationElementMultiple: 256 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BAddrInterleave: false + BaseName: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT64IqbWxZkPcZWy3vA8g9HsDHfQ7n72vJptvmQ79eUQ7kk= + BufferLoad: true + BufferStore: 1 + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 256 + DirectToLds: 1 + DirectToLdsA: true + DirectToLdsB: true + DirectToLdsMXSA: true + DirectToLdsMXSB: true + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprMXSA: false + DirectToVgprMXSB: false + DirectToVgprSparseMetadata: false 
+ DtlPlusLdsBuf: 0 + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: false + ExpertSchedulingMode: 0 + ExtraLatencyForLR: 0 + ExtraMiLatencyLeft: -1 + ForceDisableShadowInit: false + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 32 + GlobalReadVectorWidthB: 32 + GlobalReadVectorWidthMXSA: 2 + GlobalReadVectorWidthMXSB: 14 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + KRingShift: false + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT64x448x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT2_14_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 256 + LSCB: 256 + LSCMXSA: 8 + LSCMXSB: 8 + LSPA: 32 + LSPB: 32 + LSPMXSA: 64 + LSPMXSB: 448 + LVCA: 8 + LVCB: 8 + LVCMXSA: 4 + LVCMXSB: 1 + LVPA: 1 + LVPB: 1 + LVPMXSA: 32 + LVPMXSB: 32 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsBlockSizePerPadMXSA: 0 + LdsBlockSizePerPadMXSB: 0 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 139264 + LdsInitCVgprs: false + LdsNumBytes: 139264 + 
LdsNumElementsAlignedA: 8192 + LdsNumElementsAlignedB: 57344 + LdsNumElementsAlignedMXSA: 512 + LdsNumElementsAlignedMXSB: 3584 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 69632 + LdsOffsetB: 12288 + LdsOffsetB_Blk: 81920 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMXSA: 8192 + LdsOffsetMXSA_Blk: 77824 + LdsOffsetMXSB: 8704 + LdsOffsetMXSB_Blk: 78336 + LdsOffsetMetadata: 12288 + LdsOffsetMetadata_Blk: 81920 + LdsPadA: 0 + LdsPadB: 0 + LdsPadMXSA: 0 + LdsPadMXSB: 0 + LdsPadMetadata: 0 + LocalReadVectorWidth: -1 + LocalReadVectorWidthA: 32 + LocalReadVectorWidthB: 32 + LocalReadVectorWidthMXSA: 1 + LocalReadVectorWidthMXSB: 1 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LocalWriteUseSgprMXSA: true + LocalWriteUseSgprMXSB: true + LoopIters: 2 + LoopUnroll: 256 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMXSA: 1 + MIInputPerThreadMXSB: 1 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [2, 14] + MIWaveTileA: 2 + MIWaveTileB: 14 + MIWaveTileMXSA: 2 + MIWaveTileMXSB: 14 + MIWaveTileMetadata: 0 + MacroTile0: 64 + MacroTile1: 448 + MacroTileA: 64 + MacroTileB: 448 + MacroTileMXSA: 64 + MacroTileMXSB: 448 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + MinGRIncPerMfma: 3 + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonDTLTailLoopMXSA: true + NonDTLTailLoopMXSB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 4 + NonTemporalC: 0 + NonTemporalD: 4 + NonTemporalE: 0 + 
NonTemporalMXSA: 0 + NonTemporalMXSB: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 112 + NumGlobalWriteVectorsPerThread: 112 + NumLdsBlk: 2 + NumLoadsA: 2 + NumLoadsB: 14 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsCoalescedMXSA: 1 + NumLoadsCoalescedMXSB: 1 + NumLoadsMXSA: 1 + NumLoadsMXSB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 14 + NumLoadsPerpendicularMXSA: 1 + NumLoadsPerpendicularMXSB: 1 + NumThreads: 256 + NumTotalPackedLoadsA: -1 + NumTotalPackedLoadsB: -1 + NumTotalPackedLoadsMXSA: -1 + NumTotalPackedLoadsMXSB: -1 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGROverBarrier: 0 + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 28 + SolutionNameMin: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT64x448x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT2_14_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM16_WGMXCC2_WGMXCCGn1 + SourceSwap: false + SpaceFillingAlgo: [] + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: true + StoreSyncOpt: 0 + StoreVectorWidth: 4 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + 
SubGroupMXSA: 8 + SubGroupMXSB: 32 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: 0 + TailloopInNll: false + ThreadTile: [1, 1] + ThreadTile0: 8 + ThreadTile1: 14 + ThreadTileA: 8 + ThreadTileB: 14 + ThreadTileMXSA: 8 + ThreadTileMXSB: 14 + TransposeLDS: 1 + TransposeLDSMetadata: -1 + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMXSA: true + UnrollMajorLDSMXSB: true + Use64bShadowLimit: false + Use64bShadowLimitMX: false + UseCustomMainLoopSchedule: 0 + UseDirect32XEmulation: false + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: false + UseGeneralizedNLCOneA: false + UseGeneralizedNLCOneB: false + UseGeneralizedNLCOneMXSA: false + UseGeneralizedNLCOneMXSB: false + UseGeneralizedNLCOneMetadata: false + UseInstOffsetForGRO: 0 + UseMFMAF32XEmulation: false + UsePLRPack: 0 + UseSgprForGRO: -1 + UseSubtileImpl: true + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 1 + VectorWidthMXSA: 1 + VectorWidthMXSB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMXSA: 0 + WaveSeparateGlobalReadMXSB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 16 + WorkGroupMappingXCC: 2 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 256 + _DepthUA: 256 + _DepthUB: 256 + _DepthUMXSA: 8 + _DepthUMXSB: 8 + _DepthUMetadata: 256 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + enableLDSTrMXSA: false + enableLDSTrMXSB: false + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + tailLoopOptMXSA: false + tailLoopOptMXSB: false + - 1LDSBuffer: 0 + 
ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AdaptiveGemm: 0 + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1DivByMT1LowbitGT1: 0 + AssertFree1ElementMultiple: 1 + AssertKRingShiftTailWrapOnly: 0 + AssertSummationElementMultiple: 256 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BAddrInterleave: false + BaseName: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT44JIOkXLbpjhS22SFIEupmgY9bhtzcXu69kfhwBJBO5wo= + BufferLoad: true + BufferStore: 1 + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 256 + DirectToLds: 1 + DirectToLdsA: true + DirectToLdsB: true + DirectToLdsMXSA: true + DirectToLdsMXSB: true + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprMXSA: false + DirectToVgprMXSB: false + DirectToVgprSparseMetadata: false + DtlPlusLdsBuf: 0 + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: false + ExpertSchedulingMode: 0 + ExtraLatencyForLR: 0 + ExtraMiLatencyLeft: -1 + ForceDisableShadowInit: false + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 32 + GlobalReadVectorWidthB: 32 + GlobalReadVectorWidthMXSA: 14 + GlobalReadVectorWidthMXSB: 2 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + KRingShift: false + Kernel: true + KernelLanguage: Assembly + KernelNameMin: 
Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT448x64x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT14_2_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 256 + LSCB: 256 + LSCMXSA: 8 + LSCMXSB: 8 + LSPA: 32 + LSPB: 32 + LSPMXSA: 448 + LSPMXSB: 64 + LVCA: 8 + LVCB: 8 + LVCMXSA: 1 + LVCMXSB: 4 + LVPA: 1 + LVPB: 1 + LVPMXSA: 32 + LVPMXSB: 32 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsBlockSizePerPadMXSA: 0 + LdsBlockSizePerPadMXSB: 0 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 139264 + LdsInitCVgprs: false + LdsNumBytes: 139264 + LdsNumElementsAlignedA: 57344 + LdsNumElementsAlignedB: 8192 + LdsNumElementsAlignedMXSA: 3584 + LdsNumElementsAlignedMXSB: 512 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 69632 + LdsOffsetB: 61440 + LdsOffsetB_Blk: 131072 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMXSA: 57344 + LdsOffsetMXSA_Blk: 126976 + LdsOffsetMXSB: 60928 + LdsOffsetMXSB_Blk: 130560 + LdsOffsetMetadata: 61440 + LdsOffsetMetadata_Blk: 131072 + LdsPadA: 0 + LdsPadB: 0 + LdsPadMXSA: 0 + LdsPadMXSB: 0 + LdsPadMetadata: 0 + LocalReadVectorWidth: -1 + LocalReadVectorWidthA: 32 + LocalReadVectorWidthB: 32 + LocalReadVectorWidthMXSA: 1 + LocalReadVectorWidthMXSB: 1 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LocalWriteUseSgprMXSA: true + LocalWriteUseSgprMXSB: true + LoopIters: 2 + LoopUnroll: 256 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + 
MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMXSA: 1 + MIInputPerThreadMXSB: 1 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [14, 2] + MIWaveTileA: 14 + MIWaveTileB: 2 + MIWaveTileMXSA: 14 + MIWaveTileMXSB: 2 + MIWaveTileMetadata: 0 + MacroTile0: 448 + MacroTile1: 64 + MacroTileA: 448 + MacroTileB: 64 + MacroTileMXSA: 448 + MacroTileMXSB: 64 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + MinGRIncPerMfma: 3 + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonDTLTailLoopMXSA: true + NonDTLTailLoopMXSB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 4 + NonTemporalC: 0 + NonTemporalD: 4 + NonTemporalE: 0 + NonTemporalMXSA: 0 + NonTemporalMXSB: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 112 + NumGlobalWriteVectorsPerThread: 112 + NumLdsBlk: 2 + NumLoadsA: 14 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsCoalescedMXSA: 1 + NumLoadsCoalescedMXSB: 1 + NumLoadsMXSA: 1 + NumLoadsMXSB: 1 + NumLoadsPerpendicularA: 14 + NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularMXSA: 1 + NumLoadsPerpendicularMXSB: 1 + NumThreads: 256 + NumTotalPackedLoadsA: -1 + NumTotalPackedLoadsB: -1 + NumTotalPackedLoadsMXSA: -1 + NumTotalPackedLoadsMXSB: -1 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGROverBarrier: 0 + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 29 + 
SolutionNameMin: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT448x64x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT14_2_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM16_WGMXCC2_WGMXCCGn1 + SourceSwap: false + SpaceFillingAlgo: [] + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: true + StoreSyncOpt: 0 + StoreVectorWidth: 4 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SubGroupMXSA: 8 + SubGroupMXSB: 32 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: 0 + TailloopInNll: false + ThreadTile: [1, 1] + ThreadTile0: 56 + ThreadTile1: 2 + ThreadTileA: 56 + ThreadTileB: 2 + ThreadTileMXSA: 56 + ThreadTileMXSB: 2 + TransposeLDS: 1 + TransposeLDSMetadata: -1 + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMXSA: true + UnrollMajorLDSMXSB: true + Use64bShadowLimit: false + Use64bShadowLimitMX: false + UseCustomMainLoopSchedule: 0 + UseDirect32XEmulation: false + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: false + UseGeneralizedNLCOneA: false + UseGeneralizedNLCOneB: false + UseGeneralizedNLCOneMXSA: false + UseGeneralizedNLCOneMXSB: false + UseGeneralizedNLCOneMetadata: false + UseInstOffsetForGRO: 0 + UseMFMAF32XEmulation: false + UsePLRPack: 0 + UseSgprForGRO: -1 + UseSubtileImpl: true + Valid: true + VectorStore: -1 + VectorWidthA: 1 + 
VectorWidthB: 1 + VectorWidthMXSA: 1 + VectorWidthMXSB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMXSA: 0 + WaveSeparateGlobalReadMXSB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 16 + WorkGroupMappingXCC: 2 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 256 + _DepthUA: 256 + _DepthUB: 256 + _DepthUMXSA: 8 + _DepthUMXSB: 8 + _DepthUMetadata: 256 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + enableLDSTrMXSA: false + enableLDSTrMXSB: false + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + tailLoopOptMXSA: false + tailLoopOptMXSB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AdaptiveGemm: 0 + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1DivByMT1LowbitGT1: 0 + AssertFree1ElementMultiple: 1 + AssertKRingShiftTailWrapOnly: 0 + AssertSummationElementMultiple: 256 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BAddrInterleave: false + BaseName: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT12NCUb78G7QzN6ng2b3-8rphTRtJXo20S1KvmwH6k1FLQ= + BufferLoad: true + BufferStore: 1 + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 256 + DirectToLds: 1 + DirectToLdsA: true + DirectToLdsB: true + DirectToLdsMXSA: true + DirectToLdsMXSB: true + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprMXSA: false + DirectToVgprMXSB: false + DirectToVgprSparseMetadata: false + 
DtlPlusLdsBuf: 0 + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: false + ExpertSchedulingMode: 0 + ExtraLatencyForLR: 0 + ExtraMiLatencyLeft: -1 + ForceDisableShadowInit: false + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 32 + GlobalReadVectorWidthB: 32 + GlobalReadVectorWidthMXSA: 4 + GlobalReadVectorWidthMXSB: 14 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + KRingShift: false + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT128x448x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT4_14_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 256 + LSCB: 256 + LSCMXSA: 8 + LSCMXSB: 8 + LSPA: 32 + LSPB: 32 + LSPMXSA: 128 + LSPMXSB: 448 + LVCA: 8 + LVCB: 8 + LVCMXSA: 2 + LVCMXSB: 1 + LVPA: 1 + LVPB: 1 + LVPMXSA: 32 + LVPMXSB: 32 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsBlockSizePerPadMXSA: 0 + LdsBlockSizePerPadMXSB: 0 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 156672 + LdsInitCVgprs: false + LdsNumBytes: 156672 + 
LdsNumElementsAlignedA: 16384 + LdsNumElementsAlignedB: 57344 + LdsNumElementsAlignedMXSA: 1024 + LdsNumElementsAlignedMXSB: 3584 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 78336 + LdsOffsetB: 20992 + LdsOffsetB_Blk: 99328 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMXSA: 16384 + LdsOffsetMXSA_Blk: 94720 + LdsOffsetMXSB: 17408 + LdsOffsetMXSB_Blk: 95744 + LdsOffsetMetadata: 20992 + LdsOffsetMetadata_Blk: 99328 + LdsPadA: 0 + LdsPadB: 0 + LdsPadMXSA: 0 + LdsPadMXSB: 0 + LdsPadMetadata: 0 + LocalReadVectorWidth: -1 + LocalReadVectorWidthA: 32 + LocalReadVectorWidthB: 32 + LocalReadVectorWidthMXSA: 1 + LocalReadVectorWidthMXSB: 1 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LocalWriteUseSgprMXSA: true + LocalWriteUseSgprMXSB: true + LoopIters: 2 + LoopUnroll: 256 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMXSA: 1 + MIInputPerThreadMXSB: 1 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 14] + MIWaveTileA: 4 + MIWaveTileB: 14 + MIWaveTileMXSA: 4 + MIWaveTileMXSB: 14 + MIWaveTileMetadata: 0 + MacroTile0: 128 + MacroTile1: 448 + MacroTileA: 128 + MacroTileB: 448 + MacroTileMXSA: 128 + MacroTileMXSB: 448 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + MinGRIncPerMfma: 3 + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonDTLTailLoopMXSA: true + NonDTLTailLoopMXSB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 4 + NonTemporalC: 0 + NonTemporalD: 4 + 
NonTemporalE: 0 + NonTemporalMXSA: 0 + NonTemporalMXSB: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 224 + NumGlobalWriteVectorsPerThread: 224 + NumLdsBlk: 2 + NumLoadsA: 4 + NumLoadsB: 14 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsCoalescedMXSA: 1 + NumLoadsCoalescedMXSB: 1 + NumLoadsMXSA: 1 + NumLoadsMXSB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 14 + NumLoadsPerpendicularMXSA: 1 + NumLoadsPerpendicularMXSB: 1 + NumThreads: 256 + NumTotalPackedLoadsA: -1 + NumTotalPackedLoadsB: -1 + NumTotalPackedLoadsMXSA: -1 + NumTotalPackedLoadsMXSB: -1 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGROverBarrier: 0 + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 30 + SolutionNameMin: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT128x448x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT4_14_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM16_WGMXCC2_WGMXCCGn1 + SourceSwap: false + SpaceFillingAlgo: [] + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: true + StoreSyncOpt: 0 + StoreVectorWidth: 4 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + 
SubGroupB: 32 + SubGroupMXSA: 8 + SubGroupMXSB: 32 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: 0 + TailloopInNll: false + ThreadTile: [1, 1] + ThreadTile0: 16 + ThreadTile1: 14 + ThreadTileA: 16 + ThreadTileB: 14 + ThreadTileMXSA: 16 + ThreadTileMXSB: 14 + TransposeLDS: 1 + TransposeLDSMetadata: -1 + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMXSA: true + UnrollMajorLDSMXSB: true + Use64bShadowLimit: false + Use64bShadowLimitMX: false + UseCustomMainLoopSchedule: 0 + UseDirect32XEmulation: false + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: false + UseGeneralizedNLCOneA: false + UseGeneralizedNLCOneB: false + UseGeneralizedNLCOneMXSA: false + UseGeneralizedNLCOneMXSB: false + UseGeneralizedNLCOneMetadata: false + UseInstOffsetForGRO: 0 + UseMFMAF32XEmulation: false + UsePLRPack: 0 + UseSgprForGRO: -1 + UseSubtileImpl: true + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 1 + VectorWidthMXSA: 1 + VectorWidthMXSB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMXSA: 0 + WaveSeparateGlobalReadMXSB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 16 + WorkGroupMappingXCC: 2 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 256 + _DepthUA: 256 + _DepthUB: 256 + _DepthUMXSA: 8 + _DepthUMXSB: 8 + _DepthUMetadata: 256 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + enableLDSTrMXSA: false + enableLDSTrMXSB: false + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + tailLoopOptMXSA: false + tailLoopOptMXSB: false + - 
1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AdaptiveGemm: 0 + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1DivByMT1LowbitGT1: 0 + AssertFree1ElementMultiple: 1 + AssertKRingShiftTailWrapOnly: 0 + AssertSummationElementMultiple: 256 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BAddrInterleave: false + BaseName: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT44K2rcrzW2gkNghqcoHIxk3lU0xSXGNO6q0t-lzqdm54M= + BufferLoad: true + BufferStore: 1 + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 256 + DirectToLds: 1 + DirectToLdsA: true + DirectToLdsB: true + DirectToLdsMXSA: true + DirectToLdsMXSB: true + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprMXSA: false + DirectToVgprMXSB: false + DirectToVgprSparseMetadata: false + DtlPlusLdsBuf: 0 + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: false + ExpertSchedulingMode: 0 + ExtraLatencyForLR: 0 + ExtraMiLatencyLeft: -1 + ForceDisableShadowInit: false + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 32 + GlobalReadVectorWidthB: 32 + GlobalReadVectorWidthMXSA: 14 + GlobalReadVectorWidthMXSB: 4 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + KRingShift: false + Kernel: true + KernelLanguage: Assembly + KernelNameMin: 
Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT448x128x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT14_4_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 256 + LSCB: 256 + LSCMXSA: 8 + LSCMXSB: 8 + LSPA: 32 + LSPB: 32 + LSPMXSA: 448 + LSPMXSB: 128 + LVCA: 8 + LVCB: 8 + LVCMXSA: 1 + LVCMXSB: 2 + LVPA: 1 + LVPB: 1 + LVPMXSA: 32 + LVPMXSB: 32 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsBlockSizePerPadMXSA: 0 + LdsBlockSizePerPadMXSB: 0 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 156672 + LdsInitCVgprs: false + LdsNumBytes: 156672 + LdsNumElementsAlignedA: 57344 + LdsNumElementsAlignedB: 16384 + LdsNumElementsAlignedMXSA: 3584 + LdsNumElementsAlignedMXSB: 1024 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 78336 + LdsOffsetB: 61952 + LdsOffsetB_Blk: 140288 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMXSA: 57344 + LdsOffsetMXSA_Blk: 135680 + LdsOffsetMXSB: 60928 + LdsOffsetMXSB_Blk: 139264 + LdsOffsetMetadata: 61952 + LdsOffsetMetadata_Blk: 140288 + LdsPadA: 0 + LdsPadB: 0 + LdsPadMXSA: 0 + LdsPadMXSB: 0 + LdsPadMetadata: 0 + LocalReadVectorWidth: -1 + LocalReadVectorWidthA: 32 + LocalReadVectorWidthB: 32 + LocalReadVectorWidthMXSA: 1 + LocalReadVectorWidthMXSB: 1 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LocalWriteUseSgprMXSA: true + LocalWriteUseSgprMXSB: true + LoopIters: 2 + LoopUnroll: 256 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + 
MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMXSA: 1 + MIInputPerThreadMXSB: 1 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [14, 4] + MIWaveTileA: 14 + MIWaveTileB: 4 + MIWaveTileMXSA: 14 + MIWaveTileMXSB: 4 + MIWaveTileMetadata: 0 + MacroTile0: 448 + MacroTile1: 128 + MacroTileA: 448 + MacroTileB: 128 + MacroTileMXSA: 448 + MacroTileMXSB: 128 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + MinGRIncPerMfma: 3 + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonDTLTailLoopMXSA: true + NonDTLTailLoopMXSB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 4 + NonTemporalC: 0 + NonTemporalD: 4 + NonTemporalE: 0 + NonTemporalMXSA: 0 + NonTemporalMXSB: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 224 + NumGlobalWriteVectorsPerThread: 224 + NumLdsBlk: 2 + NumLoadsA: 14 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsCoalescedMXSA: 1 + NumLoadsCoalescedMXSB: 1 + NumLoadsMXSA: 1 + NumLoadsMXSB: 1 + NumLoadsPerpendicularA: 14 + NumLoadsPerpendicularB: 4 + NumLoadsPerpendicularMXSA: 1 + NumLoadsPerpendicularMXSB: 1 + NumThreads: 256 + NumTotalPackedLoadsA: -1 + NumTotalPackedLoadsB: -1 + NumTotalPackedLoadsMXSA: -1 + NumTotalPackedLoadsMXSB: -1 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGROverBarrier: 0 + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 31 + 
SolutionNameMin: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT448x128x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT14_4_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM16_WGMXCC2_WGMXCCGn1 + SourceSwap: false + SpaceFillingAlgo: [] + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: true + StoreSyncOpt: 0 + StoreVectorWidth: 4 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SubGroupMXSA: 8 + SubGroupMXSB: 32 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: 0 + TailloopInNll: false + ThreadTile: [1, 1] + ThreadTile0: 56 + ThreadTile1: 4 + ThreadTileA: 56 + ThreadTileB: 4 + ThreadTileMXSA: 56 + ThreadTileMXSB: 4 + TransposeLDS: 1 + TransposeLDSMetadata: -1 + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMXSA: true + UnrollMajorLDSMXSB: true + Use64bShadowLimit: false + Use64bShadowLimitMX: false + UseCustomMainLoopSchedule: 0 + UseDirect32XEmulation: false + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: false + UseGeneralizedNLCOneA: false + UseGeneralizedNLCOneB: false + UseGeneralizedNLCOneMXSA: false + UseGeneralizedNLCOneMXSB: false + UseGeneralizedNLCOneMetadata: false + UseInstOffsetForGRO: 0 + UseMFMAF32XEmulation: false + UsePLRPack: 0 + UseSgprForGRO: -1 + UseSubtileImpl: true + Valid: true + VectorStore: -1 + VectorWidthA: 1 + 
VectorWidthB: 1 + VectorWidthMXSA: 1 + VectorWidthMXSB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMXSA: 0 + WaveSeparateGlobalReadMXSB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 16 + WorkGroupMappingXCC: 2 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 256 + _DepthUA: 256 + _DepthUB: 256 + _DepthUMXSA: 8 + _DepthUMXSB: 8 + _DepthUMetadata: 256 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + enableLDSTrMXSA: false + enableLDSTrMXSB: false + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + tailLoopOptMXSA: false + tailLoopOptMXSB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AdaptiveGemm: 0 + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1DivByMT1LowbitGT1: 0 + AssertFree1ElementMultiple: 1 + AssertKRingShiftTailWrapOnly: 0 + AssertSummationElementMultiple: 256 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BAddrInterleave: false + BaseName: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT197rqXtfRTrgckooWIHV9QRZkAUvBYDajyR7fyXPSaOfI= + BufferLoad: true + BufferStore: 1 + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 256 + DirectToLds: 1 + DirectToLdsA: true + DirectToLdsB: true + DirectToLdsMXSA: true + DirectToLdsMXSB: true + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprMXSA: false + DirectToVgprMXSB: false + DirectToVgprSparseMetadata: false + 
DtlPlusLdsBuf: 0 + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: false + ExpertSchedulingMode: 0 + ExtraLatencyForLR: 0 + ExtraMiLatencyLeft: -1 + ForceDisableShadowInit: false + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 32 + GlobalReadVectorWidthB: 32 + GlobalReadVectorWidthMXSA: 6 + GlobalReadVectorWidthMXSB: 12 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + KRingShift: false + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT192x384x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT6_12_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 256 + LSCB: 256 + LSCMXSA: 8 + LSCMXSB: 8 + LSPA: 32 + LSPB: 32 + LSPMXSA: 192 + LSPMXSB: 384 + LVCA: 8 + LVCB: 8 + LVCMXSA: 2 + LVCMXSB: 1 + LVPA: 1 + LVPB: 1 + LVPMXSA: 32 + LVPMXSB: 32 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsBlockSizePerPadMXSA: 0 + LdsBlockSizePerPadMXSB: 0 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 156672 + LdsInitCVgprs: false + LdsNumBytes: 156672 + 
LdsNumElementsAlignedA: 24576 + LdsNumElementsAlignedB: 49152 + LdsNumElementsAlignedMXSA: 1536 + LdsNumElementsAlignedMXSB: 3072 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 78336 + LdsOffsetB: 29184 + LdsOffsetB_Blk: 107520 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMXSA: 24576 + LdsOffsetMXSA_Blk: 102912 + LdsOffsetMXSB: 26112 + LdsOffsetMXSB_Blk: 104448 + LdsOffsetMetadata: 29184 + LdsOffsetMetadata_Blk: 107520 + LdsPadA: 0 + LdsPadB: 0 + LdsPadMXSA: 0 + LdsPadMXSB: 0 + LdsPadMetadata: 0 + LocalReadVectorWidth: -1 + LocalReadVectorWidthA: 32 + LocalReadVectorWidthB: 32 + LocalReadVectorWidthMXSA: 1 + LocalReadVectorWidthMXSB: 1 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LocalWriteUseSgprMXSA: true + LocalWriteUseSgprMXSB: true + LoopIters: 2 + LoopUnroll: 256 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMXSA: 1 + MIInputPerThreadMXSB: 1 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [6, 12] + MIWaveTileA: 6 + MIWaveTileB: 12 + MIWaveTileMXSA: 6 + MIWaveTileMXSB: 12 + MIWaveTileMetadata: 0 + MacroTile0: 192 + MacroTile1: 384 + MacroTileA: 192 + MacroTileB: 384 + MacroTileMXSA: 192 + MacroTileMXSB: 384 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + MinGRIncPerMfma: 3 + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonDTLTailLoopMXSA: true + NonDTLTailLoopMXSB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 4 + NonTemporalC: 0 + NonTemporalD: 4 + 
NonTemporalE: 0 + NonTemporalMXSA: 0 + NonTemporalMXSB: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 288 + NumGlobalWriteVectorsPerThread: 288 + NumLdsBlk: 2 + NumLoadsA: 6 + NumLoadsB: 12 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsCoalescedMXSA: 1 + NumLoadsCoalescedMXSB: 1 + NumLoadsMXSA: 1 + NumLoadsMXSB: 1 + NumLoadsPerpendicularA: 6 + NumLoadsPerpendicularB: 12 + NumLoadsPerpendicularMXSA: 1 + NumLoadsPerpendicularMXSB: 1 + NumThreads: 256 + NumTotalPackedLoadsA: -1 + NumTotalPackedLoadsB: -1 + NumTotalPackedLoadsMXSA: -1 + NumTotalPackedLoadsMXSB: -1 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGROverBarrier: 0 + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 32 + SolutionNameMin: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT192x384x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT6_12_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM16_WGMXCC2_WGMXCCGn1 + SourceSwap: false + SpaceFillingAlgo: [] + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: true + StoreSyncOpt: 0 + StoreVectorWidth: 4 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + 
SubGroupB: 32 + SubGroupMXSA: 8 + SubGroupMXSB: 32 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: 0 + TailloopInNll: false + ThreadTile: [1, 1] + ThreadTile0: 24 + ThreadTile1: 12 + ThreadTileA: 24 + ThreadTileB: 12 + ThreadTileMXSA: 24 + ThreadTileMXSB: 12 + TransposeLDS: 1 + TransposeLDSMetadata: -1 + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMXSA: true + UnrollMajorLDSMXSB: true + Use64bShadowLimit: false + Use64bShadowLimitMX: false + UseCustomMainLoopSchedule: 0 + UseDirect32XEmulation: false + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: false + UseGeneralizedNLCOneA: false + UseGeneralizedNLCOneB: false + UseGeneralizedNLCOneMXSA: false + UseGeneralizedNLCOneMXSB: false + UseGeneralizedNLCOneMetadata: false + UseInstOffsetForGRO: 0 + UseMFMAF32XEmulation: false + UsePLRPack: 0 + UseSgprForGRO: -1 + UseSubtileImpl: true + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 1 + VectorWidthMXSA: 1 + VectorWidthMXSB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMXSA: 0 + WaveSeparateGlobalReadMXSB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 16 + WorkGroupMappingXCC: 2 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 256 + _DepthUA: 256 + _DepthUB: 256 + _DepthUMXSA: 8 + _DepthUMXSB: 8 + _DepthUMetadata: 256 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + enableLDSTrMXSA: false + enableLDSTrMXSB: false + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + tailLoopOptMXSA: false + tailLoopOptMXSB: false + - 
1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AdaptiveGemm: 0 + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1DivByMT1LowbitGT1: 0 + AssertFree1ElementMultiple: 1 + AssertKRingShiftTailWrapOnly: 0 + AssertSummationElementMultiple: 256 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BAddrInterleave: false + BaseName: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT38M4Hg58ewW76BBGjsIKCCfd9HK3CXsd22WqnddQrMdIo= + BufferLoad: true + BufferStore: 1 + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 256 + DirectToLds: 1 + DirectToLdsA: true + DirectToLdsB: true + DirectToLdsMXSA: true + DirectToLdsMXSB: true + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprMXSA: false + DirectToVgprMXSB: false + DirectToVgprSparseMetadata: false + DtlPlusLdsBuf: 0 + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: false + ExpertSchedulingMode: 0 + ExtraLatencyForLR: 0 + ExtraMiLatencyLeft: -1 + ForceDisableShadowInit: false + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 32 + GlobalReadVectorWidthB: 32 + GlobalReadVectorWidthMXSA: 12 + GlobalReadVectorWidthMXSB: 6 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + KRingShift: false + Kernel: true + KernelLanguage: Assembly + KernelNameMin: 
Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT384x192x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT12_6_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 256 + LSCB: 256 + LSCMXSA: 8 + LSCMXSB: 8 + LSPA: 32 + LSPB: 32 + LSPMXSA: 384 + LSPMXSB: 192 + LVCA: 8 + LVCB: 8 + LVCMXSA: 1 + LVCMXSB: 2 + LVPA: 1 + LVPB: 1 + LVPMXSA: 32 + LVPMXSB: 32 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsBlockSizePerPadMXSA: 0 + LdsBlockSizePerPadMXSB: 0 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 156672 + LdsInitCVgprs: false + LdsNumBytes: 156672 + LdsNumElementsAlignedA: 49152 + LdsNumElementsAlignedB: 24576 + LdsNumElementsAlignedMXSA: 3072 + LdsNumElementsAlignedMXSB: 1536 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 78336 + LdsOffsetB: 53760 + LdsOffsetB_Blk: 132096 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMXSA: 49152 + LdsOffsetMXSA_Blk: 127488 + LdsOffsetMXSB: 52224 + LdsOffsetMXSB_Blk: 130560 + LdsOffsetMetadata: 53760 + LdsOffsetMetadata_Blk: 132096 + LdsPadA: 0 + LdsPadB: 0 + LdsPadMXSA: 0 + LdsPadMXSB: 0 + LdsPadMetadata: 0 + LocalReadVectorWidth: -1 + LocalReadVectorWidthA: 32 + LocalReadVectorWidthB: 32 + LocalReadVectorWidthMXSA: 1 + LocalReadVectorWidthMXSB: 1 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LocalWriteUseSgprMXSA: true + LocalWriteUseSgprMXSB: true + LoopIters: 2 + LoopUnroll: 256 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + 
MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMXSA: 1 + MIInputPerThreadMXSB: 1 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [12, 6] + MIWaveTileA: 12 + MIWaveTileB: 6 + MIWaveTileMXSA: 12 + MIWaveTileMXSB: 6 + MIWaveTileMetadata: 0 + MacroTile0: 384 + MacroTile1: 192 + MacroTileA: 384 + MacroTileB: 192 + MacroTileMXSA: 384 + MacroTileMXSB: 192 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + MinGRIncPerMfma: 3 + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonDTLTailLoopMXSA: true + NonDTLTailLoopMXSB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 4 + NonTemporalC: 0 + NonTemporalD: 4 + NonTemporalE: 0 + NonTemporalMXSA: 0 + NonTemporalMXSB: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 288 + NumGlobalWriteVectorsPerThread: 288 + NumLdsBlk: 2 + NumLoadsA: 12 + NumLoadsB: 6 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsCoalescedMXSA: 1 + NumLoadsCoalescedMXSB: 1 + NumLoadsMXSA: 1 + NumLoadsMXSB: 1 + NumLoadsPerpendicularA: 12 + NumLoadsPerpendicularB: 6 + NumLoadsPerpendicularMXSA: 1 + NumLoadsPerpendicularMXSB: 1 + NumThreads: 256 + NumTotalPackedLoadsA: -1 + NumTotalPackedLoadsB: -1 + NumTotalPackedLoadsMXSA: -1 + NumTotalPackedLoadsMXSB: -1 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGROverBarrier: 0 + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 33 + 
SolutionNameMin: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT384x192x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT12_6_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM16_WGMXCC2_WGMXCCGn1 + SourceSwap: false + SpaceFillingAlgo: [] + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: true + StoreSyncOpt: 0 + StoreVectorWidth: 4 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SubGroupMXSA: 8 + SubGroupMXSB: 32 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: 0 + TailloopInNll: false + ThreadTile: [1, 1] + ThreadTile0: 48 + ThreadTile1: 6 + ThreadTileA: 48 + ThreadTileB: 6 + ThreadTileMXSA: 48 + ThreadTileMXSB: 6 + TransposeLDS: 1 + TransposeLDSMetadata: -1 + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMXSA: true + UnrollMajorLDSMXSB: true + Use64bShadowLimit: false + Use64bShadowLimitMX: false + UseCustomMainLoopSchedule: 0 + UseDirect32XEmulation: false + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: false + UseGeneralizedNLCOneA: false + UseGeneralizedNLCOneB: false + UseGeneralizedNLCOneMXSA: false + UseGeneralizedNLCOneMXSB: false + UseGeneralizedNLCOneMetadata: false + UseInstOffsetForGRO: 0 + UseMFMAF32XEmulation: false + UsePLRPack: 0 + UseSgprForGRO: -1 + UseSubtileImpl: true + Valid: true + VectorStore: -1 + VectorWidthA: 1 + 
VectorWidthB: 1 + VectorWidthMXSA: 1 + VectorWidthMXSB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMXSA: 0 + WaveSeparateGlobalReadMXSB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 16 + WorkGroupMappingXCC: 2 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 256 + _DepthUA: 256 + _DepthUB: 256 + _DepthUMXSA: 8 + _DepthUMXSB: 8 + _DepthUMetadata: 256 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + enableLDSTrMXSA: false + enableLDSTrMXSB: false + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + tailLoopOptMXSA: false + tailLoopOptMXSB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AdaptiveGemm: 0 + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1DivByMT1LowbitGT1: 0 + AssertFree1ElementMultiple: 1 + AssertKRingShiftTailWrapOnly: 0 + AssertSummationElementMultiple: 256 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BAddrInterleave: false + BaseName: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT12pSBqfd5dzj78eX1t5EQJzYZ5aHFVtQ9uJHS361sw3qY= + BufferLoad: true + BufferStore: 1 + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 256 + DirectToLds: 1 + DirectToLdsA: true + DirectToLdsB: true + DirectToLdsMXSA: true + DirectToLdsMXSB: true + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprMXSA: false + DirectToVgprMXSB: false + DirectToVgprSparseMetadata: false + 
DtlPlusLdsBuf: 0 + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: false + ExpertSchedulingMode: 0 + ExtraLatencyForLR: 0 + ExtraMiLatencyLeft: -1 + ForceDisableShadowInit: false + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 32 + GlobalReadVectorWidthB: 32 + GlobalReadVectorWidthMXSA: 4 + GlobalReadVectorWidthMXSB: 1 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + KRingShift: false + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT128x32x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT2_2_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1 + LDSTrInst: false + LSCA: 256 + LSCB: 256 + LSCMXSA: 8 + LSCMXSB: 8 + LSPA: 32 + LSPB: 32 + LSPMXSA: 128 + LSPMXSB: 32 + LVCA: 8 + LVCB: 8 + LVCMXSA: 2 + LVCMXSB: 8 + LVPA: 1 + LVPB: 1 + LVPMXSA: 32 + LVPMXSB: 32 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsBlockSizePerPadMXSA: 0 + LdsBlockSizePerPadMXSB: 0 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 54528 + LdsInitCVgprs: false + LdsNumBytes: 54528 + 
LdsNumElementsAlignedA: 16384 + LdsNumElementsAlignedB: 4096 + LdsNumElementsAlignedMXSA: 1024 + LdsNumElementsAlignedMXSB: 256 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 32768 + LdsOffsetB: 17664 + LdsOffsetB_Blk: 50432 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMXSA: 16384 + LdsOffsetMXSA_Blk: 49152 + LdsOffsetMXSB: 17408 + LdsOffsetMXSB_Blk: 50176 + LdsOffsetMetadata: 17664 + LdsOffsetMetadata_Blk: 50432 + LdsPadA: 0 + LdsPadB: 0 + LdsPadMXSA: 0 + LdsPadMXSB: 0 + LdsPadMetadata: 0 + LocalReadVectorWidth: -1 + LocalReadVectorWidthA: 32 + LocalReadVectorWidthB: 32 + LocalReadVectorWidthMXSA: 1 + LocalReadVectorWidthMXSB: 1 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LocalWriteUseSgprMXSA: true + LocalWriteUseSgprMXSB: true + LoopIters: 2 + LoopUnroll: 256 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMXSA: 1 + MIInputPerThreadMXSB: 1 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [4, 1] + MIWaveTile: [2, 2] + MIWaveTileA: 2 + MIWaveTileB: 2 + MIWaveTileMXSA: 2 + MIWaveTileMXSB: 2 + MIWaveTileMetadata: 0 + MacroTile0: 128 + MacroTile1: 32 + MacroTileA: 128 + MacroTileB: 32 + MacroTileMXSA: 128 + MacroTileMXSB: 32 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + MinGRIncPerMfma: 3 + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonDTLTailLoopMXSA: true + NonDTLTailLoopMXSB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 4 + NonTemporalC: 0 + NonTemporalD: 4 + NonTemporalE: 0 + 
NonTemporalMXSA: 0 + NonTemporalMXSB: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 16 + NumLdsBlk: 2 + NumLoadsA: 4 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsCoalescedMXSA: 1 + NumLoadsCoalescedMXSB: 1 + NumLoadsMXSA: 1 + NumLoadsMXSB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularMXSA: 1 + NumLoadsPerpendicularMXSB: 1 + NumThreads: 256 + NumTotalPackedLoadsA: -1 + NumTotalPackedLoadsB: -1 + NumTotalPackedLoadsMXSA: -1 + NumTotalPackedLoadsMXSB: -1 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGROverBarrier: 0 + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 34 + SolutionNameMin: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT128x32x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT2_2_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM16_WGMXCC2_WGMXCCGn1 + SourceSwap: false + SpaceFillingAlgo: [] + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 4 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SubGroupMXSA: 
16 + SubGroupMXSB: 16 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: 0 + TailloopInNll: false + ThreadTile: [1, 1] + ThreadTile0: 8 + ThreadTile1: 2 + ThreadTileA: 8 + ThreadTileB: 2 + ThreadTileMXSA: 8 + ThreadTileMXSB: 2 + TransposeLDS: 1 + TransposeLDSMetadata: -1 + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMXSA: true + UnrollMajorLDSMXSB: true + Use64bShadowLimit: false + Use64bShadowLimitMX: false + UseCustomMainLoopSchedule: 0 + UseDirect32XEmulation: false + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: false + UseGeneralizedNLCOneA: false + UseGeneralizedNLCOneB: false + UseGeneralizedNLCOneMXSA: false + UseGeneralizedNLCOneMXSB: false + UseGeneralizedNLCOneMetadata: false + UseInstOffsetForGRO: 0 + UseMFMAF32XEmulation: false + UsePLRPack: 0 + UseSgprForGRO: -1 + UseSubtileImpl: true + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 1 + VectorWidthMXSA: 1 + VectorWidthMXSB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMXSA: 0 + WaveSeparateGlobalReadMXSB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [64, 4, 1] + WorkGroupMapping: 16 + WorkGroupMappingXCC: 2 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 256 + _DepthUA: 256 + _DepthUB: 256 + _DepthUMXSA: 8 + _DepthUMXSB: 8 + _DepthUMetadata: 256 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + enableLDSTrMXSA: false + enableLDSTrMXSB: false + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + tailLoopOptMXSA: false + tailLoopOptMXSB: false + - 1LDSBuffer: 0 + ActivationAlt: false + 
ActivationFuncCall: false + ActivationFused: true + AdaptiveGemm: 0 + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1DivByMT1LowbitGT1: 0 + AssertFree1ElementMultiple: 1 + AssertKRingShiftTailWrapOnly: 0 + AssertSummationElementMultiple: 256 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BAddrInterleave: false + BaseName: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT12lYJO4P9-MC57Obi2kvcZAOwe_1ysdtWgX7-VIBNgQpw= + BufferLoad: true + BufferStore: 1 + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 256 + DirectToLds: 1 + DirectToLdsA: true + DirectToLdsB: true + DirectToLdsMXSA: true + DirectToLdsMXSB: true + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprMXSA: false + DirectToVgprMXSB: false + DirectToVgprSparseMetadata: false + DtlPlusLdsBuf: 0 + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: false + ExpertSchedulingMode: 0 + ExtraLatencyForLR: 0 + ExtraMiLatencyLeft: -1 + ForceDisableShadowInit: false + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 32 + GlobalReadVectorWidthB: 32 + GlobalReadVectorWidthMXSA: 4 + GlobalReadVectorWidthMXSB: 3 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + KRingShift: false + Kernel: true + KernelLanguage: Assembly + KernelNameMin: 
Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT128x96x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT2_6_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1 + LDSTrInst: false + LSCA: 256 + LSCB: 256 + LSCMXSA: 8 + LSCMXSB: 8 + LSPA: 32 + LSPB: 32 + LSPMXSA: 128 + LSPMXSB: 96 + LVCA: 8 + LVCB: 8 + LVCMXSA: 2 + LVCMXSB: 3 + LVPA: 1 + LVPB: 1 + LVPMXSA: 32 + LVPMXSB: 32 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsBlockSizePerPadMXSA: 0 + LdsBlockSizePerPadMXSB: 0 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 63232 + LdsInitCVgprs: false + LdsNumBytes: 63232 + LdsNumElementsAlignedA: 16384 + LdsNumElementsAlignedB: 12288 + LdsNumElementsAlignedMXSA: 1024 + LdsNumElementsAlignedMXSB: 768 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 32768 + LdsOffsetB: 18176 + LdsOffsetB_Blk: 50944 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMXSA: 16384 + LdsOffsetMXSA_Blk: 49152 + LdsOffsetMXSB: 17408 + LdsOffsetMXSB_Blk: 50176 + LdsOffsetMetadata: 18176 + LdsOffsetMetadata_Blk: 50944 + LdsPadA: 0 + LdsPadB: 0 + LdsPadMXSA: 0 + LdsPadMXSB: 0 + LdsPadMetadata: 0 + LocalReadVectorWidth: -1 + LocalReadVectorWidthA: 32 + LocalReadVectorWidthB: 32 + LocalReadVectorWidthMXSA: 1 + LocalReadVectorWidthMXSB: 1 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LocalWriteUseSgprMXSA: true + LocalWriteUseSgprMXSB: true + LoopIters: 2 + LoopUnroll: 256 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + 
MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMXSA: 1 + MIInputPerThreadMXSB: 1 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [4, 1] + MIWaveTile: [2, 6] + MIWaveTileA: 2 + MIWaveTileB: 6 + MIWaveTileMXSA: 2 + MIWaveTileMXSB: 6 + MIWaveTileMetadata: 0 + MacroTile0: 128 + MacroTile1: 96 + MacroTileA: 128 + MacroTileB: 96 + MacroTileMXSA: 128 + MacroTileMXSB: 96 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + MinGRIncPerMfma: 3 + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonDTLTailLoopMXSA: true + NonDTLTailLoopMXSB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 4 + NonTemporalC: 0 + NonTemporalD: 4 + NonTemporalE: 0 + NonTemporalMXSA: 0 + NonTemporalMXSB: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 48 + NumGlobalWriteVectorsPerThread: 48 + NumLdsBlk: 2 + NumLoadsA: 4 + NumLoadsB: 3 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsCoalescedMXSA: 1 + NumLoadsCoalescedMXSB: 1 + NumLoadsMXSA: 1 + NumLoadsMXSB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 3 + NumLoadsPerpendicularMXSA: 1 + NumLoadsPerpendicularMXSB: 1 + NumThreads: 256 + NumTotalPackedLoadsA: -1 + NumTotalPackedLoadsB: -1 + NumTotalPackedLoadsMXSA: -1 + NumTotalPackedLoadsMXSB: -1 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGROverBarrier: 0 + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 35 + SolutionNameMin: 
Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT128x96x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT2_6_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM16_WGMXCC2_WGMXCCGn1 + SourceSwap: false + SpaceFillingAlgo: [] + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 4 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SubGroupMXSA: 16 + SubGroupMXSB: 16 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: 0 + TailloopInNll: false + ThreadTile: [1, 1] + ThreadTile0: 8 + ThreadTile1: 6 + ThreadTileA: 8 + ThreadTileB: 6 + ThreadTileMXSA: 8 + ThreadTileMXSB: 6 + TransposeLDS: 1 + TransposeLDSMetadata: -1 + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMXSA: true + UnrollMajorLDSMXSB: true + Use64bShadowLimit: false + Use64bShadowLimitMX: false + UseCustomMainLoopSchedule: 0 + UseDirect32XEmulation: false + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: false + UseGeneralizedNLCOneA: false + UseGeneralizedNLCOneB: false + UseGeneralizedNLCOneMXSA: false + UseGeneralizedNLCOneMXSB: false + UseGeneralizedNLCOneMetadata: false + UseInstOffsetForGRO: 0 + UseMFMAF32XEmulation: false + UsePLRPack: 0 + UseSgprForGRO: -1 + UseSubtileImpl: true + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 1 + 
VectorWidthMXSA: 1 + VectorWidthMXSB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMXSA: 0 + WaveSeparateGlobalReadMXSB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [64, 4, 1] + WorkGroupMapping: 16 + WorkGroupMappingXCC: 2 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 256 + _DepthUA: 256 + _DepthUB: 256 + _DepthUMXSA: 8 + _DepthUMXSB: 8 + _DepthUMetadata: 256 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + enableLDSTrMXSA: false + enableLDSTrMXSB: false + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + tailLoopOptMXSA: false + tailLoopOptMXSB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AdaptiveGemm: 0 + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1DivByMT1LowbitGT1: 0 + AssertFree1ElementMultiple: 1 + AssertKRingShiftTailWrapOnly: 0 + AssertSummationElementMultiple: 256 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BAddrInterleave: false + BaseName: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT25ccxCqQ2dvGclrJYn0QqExpGJSq56GiHUJPL0jzB0qVw= + BufferLoad: true + BufferStore: 1 + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 256 + DirectToLds: 1 + DirectToLdsA: true + DirectToLdsB: true + DirectToLdsMXSA: true + DirectToLdsMXSB: true + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprMXSA: false + DirectToVgprMXSB: false + DirectToVgprSparseMetadata: false + DtlPlusLdsBuf: 0 
+ EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: false + ExpertSchedulingMode: 0 + ExtraLatencyForLR: 0 + ExtraMiLatencyLeft: -1 + ForceDisableShadowInit: false + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 32 + GlobalReadVectorWidthB: 32 + GlobalReadVectorWidthMXSA: 4 + GlobalReadVectorWidthMXSB: 1 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + KRingShift: false + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT256x32x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT4_2_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1 + LDSTrInst: false + LSCA: 256 + LSCB: 256 + LSCMXSA: 8 + LSCMXSB: 8 + LSPA: 32 + LSPB: 32 + LSPMXSA: 128 + LSPMXSB: 32 + LVCA: 8 + LVCB: 8 + LVCMXSA: 2 + LVCMXSB: 8 + LVPA: 1 + LVPB: 1 + LVPMXSA: 32 + LVPMXSB: 32 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsBlockSizePerPadMXSA: 0 + LdsBlockSizePerPadMXSB: 0 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 104704 + LdsInitCVgprs: false + LdsNumBytes: 104704 + LdsNumElementsAlignedA: 32768 + 
LdsNumElementsAlignedB: 4096 + LdsNumElementsAlignedMXSA: 2048 + LdsNumElementsAlignedMXSB: 256 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 35072 + LdsOffsetB_Blk: 100608 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMXSA: 32768 + LdsOffsetMXSA_Blk: 98304 + LdsOffsetMXSB: 34816 + LdsOffsetMXSB_Blk: 100352 + LdsOffsetMetadata: 35072 + LdsOffsetMetadata_Blk: 100608 + LdsPadA: 0 + LdsPadB: 0 + LdsPadMXSA: 0 + LdsPadMXSB: 0 + LdsPadMetadata: 0 + LocalReadVectorWidth: -1 + LocalReadVectorWidthA: 32 + LocalReadVectorWidthB: 32 + LocalReadVectorWidthMXSA: 1 + LocalReadVectorWidthMXSB: 1 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LocalWriteUseSgprMXSA: true + LocalWriteUseSgprMXSB: true + LoopIters: 2 + LoopUnroll: 256 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMXSA: 1 + MIInputPerThreadMXSB: 1 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [4, 1] + MIWaveTile: [4, 2] + MIWaveTileA: 4 + MIWaveTileB: 2 + MIWaveTileMXSA: 4 + MIWaveTileMXSB: 2 + MIWaveTileMetadata: 0 + MacroTile0: 256 + MacroTile1: 32 + MacroTileA: 256 + MacroTileB: 32 + MacroTileMXSA: 256 + MacroTileMXSB: 32 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + MinGRIncPerMfma: 3 + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonDTLTailLoopMXSA: true + NonDTLTailLoopMXSB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 4 + NonTemporalC: 0 + NonTemporalD: 4 + NonTemporalE: 0 + NonTemporalMXSA: 0 + 
NonTemporalMXSB: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 32 + NumLdsBlk: 2 + NumLoadsA: 8 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsCoalescedMXSA: 1 + NumLoadsCoalescedMXSB: 1 + NumLoadsMXSA: 2 + NumLoadsMXSB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularMXSA: 2 + NumLoadsPerpendicularMXSB: 1 + NumThreads: 256 + NumTotalPackedLoadsA: -1 + NumTotalPackedLoadsB: -1 + NumTotalPackedLoadsMXSA: -1 + NumTotalPackedLoadsMXSB: -1 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGROverBarrier: 0 + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 36 + SolutionNameMin: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT256x32x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT4_2_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM16_WGMXCC2_WGMXCCGn1 + SourceSwap: false + SpaceFillingAlgo: [] + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 4 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SubGroupMXSA: 16 + SubGroupMXSB: 16 
+ SuppressNoLoadLoop: false + SwapGlobalReadOrder: 0 + TailloopInNll: false + ThreadTile: [1, 1] + ThreadTile0: 16 + ThreadTile1: 2 + ThreadTileA: 16 + ThreadTileB: 2 + ThreadTileMXSA: 16 + ThreadTileMXSB: 2 + TransposeLDS: 1 + TransposeLDSMetadata: -1 + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMXSA: true + UnrollMajorLDSMXSB: true + Use64bShadowLimit: false + Use64bShadowLimitMX: false + UseCustomMainLoopSchedule: 0 + UseDirect32XEmulation: false + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: false + UseGeneralizedNLCOneA: false + UseGeneralizedNLCOneB: false + UseGeneralizedNLCOneMXSA: false + UseGeneralizedNLCOneMXSB: false + UseGeneralizedNLCOneMetadata: false + UseInstOffsetForGRO: 0 + UseMFMAF32XEmulation: false + UsePLRPack: 0 + UseSgprForGRO: -1 + UseSubtileImpl: true + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 1 + VectorWidthMXSA: 1 + VectorWidthMXSB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMXSA: 0 + WaveSeparateGlobalReadMXSB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [64, 4, 1] + WorkGroupMapping: 16 + WorkGroupMappingXCC: 2 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 256 + _DepthUA: 256 + _DepthUB: 256 + _DepthUMXSA: 8 + _DepthUMXSB: 8 + _DepthUMetadata: 256 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + enableLDSTrMXSA: false + enableLDSTrMXSB: false + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + tailLoopOptMXSA: false + tailLoopOptMXSB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: 
false + ActivationFused: true + AdaptiveGemm: 0 + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1DivByMT1LowbitGT1: 0 + AssertFree1ElementMultiple: 1 + AssertKRingShiftTailWrapOnly: 0 + AssertSummationElementMultiple: 256 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BAddrInterleave: false + BaseName: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT25LD47Rd1FCRkyTzzAeZkhMOPqncN77I3VtqjftUOo3W0= + BufferLoad: true + BufferStore: 1 + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 256 + DirectToLds: 1 + DirectToLdsA: true + DirectToLdsB: true + DirectToLdsMXSA: true + DirectToLdsMXSB: true + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprMXSA: false + DirectToVgprMXSB: false + DirectToVgprSparseMetadata: false + DtlPlusLdsBuf: 0 + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: false + ExpertSchedulingMode: 0 + ExtraLatencyForLR: 0 + ExtraMiLatencyLeft: -1 + ForceDisableShadowInit: false + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 32 + GlobalReadVectorWidthB: 32 + GlobalReadVectorWidthMXSA: 4 + GlobalReadVectorWidthMXSB: 3 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + KRingShift: false + Kernel: true + KernelLanguage: Assembly + KernelNameMin: 
Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT256x96x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT4_6_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1 + LDSTrInst: false + LSCA: 256 + LSCB: 256 + LSCMXSA: 8 + LSCMXSB: 8 + LSPA: 32 + LSPB: 32 + LSPMXSA: 128 + LSPMXSB: 96 + LVCA: 8 + LVCB: 8 + LVCMXSA: 2 + LVCMXSB: 3 + LVPA: 1 + LVPB: 1 + LVPMXSA: 32 + LVPMXSB: 32 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsBlockSizePerPadMXSA: 0 + LdsBlockSizePerPadMXSB: 0 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 113408 + LdsInitCVgprs: false + LdsNumBytes: 113408 + LdsNumElementsAlignedA: 32768 + LdsNumElementsAlignedB: 12288 + LdsNumElementsAlignedMXSA: 2048 + LdsNumElementsAlignedMXSB: 768 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 35584 + LdsOffsetB_Blk: 101120 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMXSA: 32768 + LdsOffsetMXSA_Blk: 98304 + LdsOffsetMXSB: 34816 + LdsOffsetMXSB_Blk: 100352 + LdsOffsetMetadata: 35584 + LdsOffsetMetadata_Blk: 101120 + LdsPadA: 0 + LdsPadB: 0 + LdsPadMXSA: 0 + LdsPadMXSB: 0 + LdsPadMetadata: 0 + LocalReadVectorWidth: -1 + LocalReadVectorWidthA: 32 + LocalReadVectorWidthB: 32 + LocalReadVectorWidthMXSA: 1 + LocalReadVectorWidthMXSB: 1 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LocalWriteUseSgprMXSA: true + LocalWriteUseSgprMXSB: true + LoopIters: 2 + LoopUnroll: 256 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + 
MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMXSA: 1 + MIInputPerThreadMXSB: 1 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [4, 1] + MIWaveTile: [4, 6] + MIWaveTileA: 4 + MIWaveTileB: 6 + MIWaveTileMXSA: 4 + MIWaveTileMXSB: 6 + MIWaveTileMetadata: 0 + MacroTile0: 256 + MacroTile1: 96 + MacroTileA: 256 + MacroTileB: 96 + MacroTileMXSA: 256 + MacroTileMXSB: 96 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + MinGRIncPerMfma: 3 + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonDTLTailLoopMXSA: true + NonDTLTailLoopMXSB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 4 + NonTemporalC: 0 + NonTemporalD: 4 + NonTemporalE: 0 + NonTemporalMXSA: 0 + NonTemporalMXSB: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 96 + NumGlobalWriteVectorsPerThread: 96 + NumLdsBlk: 2 + NumLoadsA: 8 + NumLoadsB: 3 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsCoalescedMXSA: 1 + NumLoadsCoalescedMXSB: 1 + NumLoadsMXSA: 2 + NumLoadsMXSB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 3 + NumLoadsPerpendicularMXSA: 2 + NumLoadsPerpendicularMXSB: 1 + NumThreads: 256 + NumTotalPackedLoadsA: -1 + NumTotalPackedLoadsB: -1 + NumTotalPackedLoadsMXSA: -1 + NumTotalPackedLoadsMXSB: -1 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGROverBarrier: 0 + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 37 + SolutionNameMin: 
Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT256x96x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT4_6_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM16_WGMXCC2_WGMXCCGn1 + SourceSwap: false + SpaceFillingAlgo: [] + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 4 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SubGroupMXSA: 16 + SubGroupMXSB: 16 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: 0 + TailloopInNll: false + ThreadTile: [1, 1] + ThreadTile0: 16 + ThreadTile1: 6 + ThreadTileA: 16 + ThreadTileB: 6 + ThreadTileMXSA: 16 + ThreadTileMXSB: 6 + TransposeLDS: 1 + TransposeLDSMetadata: -1 + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMXSA: true + UnrollMajorLDSMXSB: true + Use64bShadowLimit: false + Use64bShadowLimitMX: false + UseCustomMainLoopSchedule: 0 + UseDirect32XEmulation: false + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: false + UseGeneralizedNLCOneA: false + UseGeneralizedNLCOneB: false + UseGeneralizedNLCOneMXSA: false + UseGeneralizedNLCOneMXSB: false + UseGeneralizedNLCOneMetadata: false + UseInstOffsetForGRO: 0 + UseMFMAF32XEmulation: false + UsePLRPack: 0 + UseSgprForGRO: -1 + UseSubtileImpl: true + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 1 
+ VectorWidthMXSA: 1 + VectorWidthMXSB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMXSA: 0 + WaveSeparateGlobalReadMXSB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [64, 4, 1] + WorkGroupMapping: 16 + WorkGroupMappingXCC: 2 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 256 + _DepthUA: 256 + _DepthUB: 256 + _DepthUMXSA: 8 + _DepthUMXSB: 8 + _DepthUMetadata: 256 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + enableLDSTrMXSA: false + enableLDSTrMXSB: false + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + tailLoopOptMXSA: false + tailLoopOptMXSB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AdaptiveGemm: 0 + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1DivByMT1LowbitGT1: 0 + AssertFree1ElementMultiple: 1 + AssertKRingShiftTailWrapOnly: 0 + AssertSummationElementMultiple: 256 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BAddrInterleave: false + BaseName: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT252kruXsNK48kysY3byfZ8nkfbVQgYyySEfltsnGOOr7I= + BufferLoad: true + BufferStore: 1 + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 256 + DirectToLds: 1 + DirectToLdsA: true + DirectToLdsB: true + DirectToLdsMXSA: true + DirectToLdsMXSB: true + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprMXSA: false + DirectToVgprMXSB: false + DirectToVgprSparseMetadata: false + DtlPlusLdsBuf: 
0 + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: false + ExpertSchedulingMode: 0 + ExtraLatencyForLR: 0 + ExtraMiLatencyLeft: -1 + ForceDisableShadowInit: false + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 32 + GlobalReadVectorWidthB: 32 + GlobalReadVectorWidthMXSA: 4 + GlobalReadVectorWidthMXSB: 5 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + KRingShift: false + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT256x160x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT4_10_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1 + LDSTrInst: false + LSCA: 256 + LSCB: 256 + LSCMXSA: 8 + LSCMXSB: 8 + LSPA: 32 + LSPB: 32 + LSPMXSA: 128 + LSPMXSB: 160 + LVCA: 8 + LVCB: 8 + LVCMXSA: 2 + LVCMXSB: 2 + LVPA: 1 + LVPB: 1 + LVPMXSA: 32 + LVPMXSB: 32 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsBlockSizePerPadMXSA: 0 + LdsBlockSizePerPadMXSB: 0 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 122112 + LdsInitCVgprs: false + LdsNumBytes: 122112 + LdsNumElementsAlignedA: 
32768 + LdsNumElementsAlignedB: 20480 + LdsNumElementsAlignedMXSA: 2048 + LdsNumElementsAlignedMXSB: 1280 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 36096 + LdsOffsetB_Blk: 101632 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMXSA: 32768 + LdsOffsetMXSA_Blk: 98304 + LdsOffsetMXSB: 34816 + LdsOffsetMXSB_Blk: 100352 + LdsOffsetMetadata: 36096 + LdsOffsetMetadata_Blk: 101632 + LdsPadA: 0 + LdsPadB: 0 + LdsPadMXSA: 0 + LdsPadMXSB: 0 + LdsPadMetadata: 0 + LocalReadVectorWidth: -1 + LocalReadVectorWidthA: 32 + LocalReadVectorWidthB: 32 + LocalReadVectorWidthMXSA: 1 + LocalReadVectorWidthMXSB: 1 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LocalWriteUseSgprMXSA: true + LocalWriteUseSgprMXSB: true + LoopIters: 2 + LoopUnroll: 256 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMXSA: 1 + MIInputPerThreadMXSB: 1 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [4, 1] + MIWaveTile: [4, 10] + MIWaveTileA: 4 + MIWaveTileB: 10 + MIWaveTileMXSA: 4 + MIWaveTileMXSB: 10 + MIWaveTileMetadata: 0 + MacroTile0: 256 + MacroTile1: 160 + MacroTileA: 256 + MacroTileB: 160 + MacroTileMXSA: 256 + MacroTileMXSB: 160 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + MinGRIncPerMfma: 3 + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonDTLTailLoopMXSA: true + NonDTLTailLoopMXSB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 4 + NonTemporalC: 0 + NonTemporalD: 4 + NonTemporalE: 0 + 
NonTemporalMXSA: 0 + NonTemporalMXSB: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 160 + NumGlobalWriteVectorsPerThread: 160 + NumLdsBlk: 2 + NumLoadsA: 8 + NumLoadsB: 5 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsCoalescedMXSA: 1 + NumLoadsCoalescedMXSB: 1 + NumLoadsMXSA: 2 + NumLoadsMXSB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 5 + NumLoadsPerpendicularMXSA: 2 + NumLoadsPerpendicularMXSB: 1 + NumThreads: 256 + NumTotalPackedLoadsA: -1 + NumTotalPackedLoadsB: -1 + NumTotalPackedLoadsMXSA: -1 + NumTotalPackedLoadsMXSB: -1 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGROverBarrier: 0 + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 38 + SolutionNameMin: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT256x160x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT4_10_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM16_WGMXCC2_WGMXCCGn1 + SourceSwap: false + SpaceFillingAlgo: [] + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 4 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + 
SubGroupMXSA: 16 + SubGroupMXSB: 16 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: 0 + TailloopInNll: false + ThreadTile: [1, 1] + ThreadTile0: 16 + ThreadTile1: 10 + ThreadTileA: 16 + ThreadTileB: 10 + ThreadTileMXSA: 16 + ThreadTileMXSB: 10 + TransposeLDS: 1 + TransposeLDSMetadata: -1 + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMXSA: true + UnrollMajorLDSMXSB: true + Use64bShadowLimit: false + Use64bShadowLimitMX: false + UseCustomMainLoopSchedule: 0 + UseDirect32XEmulation: false + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: false + UseGeneralizedNLCOneA: false + UseGeneralizedNLCOneB: false + UseGeneralizedNLCOneMXSA: false + UseGeneralizedNLCOneMXSB: false + UseGeneralizedNLCOneMetadata: false + UseInstOffsetForGRO: 0 + UseMFMAF32XEmulation: false + UsePLRPack: 0 + UseSgprForGRO: -1 + UseSubtileImpl: true + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 1 + VectorWidthMXSA: 1 + VectorWidthMXSB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMXSA: 0 + WaveSeparateGlobalReadMXSB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [64, 4, 1] + WorkGroupMapping: 16 + WorkGroupMappingXCC: 2 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 256 + _DepthUA: 256 + _DepthUB: 256 + _DepthUMXSA: 8 + _DepthUMXSB: 8 + _DepthUMetadata: 256 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + enableLDSTrMXSA: false + enableLDSTrMXSB: false + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + tailLoopOptMXSA: false + tailLoopOptMXSB: false + - 1LDSBuffer: 0 + 
ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AdaptiveGemm: 0 + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1DivByMT1LowbitGT1: 0 + AssertFree1ElementMultiple: 1 + AssertKRingShiftTailWrapOnly: 0 + AssertSummationElementMultiple: 256 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BAddrInterleave: false + BaseName: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT25K1mXD-Z64cKGl1n64cEhRIvEdarIIYY6qzM6iXFNutU= + BufferLoad: true + BufferStore: 1 + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 256 + DirectToLds: 1 + DirectToLdsA: true + DirectToLdsB: true + DirectToLdsMXSA: true + DirectToLdsMXSB: true + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprMXSA: false + DirectToVgprMXSB: false + DirectToVgprSparseMetadata: false + DtlPlusLdsBuf: 0 + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: false + ExpertSchedulingMode: 0 + ExtraLatencyForLR: 0 + ExtraMiLatencyLeft: -1 + ForceDisableShadowInit: false + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 32 + GlobalReadVectorWidthB: 32 + GlobalReadVectorWidthMXSA: 4 + GlobalReadVectorWidthMXSB: 9 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + KRingShift: false + Kernel: true + KernelLanguage: Assembly + KernelNameMin: 
Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT256x288x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT4_18_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1 + LDSTrInst: false + LSCA: 256 + LSCB: 256 + LSCMXSA: 8 + LSCMXSB: 8 + LSPA: 32 + LSPB: 32 + LSPMXSA: 128 + LSPMXSB: 288 + LVCA: 8 + LVCB: 8 + LVCMXSA: 2 + LVCMXSB: 1 + LVPA: 1 + LVPB: 1 + LVPMXSA: 32 + LVPMXSB: 32 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsBlockSizePerPadMXSA: 0 + LdsBlockSizePerPadMXSB: 0 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 147968 + LdsInitCVgprs: false + LdsNumBytes: 147968 + LdsNumElementsAlignedA: 32768 + LdsNumElementsAlignedB: 36864 + LdsNumElementsAlignedMXSA: 2048 + LdsNumElementsAlignedMXSB: 2304 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 73984 + LdsOffsetB: 37120 + LdsOffsetB_Blk: 111104 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMXSA: 32768 + LdsOffsetMXSA_Blk: 106752 + LdsOffsetMXSB: 34816 + LdsOffsetMXSB_Blk: 108800 + LdsOffsetMetadata: 37120 + LdsOffsetMetadata_Blk: 111104 + LdsPadA: 0 + LdsPadB: 0 + LdsPadMXSA: 0 + LdsPadMXSB: 0 + LdsPadMetadata: 0 + LocalReadVectorWidth: -1 + LocalReadVectorWidthA: 32 + LocalReadVectorWidthB: 32 + LocalReadVectorWidthMXSA: 1 + LocalReadVectorWidthMXSB: 1 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LocalWriteUseSgprMXSA: true + LocalWriteUseSgprMXSB: true + LoopIters: 2 + LoopUnroll: 256 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + 
MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMXSA: 1 + MIInputPerThreadMXSB: 1 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [4, 1] + MIWaveTile: [4, 18] + MIWaveTileA: 4 + MIWaveTileB: 18 + MIWaveTileMXSA: 4 + MIWaveTileMXSB: 18 + MIWaveTileMetadata: 0 + MacroTile0: 256 + MacroTile1: 288 + MacroTileA: 256 + MacroTileB: 288 + MacroTileMXSA: 256 + MacroTileMXSB: 288 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + MinGRIncPerMfma: 3 + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonDTLTailLoopMXSA: true + NonDTLTailLoopMXSB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 4 + NonTemporalC: 0 + NonTemporalD: 4 + NonTemporalE: 0 + NonTemporalMXSA: 0 + NonTemporalMXSB: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 288 + NumGlobalWriteVectorsPerThread: 288 + NumLdsBlk: 2 + NumLoadsA: 8 + NumLoadsB: 9 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsCoalescedMXSA: 1 + NumLoadsCoalescedMXSB: 1 + NumLoadsMXSA: 2 + NumLoadsMXSB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 9 + NumLoadsPerpendicularMXSA: 2 + NumLoadsPerpendicularMXSB: 1 + NumThreads: 256 + NumTotalPackedLoadsA: -1 + NumTotalPackedLoadsB: -1 + NumTotalPackedLoadsMXSA: -1 + NumTotalPackedLoadsMXSB: -1 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGROverBarrier: 0 + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 39 + 
SolutionNameMin: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT256x288x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT4_18_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM16_WGMXCC2_WGMXCCGn1 + SourceSwap: false + SpaceFillingAlgo: [] + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: true + StoreSyncOpt: 0 + StoreVectorWidth: 4 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SubGroupMXSA: 16 + SubGroupMXSB: 16 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: 0 + TailloopInNll: false + ThreadTile: [1, 1] + ThreadTile0: 16 + ThreadTile1: 18 + ThreadTileA: 16 + ThreadTileB: 18 + ThreadTileMXSA: 16 + ThreadTileMXSB: 18 + TransposeLDS: 1 + TransposeLDSMetadata: -1 + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMXSA: true + UnrollMajorLDSMXSB: true + Use64bShadowLimit: false + Use64bShadowLimitMX: false + UseCustomMainLoopSchedule: 0 + UseDirect32XEmulation: false + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: false + UseGeneralizedNLCOneA: false + UseGeneralizedNLCOneB: false + UseGeneralizedNLCOneMXSA: false + UseGeneralizedNLCOneMXSB: false + UseGeneralizedNLCOneMetadata: false + UseInstOffsetForGRO: 0 + UseMFMAF32XEmulation: false + UsePLRPack: 0 + UseSgprForGRO: -1 + UseSubtileImpl: true + Valid: true + VectorStore: -1 + 
VectorWidthA: 1 + VectorWidthB: 1 + VectorWidthMXSA: 1 + VectorWidthMXSB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMXSA: 0 + WaveSeparateGlobalReadMXSB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [64, 4, 1] + WorkGroupMapping: 16 + WorkGroupMappingXCC: 2 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 256 + _DepthUA: 256 + _DepthUB: 256 + _DepthUMXSA: 8 + _DepthUMXSB: 8 + _DepthUMetadata: 256 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + enableLDSTrMXSA: false + enableLDSTrMXSB: false + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + tailLoopOptMXSA: false + tailLoopOptMXSB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AdaptiveGemm: 0 + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1DivByMT1LowbitGT1: 0 + AssertFree1ElementMultiple: 1 + AssertKRingShiftTailWrapOnly: 0 + AssertSummationElementMultiple: 256 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BAddrInterleave: false + BaseName: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT382wXw8HnHr0540PBKyCMkiG1GnHC8Qq6QTA2UY90u0Us= + BufferLoad: true + BufferStore: 1 + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 256 + DirectToLds: 1 + DirectToLdsA: true + DirectToLdsB: true + DirectToLdsMXSA: true + DirectToLdsMXSB: true + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprMXSA: false + DirectToVgprMXSB: false + 
DirectToVgprSparseMetadata: false + DtlPlusLdsBuf: 0 + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: false + ExpertSchedulingMode: 0 + ExtraLatencyForLR: 0 + ExtraMiLatencyLeft: -1 + ForceDisableShadowInit: false + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 32 + GlobalReadVectorWidthB: 32 + GlobalReadVectorWidthMXSA: 12 + GlobalReadVectorWidthMXSB: 1 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + KRingShift: false + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT384x32x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT6_2_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1 + LDSTrInst: false + LSCA: 256 + LSCB: 256 + LSCMXSA: 8 + LSCMXSB: 8 + LSPA: 32 + LSPB: 32 + LSPMXSA: 384 + LSPMXSB: 32 + LVCA: 8 + LVCB: 8 + LVCMXSA: 1 + LVCMXSB: 8 + LVPA: 1 + LVPB: 1 + LVPMXSA: 32 + LVPMXSB: 32 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsBlockSizePerPadMXSA: 0 + LdsBlockSizePerPadMXSB: 0 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 122112 + LdsInitCVgprs: false + 
LdsNumBytes: 122112 + LdsNumElementsAlignedA: 49152 + LdsNumElementsAlignedB: 4096 + LdsNumElementsAlignedMXSA: 3072 + LdsNumElementsAlignedMXSB: 256 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 52480 + LdsOffsetB_Blk: 118016 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMXSA: 49152 + LdsOffsetMXSA_Blk: 114688 + LdsOffsetMXSB: 52224 + LdsOffsetMXSB_Blk: 117760 + LdsOffsetMetadata: 52480 + LdsOffsetMetadata_Blk: 118016 + LdsPadA: 0 + LdsPadB: 0 + LdsPadMXSA: 0 + LdsPadMXSB: 0 + LdsPadMetadata: 0 + LocalReadVectorWidth: -1 + LocalReadVectorWidthA: 32 + LocalReadVectorWidthB: 32 + LocalReadVectorWidthMXSA: 1 + LocalReadVectorWidthMXSB: 1 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LocalWriteUseSgprMXSA: true + LocalWriteUseSgprMXSB: true + LoopIters: 2 + LoopUnroll: 256 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMXSA: 1 + MIInputPerThreadMXSB: 1 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [4, 1] + MIWaveTile: [6, 2] + MIWaveTileA: 6 + MIWaveTileB: 2 + MIWaveTileMXSA: 6 + MIWaveTileMXSB: 2 + MIWaveTileMetadata: 0 + MacroTile0: 384 + MacroTile1: 32 + MacroTileA: 384 + MacroTileB: 32 + MacroTileMXSA: 384 + MacroTileMXSB: 32 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + MinGRIncPerMfma: 3 + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonDTLTailLoopMXSA: true + NonDTLTailLoopMXSB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 4 + NonTemporalC: 0 + 
NonTemporalD: 4 + NonTemporalE: 0 + NonTemporalMXSA: 0 + NonTemporalMXSB: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 48 + NumGlobalWriteVectorsPerThread: 48 + NumLdsBlk: 2 + NumLoadsA: 12 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsCoalescedMXSA: 1 + NumLoadsCoalescedMXSB: 1 + NumLoadsMXSA: 1 + NumLoadsMXSB: 1 + NumLoadsPerpendicularA: 12 + NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularMXSA: 1 + NumLoadsPerpendicularMXSB: 1 + NumThreads: 256 + NumTotalPackedLoadsA: -1 + NumTotalPackedLoadsB: -1 + NumTotalPackedLoadsMXSA: -1 + NumTotalPackedLoadsMXSB: -1 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGROverBarrier: 0 + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 40 + SolutionNameMin: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT384x32x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT6_2_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM16_WGMXCC2_WGMXCCGn1 + SourceSwap: false + SpaceFillingAlgo: [] + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 4 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 16 + SubGroup1: 16 + 
SubGroupA: 16 + SubGroupB: 16 + SubGroupMXSA: 16 + SubGroupMXSB: 16 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: 0 + TailloopInNll: false + ThreadTile: [1, 1] + ThreadTile0: 24 + ThreadTile1: 2 + ThreadTileA: 24 + ThreadTileB: 2 + ThreadTileMXSA: 24 + ThreadTileMXSB: 2 + TransposeLDS: 1 + TransposeLDSMetadata: -1 + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMXSA: true + UnrollMajorLDSMXSB: true + Use64bShadowLimit: false + Use64bShadowLimitMX: false + UseCustomMainLoopSchedule: 0 + UseDirect32XEmulation: false + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: false + UseGeneralizedNLCOneA: false + UseGeneralizedNLCOneB: false + UseGeneralizedNLCOneMXSA: false + UseGeneralizedNLCOneMXSB: false + UseGeneralizedNLCOneMetadata: false + UseInstOffsetForGRO: 0 + UseMFMAF32XEmulation: false + UsePLRPack: 0 + UseSgprForGRO: -1 + UseSubtileImpl: true + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 1 + VectorWidthMXSA: 1 + VectorWidthMXSB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMXSA: 0 + WaveSeparateGlobalReadMXSB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [64, 4, 1] + WorkGroupMapping: 16 + WorkGroupMappingXCC: 2 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 256 + _DepthUA: 256 + _DepthUB: 256 + _DepthUMXSA: 8 + _DepthUMXSB: 8 + _DepthUMetadata: 256 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + enableLDSTrMXSA: false + enableLDSTrMXSB: false + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + tailLoopOptMXSA: false + tailLoopOptMXSB: false 
+ - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AdaptiveGemm: 0 + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1DivByMT1LowbitGT1: 0 + AssertFree1ElementMultiple: 1 + AssertKRingShiftTailWrapOnly: 0 + AssertSummationElementMultiple: 256 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BAddrInterleave: false + BaseName: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT38iWtaR8v4gfASTnshP-7Zt37fJ-3bIKngSBlzEMd0qgE= + BufferLoad: true + BufferStore: 1 + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 256 + DirectToLds: 1 + DirectToLdsA: true + DirectToLdsB: true + DirectToLdsMXSA: true + DirectToLdsMXSB: true + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprMXSA: false + DirectToVgprMXSB: false + DirectToVgprSparseMetadata: false + DtlPlusLdsBuf: 0 + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: false + ExpertSchedulingMode: 0 + ExtraLatencyForLR: 0 + ExtraMiLatencyLeft: -1 + ForceDisableShadowInit: false + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 32 + GlobalReadVectorWidthB: 32 + GlobalReadVectorWidthMXSA: 12 + GlobalReadVectorWidthMXSB: 3 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + KRingShift: false + Kernel: true + KernelLanguage: Assembly + KernelNameMin: 
Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT384x96x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT6_6_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1 + LDSTrInst: false + LSCA: 256 + LSCB: 256 + LSCMXSA: 8 + LSCMXSB: 8 + LSPA: 32 + LSPB: 32 + LSPMXSA: 384 + LSPMXSB: 96 + LVCA: 8 + LVCB: 8 + LVCMXSA: 1 + LVCMXSB: 3 + LVPA: 1 + LVPB: 1 + LVPMXSA: 32 + LVPMXSB: 32 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsBlockSizePerPadMXSA: 0 + LdsBlockSizePerPadMXSB: 0 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 130816 + LdsInitCVgprs: false + LdsNumBytes: 130816 + LdsNumElementsAlignedA: 49152 + LdsNumElementsAlignedB: 12288 + LdsNumElementsAlignedMXSA: 3072 + LdsNumElementsAlignedMXSB: 768 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 52992 + LdsOffsetB_Blk: 118528 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMXSA: 49152 + LdsOffsetMXSA_Blk: 114688 + LdsOffsetMXSB: 52224 + LdsOffsetMXSB_Blk: 117760 + LdsOffsetMetadata: 52992 + LdsOffsetMetadata_Blk: 118528 + LdsPadA: 0 + LdsPadB: 0 + LdsPadMXSA: 0 + LdsPadMXSB: 0 + LdsPadMetadata: 0 + LocalReadVectorWidth: -1 + LocalReadVectorWidthA: 32 + LocalReadVectorWidthB: 32 + LocalReadVectorWidthMXSA: 1 + LocalReadVectorWidthMXSB: 1 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LocalWriteUseSgprMXSA: true + LocalWriteUseSgprMXSB: true + LoopIters: 2 + LoopUnroll: 256 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + 
MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMXSA: 1 + MIInputPerThreadMXSB: 1 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [4, 1] + MIWaveTile: [6, 6] + MIWaveTileA: 6 + MIWaveTileB: 6 + MIWaveTileMXSA: 6 + MIWaveTileMXSB: 6 + MIWaveTileMetadata: 0 + MacroTile0: 384 + MacroTile1: 96 + MacroTileA: 384 + MacroTileB: 96 + MacroTileMXSA: 384 + MacroTileMXSB: 96 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + MinGRIncPerMfma: 3 + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonDTLTailLoopMXSA: true + NonDTLTailLoopMXSB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 4 + NonTemporalC: 0 + NonTemporalD: 4 + NonTemporalE: 0 + NonTemporalMXSA: 0 + NonTemporalMXSB: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 144 + NumGlobalWriteVectorsPerThread: 144 + NumLdsBlk: 2 + NumLoadsA: 12 + NumLoadsB: 3 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsCoalescedMXSA: 1 + NumLoadsCoalescedMXSB: 1 + NumLoadsMXSA: 1 + NumLoadsMXSB: 1 + NumLoadsPerpendicularA: 12 + NumLoadsPerpendicularB: 3 + NumLoadsPerpendicularMXSA: 1 + NumLoadsPerpendicularMXSB: 1 + NumThreads: 256 + NumTotalPackedLoadsA: -1 + NumTotalPackedLoadsB: -1 + NumTotalPackedLoadsMXSA: -1 + NumTotalPackedLoadsMXSB: -1 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGROverBarrier: 0 + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 41 + 
SolutionNameMin: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT384x96x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT6_6_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM16_WGMXCC2_WGMXCCGn1 + SourceSwap: false + SpaceFillingAlgo: [] + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 4 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SubGroupMXSA: 16 + SubGroupMXSB: 16 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: 0 + TailloopInNll: false + ThreadTile: [1, 1] + ThreadTile0: 24 + ThreadTile1: 6 + ThreadTileA: 24 + ThreadTileB: 6 + ThreadTileMXSA: 24 + ThreadTileMXSB: 6 + TransposeLDS: 1 + TransposeLDSMetadata: -1 + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMXSA: true + UnrollMajorLDSMXSB: true + Use64bShadowLimit: false + Use64bShadowLimitMX: false + UseCustomMainLoopSchedule: 0 + UseDirect32XEmulation: false + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: false + UseGeneralizedNLCOneA: false + UseGeneralizedNLCOneB: false + UseGeneralizedNLCOneMXSA: false + UseGeneralizedNLCOneMXSB: false + UseGeneralizedNLCOneMetadata: false + UseInstOffsetForGRO: 0 + UseMFMAF32XEmulation: false + UsePLRPack: 0 + UseSgprForGRO: -1 + UseSubtileImpl: true + Valid: true + VectorStore: -1 + VectorWidthA: 1 
+ VectorWidthB: 1 + VectorWidthMXSA: 1 + VectorWidthMXSB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMXSA: 0 + WaveSeparateGlobalReadMXSB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [64, 4, 1] + WorkGroupMapping: 16 + WorkGroupMappingXCC: 2 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 256 + _DepthUA: 256 + _DepthUB: 256 + _DepthUMXSA: 8 + _DepthUMXSB: 8 + _DepthUMetadata: 256 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + enableLDSTrMXSA: false + enableLDSTrMXSB: false + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + tailLoopOptMXSA: false + tailLoopOptMXSB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AdaptiveGemm: 0 + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1DivByMT1LowbitGT1: 0 + AssertFree1ElementMultiple: 1 + AssertKRingShiftTailWrapOnly: 0 + AssertSummationElementMultiple: 256 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BAddrInterleave: false + BaseName: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT51My4B9dUWEKuVinUMAqyTn-9nVKbOgw4LZNuajpvbgno= + BufferLoad: true + BufferStore: 1 + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 256 + DirectToLds: 1 + DirectToLdsA: true + DirectToLdsB: true + DirectToLdsMXSA: true + DirectToLdsMXSB: true + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprMXSA: false + DirectToVgprMXSB: false + DirectToVgprSparseMetadata: false 
+ DtlPlusLdsBuf: 0 + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: false + ExpertSchedulingMode: 0 + ExtraLatencyForLR: 0 + ExtraMiLatencyLeft: -1 + ForceDisableShadowInit: false + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 32 + GlobalReadVectorWidthB: 32 + GlobalReadVectorWidthMXSA: 16 + GlobalReadVectorWidthMXSB: 1 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + KRingShift: false + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT512x32x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT8_2_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1 + LDSTrInst: false + LSCA: 256 + LSCB: 256 + LSCMXSA: 8 + LSCMXSB: 8 + LSPA: 32 + LSPB: 32 + LSPMXSA: 512 + LSPMXSB: 32 + LVCA: 8 + LVCB: 8 + LVCMXSA: 1 + LVCMXSB: 8 + LVPA: 1 + LVPB: 1 + LVPMXSA: 32 + LVPMXSB: 32 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsBlockSizePerPadMXSA: 0 + LdsBlockSizePerPadMXSB: 0 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 147968 + LdsInitCVgprs: false + LdsNumBytes: 147968 + 
LdsNumElementsAlignedA: 65536 + LdsNumElementsAlignedB: 4096 + LdsNumElementsAlignedMXSA: 4096 + LdsNumElementsAlignedMXSB: 256 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 73984 + LdsOffsetB: 69888 + LdsOffsetB_Blk: 143872 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMXSA: 65536 + LdsOffsetMXSA_Blk: 139520 + LdsOffsetMXSB: 69632 + LdsOffsetMXSB_Blk: 143616 + LdsOffsetMetadata: 69888 + LdsOffsetMetadata_Blk: 143872 + LdsPadA: 0 + LdsPadB: 0 + LdsPadMXSA: 0 + LdsPadMXSB: 0 + LdsPadMetadata: 0 + LocalReadVectorWidth: -1 + LocalReadVectorWidthA: 32 + LocalReadVectorWidthB: 32 + LocalReadVectorWidthMXSA: 1 + LocalReadVectorWidthMXSB: 1 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LocalWriteUseSgprMXSA: true + LocalWriteUseSgprMXSB: true + LoopIters: 2 + LoopUnroll: 256 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMXSA: 1 + MIInputPerThreadMXSB: 1 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [4, 1] + MIWaveTile: [8, 2] + MIWaveTileA: 8 + MIWaveTileB: 2 + MIWaveTileMXSA: 8 + MIWaveTileMXSB: 2 + MIWaveTileMetadata: 0 + MacroTile0: 512 + MacroTile1: 32 + MacroTileA: 512 + MacroTileB: 32 + MacroTileMXSA: 512 + MacroTileMXSB: 32 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + MinGRIncPerMfma: 3 + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonDTLTailLoopMXSA: true + NonDTLTailLoopMXSB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 4 + NonTemporalC: 0 + NonTemporalD: 4 + NonTemporalE: 0 
+ NonTemporalMXSA: 0 + NonTemporalMXSB: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 64 + NumLdsBlk: 2 + NumLoadsA: 16 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsCoalescedMXSA: 1 + NumLoadsCoalescedMXSB: 1 + NumLoadsMXSA: 1 + NumLoadsMXSB: 1 + NumLoadsPerpendicularA: 16 + NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularMXSA: 1 + NumLoadsPerpendicularMXSB: 1 + NumThreads: 256 + NumTotalPackedLoadsA: -1 + NumTotalPackedLoadsB: -1 + NumTotalPackedLoadsMXSA: -1 + NumTotalPackedLoadsMXSB: -1 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGROverBarrier: 0 + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 42 + SolutionNameMin: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT512x32x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT8_2_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM16_WGMXCC2_WGMXCCGn1 + SourceSwap: false + SpaceFillingAlgo: [] + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: true + StoreSyncOpt: 0 + StoreVectorWidth: 4 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + 
SubGroupMXSA: 16 + SubGroupMXSB: 16 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: 0 + TailloopInNll: false + ThreadTile: [1, 1] + ThreadTile0: 32 + ThreadTile1: 2 + ThreadTileA: 32 + ThreadTileB: 2 + ThreadTileMXSA: 32 + ThreadTileMXSB: 2 + TransposeLDS: 1 + TransposeLDSMetadata: -1 + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMXSA: true + UnrollMajorLDSMXSB: true + Use64bShadowLimit: false + Use64bShadowLimitMX: false + UseCustomMainLoopSchedule: 0 + UseDirect32XEmulation: false + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: false + UseGeneralizedNLCOneA: false + UseGeneralizedNLCOneB: false + UseGeneralizedNLCOneMXSA: false + UseGeneralizedNLCOneMXSB: false + UseGeneralizedNLCOneMetadata: false + UseInstOffsetForGRO: 0 + UseMFMAF32XEmulation: false + UsePLRPack: 0 + UseSgprForGRO: -1 + UseSubtileImpl: true + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 1 + VectorWidthMXSA: 1 + VectorWidthMXSB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMXSA: 0 + WaveSeparateGlobalReadMXSB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [64, 4, 1] + WorkGroupMapping: 16 + WorkGroupMappingXCC: 2 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 256 + _DepthUA: 256 + _DepthUB: 256 + _DepthUMXSA: 8 + _DepthUMXSB: 8 + _DepthUMetadata: 256 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + enableLDSTrMXSA: false + enableLDSTrMXSB: false + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + tailLoopOptMXSA: false + tailLoopOptMXSB: false + - 1LDSBuffer: 0 + 
ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AdaptiveGemm: 0 + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1DivByMT1LowbitGT1: 0 + AssertFree1ElementMultiple: 1 + AssertKRingShiftTailWrapOnly: 0 + AssertSummationElementMultiple: 256 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BAddrInterleave: false + BaseName: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT32dv3nH_ug3zWfzXFJEJHJUvkpjXQjAQVml90QMZvwYvs= + BufferLoad: true + BufferStore: 1 + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 256 + DirectToLds: 1 + DirectToLdsA: true + DirectToLdsB: true + DirectToLdsMXSA: true + DirectToLdsMXSB: true + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprMXSA: false + DirectToVgprMXSB: false + DirectToVgprSparseMetadata: false + DtlPlusLdsBuf: 0 + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: false + ExpertSchedulingMode: 0 + ExtraLatencyForLR: 0 + ExtraMiLatencyLeft: -1 + ForceDisableShadowInit: false + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 32 + GlobalReadVectorWidthB: 32 + GlobalReadVectorWidthMXSA: 1 + GlobalReadVectorWidthMXSB: 4 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + KRingShift: false + Kernel: true + KernelLanguage: Assembly + KernelNameMin: 
Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT32x128x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT2_2_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_16_1 + LDSTrInst: false + LSCA: 256 + LSCB: 256 + LSCMXSA: 8 + LSCMXSB: 8 + LSPA: 32 + LSPB: 32 + LSPMXSA: 32 + LSPMXSB: 128 + LVCA: 8 + LVCB: 8 + LVCMXSA: 8 + LVCMXSB: 2 + LVPA: 1 + LVPB: 1 + LVPMXSA: 32 + LVPMXSB: 32 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsBlockSizePerPadMXSA: 0 + LdsBlockSizePerPadMXSB: 0 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 54528 + LdsInitCVgprs: false + LdsNumBytes: 54528 + LdsNumElementsAlignedA: 4096 + LdsNumElementsAlignedB: 16384 + LdsNumElementsAlignedMXSA: 256 + LdsNumElementsAlignedMXSB: 1024 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 32768 + LdsOffsetB: 5376 + LdsOffsetB_Blk: 38144 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMXSA: 4096 + LdsOffsetMXSA_Blk: 36864 + LdsOffsetMXSB: 4352 + LdsOffsetMXSB_Blk: 37120 + LdsOffsetMetadata: 5376 + LdsOffsetMetadata_Blk: 38144 + LdsPadA: 0 + LdsPadB: 0 + LdsPadMXSA: 0 + LdsPadMXSB: 0 + LdsPadMetadata: 0 + LocalReadVectorWidth: -1 + LocalReadVectorWidthA: 32 + LocalReadVectorWidthB: 32 + LocalReadVectorWidthMXSA: 1 + LocalReadVectorWidthMXSB: 1 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LocalWriteUseSgprMXSA: true + LocalWriteUseSgprMXSB: true + LoopIters: 2 + LoopUnroll: 256 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 
32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMXSA: 1 + MIInputPerThreadMXSB: 1 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [1, 4] + MIWaveTile: [2, 2] + MIWaveTileA: 2 + MIWaveTileB: 2 + MIWaveTileMXSA: 2 + MIWaveTileMXSB: 2 + MIWaveTileMetadata: 0 + MacroTile0: 32 + MacroTile1: 128 + MacroTileA: 32 + MacroTileB: 128 + MacroTileMXSA: 32 + MacroTileMXSB: 128 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + MinGRIncPerMfma: 3 + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonDTLTailLoopMXSA: true + NonDTLTailLoopMXSB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 4 + NonTemporalC: 0 + NonTemporalD: 4 + NonTemporalE: 0 + NonTemporalMXSA: 0 + NonTemporalMXSB: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 16 + NumLdsBlk: 2 + NumLoadsA: 1 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsCoalescedMXSA: 1 + NumLoadsCoalescedMXSB: 1 + NumLoadsMXSA: 1 + NumLoadsMXSB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 4 + NumLoadsPerpendicularMXSA: 1 + NumLoadsPerpendicularMXSB: 1 + NumThreads: 256 + NumTotalPackedLoadsA: -1 + NumTotalPackedLoadsB: -1 + NumTotalPackedLoadsMXSA: -1 + NumTotalPackedLoadsMXSB: -1 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGROverBarrier: 0 + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 43 + SolutionNameMin: 
Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT32x128x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT2_2_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM16_WGMXCC2_WGMXCCGn1 + SourceSwap: false + SpaceFillingAlgo: [] + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 4 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 4 + SubGroup1: 64 + SubGroupA: 4 + SubGroupB: 64 + SubGroupMXSA: 4 + SubGroupMXSB: 64 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: 0 + TailloopInNll: false + ThreadTile: [1, 1] + ThreadTile0: 8 + ThreadTile1: 2 + ThreadTileA: 8 + ThreadTileB: 2 + ThreadTileMXSA: 8 + ThreadTileMXSB: 2 + TransposeLDS: 1 + TransposeLDSMetadata: -1 + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMXSA: true + UnrollMajorLDSMXSB: true + Use64bShadowLimit: false + Use64bShadowLimitMX: false + UseCustomMainLoopSchedule: 0 + UseDirect32XEmulation: false + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: false + UseGeneralizedNLCOneA: false + UseGeneralizedNLCOneB: false + UseGeneralizedNLCOneMXSA: false + UseGeneralizedNLCOneMXSB: false + UseGeneralizedNLCOneMetadata: false + UseInstOffsetForGRO: 0 + UseMFMAF32XEmulation: false + UsePLRPack: 0 + UseSgprForGRO: -1 + UseSubtileImpl: true + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 1 + 
VectorWidthMXSA: 1 + VectorWidthMXSB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMXSA: 0 + WaveSeparateGlobalReadMXSB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 16 + WorkGroupMappingXCC: 2 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 256 + _DepthUA: 256 + _DepthUB: 256 + _DepthUMXSA: 8 + _DepthUMXSB: 8 + _DepthUMetadata: 256 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + enableLDSTrMXSA: false + enableLDSTrMXSB: false + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + tailLoopOptMXSA: false + tailLoopOptMXSB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AdaptiveGemm: 0 + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1DivByMT1LowbitGT1: 0 + AssertFree1ElementMultiple: 1 + AssertKRingShiftTailWrapOnly: 0 + AssertSummationElementMultiple: 256 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BAddrInterleave: false + BaseName: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT96VdJ139l5ExTu5vrbSyr9YIDnqGsFY6rZMfGBgYGwyok= + BufferLoad: true + BufferStore: 1 + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 256 + DirectToLds: 1 + DirectToLdsA: true + DirectToLdsB: true + DirectToLdsMXSA: true + DirectToLdsMXSB: true + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprMXSA: false + DirectToVgprMXSB: false + DirectToVgprSparseMetadata: false + DtlPlusLdsBuf: 0 
+ EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: false + ExpertSchedulingMode: 0 + ExtraLatencyForLR: 0 + ExtraMiLatencyLeft: -1 + ForceDisableShadowInit: false + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 32 + GlobalReadVectorWidthB: 32 + GlobalReadVectorWidthMXSA: 3 + GlobalReadVectorWidthMXSB: 4 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + KRingShift: false + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT96x128x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT6_2_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_16_1 + LDSTrInst: false + LSCA: 256 + LSCB: 256 + LSCMXSA: 8 + LSCMXSB: 8 + LSPA: 32 + LSPB: 32 + LSPMXSA: 96 + LSPMXSB: 128 + LVCA: 8 + LVCB: 8 + LVCMXSA: 3 + LVCMXSB: 2 + LVPA: 1 + LVPB: 1 + LVPMXSA: 32 + LVPMXSB: 32 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsBlockSizePerPadMXSA: 0 + LdsBlockSizePerPadMXSB: 0 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 63232 + LdsInitCVgprs: false + LdsNumBytes: 63232 + LdsNumElementsAlignedA: 12288 + 
LdsNumElementsAlignedB: 16384 + LdsNumElementsAlignedMXSA: 768 + LdsNumElementsAlignedMXSB: 1024 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 32768 + LdsOffsetB: 14080 + LdsOffsetB_Blk: 46848 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMXSA: 12288 + LdsOffsetMXSA_Blk: 45056 + LdsOffsetMXSB: 13056 + LdsOffsetMXSB_Blk: 45824 + LdsOffsetMetadata: 14080 + LdsOffsetMetadata_Blk: 46848 + LdsPadA: 0 + LdsPadB: 0 + LdsPadMXSA: 0 + LdsPadMXSB: 0 + LdsPadMetadata: 0 + LocalReadVectorWidth: -1 + LocalReadVectorWidthA: 32 + LocalReadVectorWidthB: 32 + LocalReadVectorWidthMXSA: 1 + LocalReadVectorWidthMXSB: 1 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LocalWriteUseSgprMXSA: true + LocalWriteUseSgprMXSB: true + LoopIters: 2 + LoopUnroll: 256 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMXSA: 1 + MIInputPerThreadMXSB: 1 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [1, 4] + MIWaveTile: [6, 2] + MIWaveTileA: 6 + MIWaveTileB: 2 + MIWaveTileMXSA: 6 + MIWaveTileMXSB: 2 + MIWaveTileMetadata: 0 + MacroTile0: 96 + MacroTile1: 128 + MacroTileA: 96 + MacroTileB: 128 + MacroTileMXSA: 96 + MacroTileMXSB: 128 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + MinGRIncPerMfma: 3 + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonDTLTailLoopMXSA: true + NonDTLTailLoopMXSB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 4 + NonTemporalC: 0 + NonTemporalD: 4 + NonTemporalE: 0 + NonTemporalMXSA: 0 + 
NonTemporalMXSB: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 48 + NumGlobalWriteVectorsPerThread: 48 + NumLdsBlk: 2 + NumLoadsA: 3 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsCoalescedMXSA: 1 + NumLoadsCoalescedMXSB: 1 + NumLoadsMXSA: 1 + NumLoadsMXSB: 1 + NumLoadsPerpendicularA: 3 + NumLoadsPerpendicularB: 4 + NumLoadsPerpendicularMXSA: 1 + NumLoadsPerpendicularMXSB: 1 + NumThreads: 256 + NumTotalPackedLoadsA: -1 + NumTotalPackedLoadsB: -1 + NumTotalPackedLoadsMXSA: -1 + NumTotalPackedLoadsMXSB: -1 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGROverBarrier: 0 + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 44 + SolutionNameMin: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT96x128x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT6_2_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM16_WGMXCC2_WGMXCCGn1 + SourceSwap: false + SpaceFillingAlgo: [] + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 4 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 4 + SubGroup1: 64 + SubGroupA: 4 + SubGroupB: 64 + SubGroupMXSA: 4 + SubGroupMXSB: 64 + 
SuppressNoLoadLoop: false + SwapGlobalReadOrder: 0 + TailloopInNll: false + ThreadTile: [1, 1] + ThreadTile0: 24 + ThreadTile1: 2 + ThreadTileA: 24 + ThreadTileB: 2 + ThreadTileMXSA: 24 + ThreadTileMXSB: 2 + TransposeLDS: 1 + TransposeLDSMetadata: -1 + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMXSA: true + UnrollMajorLDSMXSB: true + Use64bShadowLimit: false + Use64bShadowLimitMX: false + UseCustomMainLoopSchedule: 0 + UseDirect32XEmulation: false + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: false + UseGeneralizedNLCOneA: false + UseGeneralizedNLCOneB: false + UseGeneralizedNLCOneMXSA: false + UseGeneralizedNLCOneMXSB: false + UseGeneralizedNLCOneMetadata: false + UseInstOffsetForGRO: 0 + UseMFMAF32XEmulation: false + UsePLRPack: 0 + UseSgprForGRO: -1 + UseSubtileImpl: true + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 1 + VectorWidthMXSA: 1 + VectorWidthMXSB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMXSA: 0 + WaveSeparateGlobalReadMXSB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 16 + WorkGroupMappingXCC: 2 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 256 + _DepthUA: 256 + _DepthUB: 256 + _DepthUMXSA: 8 + _DepthUMXSB: 8 + _DepthUMetadata: 256 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + enableLDSTrMXSA: false + enableLDSTrMXSB: false + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + tailLoopOptMXSA: false + tailLoopOptMXSB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: false 
+ ActivationFused: true + AdaptiveGemm: 0 + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1DivByMT1LowbitGT1: 0 + AssertFree1ElementMultiple: 1 + AssertKRingShiftTailWrapOnly: 0 + AssertSummationElementMultiple: 256 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BAddrInterleave: false + BaseName: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT32pDTSO9Ei8iprYUZVwZ8MVK8XTu8S8ugrlkQJj7C1FJQ= + BufferLoad: true + BufferStore: 1 + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 256 + DirectToLds: 1 + DirectToLdsA: true + DirectToLdsB: true + DirectToLdsMXSA: true + DirectToLdsMXSB: true + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprMXSA: false + DirectToVgprMXSB: false + DirectToVgprSparseMetadata: false + DtlPlusLdsBuf: 0 + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: false + ExpertSchedulingMode: 0 + ExtraLatencyForLR: 0 + ExtraMiLatencyLeft: -1 + ForceDisableShadowInit: false + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 32 + GlobalReadVectorWidthB: 32 + GlobalReadVectorWidthMXSA: 1 + GlobalReadVectorWidthMXSB: 4 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + KRingShift: false + Kernel: true + KernelLanguage: Assembly + KernelNameMin: 
Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT32x256x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT2_4_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_16_1 + LDSTrInst: false + LSCA: 256 + LSCB: 256 + LSCMXSA: 8 + LSCMXSB: 8 + LSPA: 32 + LSPB: 32 + LSPMXSA: 32 + LSPMXSB: 128 + LVCA: 8 + LVCB: 8 + LVCMXSA: 8 + LVCMXSB: 2 + LVPA: 1 + LVPB: 1 + LVPMXSA: 32 + LVPMXSB: 32 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsBlockSizePerPadMXSA: 0 + LdsBlockSizePerPadMXSB: 0 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 104704 + LdsInitCVgprs: false + LdsNumBytes: 104704 + LdsNumElementsAlignedA: 4096 + LdsNumElementsAlignedB: 32768 + LdsNumElementsAlignedMXSA: 256 + LdsNumElementsAlignedMXSB: 2048 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 6400 + LdsOffsetB_Blk: 71936 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMXSA: 4096 + LdsOffsetMXSA_Blk: 69632 + LdsOffsetMXSB: 4352 + LdsOffsetMXSB_Blk: 69888 + LdsOffsetMetadata: 6400 + LdsOffsetMetadata_Blk: 71936 + LdsPadA: 0 + LdsPadB: 0 + LdsPadMXSA: 0 + LdsPadMXSB: 0 + LdsPadMetadata: 0 + LocalReadVectorWidth: -1 + LocalReadVectorWidthA: 32 + LocalReadVectorWidthB: 32 + LocalReadVectorWidthMXSA: 1 + LocalReadVectorWidthMXSB: 1 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LocalWriteUseSgprMXSA: true + LocalWriteUseSgprMXSB: true + LoopIters: 2 + LoopUnroll: 256 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + 
MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMXSA: 1 + MIInputPerThreadMXSB: 1 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [1, 4] + MIWaveTile: [2, 4] + MIWaveTileA: 2 + MIWaveTileB: 4 + MIWaveTileMXSA: 2 + MIWaveTileMXSB: 4 + MIWaveTileMetadata: 0 + MacroTile0: 32 + MacroTile1: 256 + MacroTileA: 32 + MacroTileB: 256 + MacroTileMXSA: 32 + MacroTileMXSB: 256 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + MinGRIncPerMfma: 3 + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonDTLTailLoopMXSA: true + NonDTLTailLoopMXSB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 4 + NonTemporalC: 0 + NonTemporalD: 4 + NonTemporalE: 0 + NonTemporalMXSA: 0 + NonTemporalMXSB: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 32 + NumLdsBlk: 2 + NumLoadsA: 1 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsCoalescedMXSA: 1 + NumLoadsCoalescedMXSB: 1 + NumLoadsMXSA: 1 + NumLoadsMXSB: 2 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 8 + NumLoadsPerpendicularMXSA: 1 + NumLoadsPerpendicularMXSB: 2 + NumThreads: 256 + NumTotalPackedLoadsA: -1 + NumTotalPackedLoadsB: -1 + NumTotalPackedLoadsMXSA: -1 + NumTotalPackedLoadsMXSB: -1 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGROverBarrier: 0 + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 45 + SolutionNameMin: 
Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT32x256x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT2_4_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM16_WGMXCC2_WGMXCCGn1 + SourceSwap: false + SpaceFillingAlgo: [] + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 4 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 4 + SubGroup1: 64 + SubGroupA: 4 + SubGroupB: 64 + SubGroupMXSA: 4 + SubGroupMXSB: 64 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: 0 + TailloopInNll: false + ThreadTile: [1, 1] + ThreadTile0: 8 + ThreadTile1: 4 + ThreadTileA: 8 + ThreadTileB: 4 + ThreadTileMXSA: 8 + ThreadTileMXSB: 4 + TransposeLDS: 1 + TransposeLDSMetadata: -1 + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMXSA: true + UnrollMajorLDSMXSB: true + Use64bShadowLimit: false + Use64bShadowLimitMX: false + UseCustomMainLoopSchedule: 0 + UseDirect32XEmulation: false + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: false + UseGeneralizedNLCOneA: false + UseGeneralizedNLCOneB: false + UseGeneralizedNLCOneMXSA: false + UseGeneralizedNLCOneMXSB: false + UseGeneralizedNLCOneMetadata: false + UseInstOffsetForGRO: 0 + UseMFMAF32XEmulation: false + UsePLRPack: 0 + UseSgprForGRO: -1 + UseSubtileImpl: true + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 1 + 
VectorWidthMXSA: 1 + VectorWidthMXSB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMXSA: 0 + WaveSeparateGlobalReadMXSB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 16 + WorkGroupMappingXCC: 2 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 256 + _DepthUA: 256 + _DepthUB: 256 + _DepthUMXSA: 8 + _DepthUMXSB: 8 + _DepthUMetadata: 256 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + enableLDSTrMXSA: false + enableLDSTrMXSB: false + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + tailLoopOptMXSA: false + tailLoopOptMXSB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AdaptiveGemm: 0 + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1DivByMT1LowbitGT1: 0 + AssertFree1ElementMultiple: 1 + AssertKRingShiftTailWrapOnly: 0 + AssertSummationElementMultiple: 256 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BAddrInterleave: false + BaseName: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT96Cj2Z80w1PkWDv8Q4NtOmErGoRYnlYoT1xjNw40Qn6y8= + BufferLoad: true + BufferStore: 1 + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 256 + DirectToLds: 1 + DirectToLdsA: true + DirectToLdsB: true + DirectToLdsMXSA: true + DirectToLdsMXSB: true + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprMXSA: false + DirectToVgprMXSB: false + DirectToVgprSparseMetadata: false + DtlPlusLdsBuf: 0 
+ EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: false + ExpertSchedulingMode: 0 + ExtraLatencyForLR: 0 + ExtraMiLatencyLeft: -1 + ForceDisableShadowInit: false + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 32 + GlobalReadVectorWidthB: 32 + GlobalReadVectorWidthMXSA: 3 + GlobalReadVectorWidthMXSB: 4 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + KRingShift: false + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT96x256x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT6_4_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_16_1 + LDSTrInst: false + LSCA: 256 + LSCB: 256 + LSCMXSA: 8 + LSCMXSB: 8 + LSPA: 32 + LSPB: 32 + LSPMXSA: 96 + LSPMXSB: 128 + LVCA: 8 + LVCB: 8 + LVCMXSA: 3 + LVCMXSB: 2 + LVPA: 1 + LVPB: 1 + LVPMXSA: 32 + LVPMXSB: 32 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsBlockSizePerPadMXSA: 0 + LdsBlockSizePerPadMXSB: 0 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 113408 + LdsInitCVgprs: false + LdsNumBytes: 113408 + LdsNumElementsAlignedA: 12288 + 
LdsNumElementsAlignedB: 32768 + LdsNumElementsAlignedMXSA: 768 + LdsNumElementsAlignedMXSB: 2048 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 15104 + LdsOffsetB_Blk: 80640 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMXSA: 12288 + LdsOffsetMXSA_Blk: 77824 + LdsOffsetMXSB: 13056 + LdsOffsetMXSB_Blk: 78592 + LdsOffsetMetadata: 15104 + LdsOffsetMetadata_Blk: 80640 + LdsPadA: 0 + LdsPadB: 0 + LdsPadMXSA: 0 + LdsPadMXSB: 0 + LdsPadMetadata: 0 + LocalReadVectorWidth: -1 + LocalReadVectorWidthA: 32 + LocalReadVectorWidthB: 32 + LocalReadVectorWidthMXSA: 1 + LocalReadVectorWidthMXSB: 1 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LocalWriteUseSgprMXSA: true + LocalWriteUseSgprMXSB: true + LoopIters: 2 + LoopUnroll: 256 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMXSA: 1 + MIInputPerThreadMXSB: 1 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [1, 4] + MIWaveTile: [6, 4] + MIWaveTileA: 6 + MIWaveTileB: 4 + MIWaveTileMXSA: 6 + MIWaveTileMXSB: 4 + MIWaveTileMetadata: 0 + MacroTile0: 96 + MacroTile1: 256 + MacroTileA: 96 + MacroTileB: 256 + MacroTileMXSA: 96 + MacroTileMXSB: 256 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + MinGRIncPerMfma: 3 + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonDTLTailLoopMXSA: true + NonDTLTailLoopMXSB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 4 + NonTemporalC: 0 + NonTemporalD: 4 + NonTemporalE: 0 + NonTemporalMXSA: 0 + 
NonTemporalMXSB: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 96 + NumGlobalWriteVectorsPerThread: 96 + NumLdsBlk: 2 + NumLoadsA: 3 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsCoalescedMXSA: 1 + NumLoadsCoalescedMXSB: 1 + NumLoadsMXSA: 1 + NumLoadsMXSB: 2 + NumLoadsPerpendicularA: 3 + NumLoadsPerpendicularB: 8 + NumLoadsPerpendicularMXSA: 1 + NumLoadsPerpendicularMXSB: 2 + NumThreads: 256 + NumTotalPackedLoadsA: -1 + NumTotalPackedLoadsB: -1 + NumTotalPackedLoadsMXSA: -1 + NumTotalPackedLoadsMXSB: -1 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGROverBarrier: 0 + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 46 + SolutionNameMin: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT96x256x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT6_4_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM16_WGMXCC2_WGMXCCGn1 + SourceSwap: false + SpaceFillingAlgo: [] + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 4 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 4 + SubGroup1: 64 + SubGroupA: 4 + SubGroupB: 64 + SubGroupMXSA: 4 + SubGroupMXSB: 64 + 
SuppressNoLoadLoop: false + SwapGlobalReadOrder: 0 + TailloopInNll: false + ThreadTile: [1, 1] + ThreadTile0: 24 + ThreadTile1: 4 + ThreadTileA: 24 + ThreadTileB: 4 + ThreadTileMXSA: 24 + ThreadTileMXSB: 4 + TransposeLDS: 1 + TransposeLDSMetadata: -1 + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMXSA: true + UnrollMajorLDSMXSB: true + Use64bShadowLimit: false + Use64bShadowLimitMX: false + UseCustomMainLoopSchedule: 0 + UseDirect32XEmulation: false + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: false + UseGeneralizedNLCOneA: false + UseGeneralizedNLCOneB: false + UseGeneralizedNLCOneMXSA: false + UseGeneralizedNLCOneMXSB: false + UseGeneralizedNLCOneMetadata: false + UseInstOffsetForGRO: 0 + UseMFMAF32XEmulation: false + UsePLRPack: 0 + UseSgprForGRO: -1 + UseSubtileImpl: true + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 1 + VectorWidthMXSA: 1 + VectorWidthMXSB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMXSA: 0 + WaveSeparateGlobalReadMXSB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 16 + WorkGroupMappingXCC: 2 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 256 + _DepthUA: 256 + _DepthUB: 256 + _DepthUMXSA: 8 + _DepthUMXSB: 8 + _DepthUMetadata: 256 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + enableLDSTrMXSA: false + enableLDSTrMXSB: false + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + tailLoopOptMXSA: false + tailLoopOptMXSB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: false 
+ ActivationFused: true + AdaptiveGemm: 0 + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1DivByMT1LowbitGT1: 0 + AssertFree1ElementMultiple: 1 + AssertKRingShiftTailWrapOnly: 0 + AssertSummationElementMultiple: 256 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BAddrInterleave: false + BaseName: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT16rkdIx0xJ9AdmxSLTSuPL_HwvXURN3d9JBsT0i2N2r4A= + BufferLoad: true + BufferStore: 1 + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 256 + DirectToLds: 1 + DirectToLdsA: true + DirectToLdsB: true + DirectToLdsMXSA: true + DirectToLdsMXSB: true + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprMXSA: false + DirectToVgprMXSB: false + DirectToVgprSparseMetadata: false + DtlPlusLdsBuf: 0 + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: false + ExpertSchedulingMode: 0 + ExtraLatencyForLR: 0 + ExtraMiLatencyLeft: -1 + ForceDisableShadowInit: false + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 32 + GlobalReadVectorWidthB: 32 + GlobalReadVectorWidthMXSA: 5 + GlobalReadVectorWidthMXSB: 4 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + KRingShift: false + Kernel: true + KernelLanguage: Assembly + KernelNameMin: 
Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT160x256x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT10_4_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_16_1 + LDSTrInst: false + LSCA: 256 + LSCB: 256 + LSCMXSA: 8 + LSCMXSB: 8 + LSPA: 32 + LSPB: 32 + LSPMXSA: 160 + LSPMXSB: 128 + LVCA: 8 + LVCB: 8 + LVCMXSA: 2 + LVCMXSB: 2 + LVPA: 1 + LVPB: 1 + LVPMXSA: 32 + LVPMXSB: 32 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsBlockSizePerPadMXSA: 0 + LdsBlockSizePerPadMXSB: 0 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 122112 + LdsInitCVgprs: false + LdsNumBytes: 122112 + LdsNumElementsAlignedA: 20480 + LdsNumElementsAlignedB: 32768 + LdsNumElementsAlignedMXSA: 1280 + LdsNumElementsAlignedMXSB: 2048 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 23808 + LdsOffsetB_Blk: 89344 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMXSA: 20480 + LdsOffsetMXSA_Blk: 86016 + LdsOffsetMXSB: 21760 + LdsOffsetMXSB_Blk: 87296 + LdsOffsetMetadata: 23808 + LdsOffsetMetadata_Blk: 89344 + LdsPadA: 0 + LdsPadB: 0 + LdsPadMXSA: 0 + LdsPadMXSB: 0 + LdsPadMetadata: 0 + LocalReadVectorWidth: -1 + LocalReadVectorWidthA: 32 + LocalReadVectorWidthB: 32 + LocalReadVectorWidthMXSA: 1 + LocalReadVectorWidthMXSB: 1 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LocalWriteUseSgprMXSA: true + LocalWriteUseSgprMXSB: true + LoopIters: 2 + LoopUnroll: 256 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + 
MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMXSA: 1 + MIInputPerThreadMXSB: 1 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [1, 4] + MIWaveTile: [10, 4] + MIWaveTileA: 10 + MIWaveTileB: 4 + MIWaveTileMXSA: 10 + MIWaveTileMXSB: 4 + MIWaveTileMetadata: 0 + MacroTile0: 160 + MacroTile1: 256 + MacroTileA: 160 + MacroTileB: 256 + MacroTileMXSA: 160 + MacroTileMXSB: 256 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + MinGRIncPerMfma: 3 + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonDTLTailLoopMXSA: true + NonDTLTailLoopMXSB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 4 + NonTemporalC: 0 + NonTemporalD: 4 + NonTemporalE: 0 + NonTemporalMXSA: 0 + NonTemporalMXSB: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 160 + NumGlobalWriteVectorsPerThread: 160 + NumLdsBlk: 2 + NumLoadsA: 5 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsCoalescedMXSA: 1 + NumLoadsCoalescedMXSB: 1 + NumLoadsMXSA: 1 + NumLoadsMXSB: 2 + NumLoadsPerpendicularA: 5 + NumLoadsPerpendicularB: 8 + NumLoadsPerpendicularMXSA: 1 + NumLoadsPerpendicularMXSB: 2 + NumThreads: 256 + NumTotalPackedLoadsA: -1 + NumTotalPackedLoadsB: -1 + NumTotalPackedLoadsMXSA: -1 + NumTotalPackedLoadsMXSB: -1 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGROverBarrier: 0 + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 47 + 
SolutionNameMin: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT160x256x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT10_4_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM16_WGMXCC2_WGMXCCGn1 + SourceSwap: false + SpaceFillingAlgo: [] + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 4 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 4 + SubGroup1: 64 + SubGroupA: 4 + SubGroupB: 64 + SubGroupMXSA: 4 + SubGroupMXSB: 64 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: 0 + TailloopInNll: false + ThreadTile: [1, 1] + ThreadTile0: 40 + ThreadTile1: 4 + ThreadTileA: 40 + ThreadTileB: 4 + ThreadTileMXSA: 40 + ThreadTileMXSB: 4 + TransposeLDS: 1 + TransposeLDSMetadata: -1 + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMXSA: true + UnrollMajorLDSMXSB: true + Use64bShadowLimit: false + Use64bShadowLimitMX: false + UseCustomMainLoopSchedule: 0 + UseDirect32XEmulation: false + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: false + UseGeneralizedNLCOneA: false + UseGeneralizedNLCOneB: false + UseGeneralizedNLCOneMXSA: false + UseGeneralizedNLCOneMXSB: false + UseGeneralizedNLCOneMetadata: false + UseInstOffsetForGRO: 0 + UseMFMAF32XEmulation: false + UsePLRPack: 0 + UseSgprForGRO: -1 + UseSubtileImpl: true + Valid: true + VectorStore: -1 + VectorWidthA: 1 
+ VectorWidthB: 1 + VectorWidthMXSA: 1 + VectorWidthMXSB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMXSA: 0 + WaveSeparateGlobalReadMXSB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 16 + WorkGroupMappingXCC: 2 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 256 + _DepthUA: 256 + _DepthUB: 256 + _DepthUMXSA: 8 + _DepthUMXSB: 8 + _DepthUMetadata: 256 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + enableLDSTrMXSA: false + enableLDSTrMXSB: false + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + tailLoopOptMXSA: false + tailLoopOptMXSB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AdaptiveGemm: 0 + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1DivByMT1LowbitGT1: 0 + AssertFree1ElementMultiple: 1 + AssertKRingShiftTailWrapOnly: 0 + AssertSummationElementMultiple: 256 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BAddrInterleave: false + BaseName: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT28HX9AX04H5gx2Hnt9CFkwE4Io5BNr8Him7sxPkQ6FxSI= + BufferLoad: true + BufferStore: 1 + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 256 + DirectToLds: 1 + DirectToLdsA: true + DirectToLdsB: true + DirectToLdsMXSA: true + DirectToLdsMXSB: true + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprMXSA: false + DirectToVgprMXSB: false + DirectToVgprSparseMetadata: 
false + DtlPlusLdsBuf: 0 + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: false + ExpertSchedulingMode: 0 + ExtraLatencyForLR: 0 + ExtraMiLatencyLeft: -1 + ForceDisableShadowInit: false + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 32 + GlobalReadVectorWidthB: 32 + GlobalReadVectorWidthMXSA: 9 + GlobalReadVectorWidthMXSB: 4 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + KRingShift: false + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT288x256x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT18_4_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_16_1 + LDSTrInst: false + LSCA: 256 + LSCB: 256 + LSCMXSA: 8 + LSCMXSB: 8 + LSPA: 32 + LSPB: 32 + LSPMXSA: 288 + LSPMXSB: 128 + LVCA: 8 + LVCB: 8 + LVCMXSA: 1 + LVCMXSB: 2 + LVPA: 1 + LVPB: 1 + LVPMXSA: 32 + LVPMXSB: 32 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsBlockSizePerPadMXSA: 0 + LdsBlockSizePerPadMXSB: 0 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 147968 + LdsInitCVgprs: false + LdsNumBytes: 147968 + 
LdsNumElementsAlignedA: 36864 + LdsNumElementsAlignedB: 32768 + LdsNumElementsAlignedMXSA: 2304 + LdsNumElementsAlignedMXSB: 2048 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 73984 + LdsOffsetB: 41216 + LdsOffsetB_Blk: 115200 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMXSA: 36864 + LdsOffsetMXSA_Blk: 110848 + LdsOffsetMXSB: 39168 + LdsOffsetMXSB_Blk: 113152 + LdsOffsetMetadata: 41216 + LdsOffsetMetadata_Blk: 115200 + LdsPadA: 0 + LdsPadB: 0 + LdsPadMXSA: 0 + LdsPadMXSB: 0 + LdsPadMetadata: 0 + LocalReadVectorWidth: -1 + LocalReadVectorWidthA: 32 + LocalReadVectorWidthB: 32 + LocalReadVectorWidthMXSA: 1 + LocalReadVectorWidthMXSB: 1 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LocalWriteUseSgprMXSA: true + LocalWriteUseSgprMXSB: true + LoopIters: 2 + LoopUnroll: 256 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMXSA: 1 + MIInputPerThreadMXSB: 1 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [1, 4] + MIWaveTile: [18, 4] + MIWaveTileA: 18 + MIWaveTileB: 4 + MIWaveTileMXSA: 18 + MIWaveTileMXSB: 4 + MIWaveTileMetadata: 0 + MacroTile0: 288 + MacroTile1: 256 + MacroTileA: 288 + MacroTileB: 256 + MacroTileMXSA: 288 + MacroTileMXSB: 256 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + MinGRIncPerMfma: 3 + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonDTLTailLoopMXSA: true + NonDTLTailLoopMXSB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 4 + NonTemporalC: 0 + NonTemporalD: 4 + 
NonTemporalE: 0 + NonTemporalMXSA: 0 + NonTemporalMXSB: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 288 + NumGlobalWriteVectorsPerThread: 288 + NumLdsBlk: 2 + NumLoadsA: 9 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsCoalescedMXSA: 1 + NumLoadsCoalescedMXSB: 1 + NumLoadsMXSA: 1 + NumLoadsMXSB: 2 + NumLoadsPerpendicularA: 9 + NumLoadsPerpendicularB: 8 + NumLoadsPerpendicularMXSA: 1 + NumLoadsPerpendicularMXSB: 2 + NumThreads: 256 + NumTotalPackedLoadsA: -1 + NumTotalPackedLoadsB: -1 + NumTotalPackedLoadsMXSA: -1 + NumTotalPackedLoadsMXSB: -1 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGROverBarrier: 0 + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 48 + SolutionNameMin: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT288x256x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT18_4_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM16_WGMXCC2_WGMXCCGn1 + SourceSwap: false + SpaceFillingAlgo: [] + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: true + StoreSyncOpt: 0 + StoreVectorWidth: 4 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 4 + SubGroup1: 64 + SubGroupA: 4 + SubGroupB: 
64 + SubGroupMXSA: 4 + SubGroupMXSB: 64 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: 0 + TailloopInNll: false + ThreadTile: [1, 1] + ThreadTile0: 72 + ThreadTile1: 4 + ThreadTileA: 72 + ThreadTileB: 4 + ThreadTileMXSA: 72 + ThreadTileMXSB: 4 + TransposeLDS: 1 + TransposeLDSMetadata: -1 + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMXSA: true + UnrollMajorLDSMXSB: true + Use64bShadowLimit: false + Use64bShadowLimitMX: false + UseCustomMainLoopSchedule: 0 + UseDirect32XEmulation: false + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: false + UseGeneralizedNLCOneA: false + UseGeneralizedNLCOneB: false + UseGeneralizedNLCOneMXSA: false + UseGeneralizedNLCOneMXSB: false + UseGeneralizedNLCOneMetadata: false + UseInstOffsetForGRO: 0 + UseMFMAF32XEmulation: false + UsePLRPack: 0 + UseSgprForGRO: -1 + UseSubtileImpl: true + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 1 + VectorWidthMXSA: 1 + VectorWidthMXSB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMXSA: 0 + WaveSeparateGlobalReadMXSB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 16 + WorkGroupMappingXCC: 2 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 256 + _DepthUA: 256 + _DepthUB: 256 + _DepthUMXSA: 8 + _DepthUMXSB: 8 + _DepthUMetadata: 256 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + enableLDSTrMXSA: false + enableLDSTrMXSB: false + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + tailLoopOptMXSA: false + tailLoopOptMXSB: false + - 1LDSBuffer: 0 + 
ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AdaptiveGemm: 0 + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1DivByMT1LowbitGT1: 0 + AssertFree1ElementMultiple: 1 + AssertKRingShiftTailWrapOnly: 0 + AssertSummationElementMultiple: 256 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BAddrInterleave: false + BaseName: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT323cjBEjqTj-Mu9ZF1RsqJ73WqkJOeA3kpk9e2rgOyJEQ= + BufferLoad: true + BufferStore: 1 + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 256 + DirectToLds: 1 + DirectToLdsA: true + DirectToLdsB: true + DirectToLdsMXSA: true + DirectToLdsMXSB: true + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprMXSA: false + DirectToVgprMXSB: false + DirectToVgprSparseMetadata: false + DtlPlusLdsBuf: 0 + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: false + ExpertSchedulingMode: 0 + ExtraLatencyForLR: 0 + ExtraMiLatencyLeft: -1 + ForceDisableShadowInit: false + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 32 + GlobalReadVectorWidthB: 32 + GlobalReadVectorWidthMXSA: 1 + GlobalReadVectorWidthMXSB: 12 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + KRingShift: false + Kernel: true + KernelLanguage: Assembly + KernelNameMin: 
Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT32x384x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT2_6_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_16_1 + LDSTrInst: false + LSCA: 256 + LSCB: 256 + LSCMXSA: 8 + LSCMXSB: 8 + LSPA: 32 + LSPB: 32 + LSPMXSA: 32 + LSPMXSB: 384 + LVCA: 8 + LVCB: 8 + LVCMXSA: 8 + LVCMXSB: 1 + LVPA: 1 + LVPB: 1 + LVPMXSA: 32 + LVPMXSB: 32 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsBlockSizePerPadMXSA: 0 + LdsBlockSizePerPadMXSB: 0 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 122112 + LdsInitCVgprs: false + LdsNumBytes: 122112 + LdsNumElementsAlignedA: 4096 + LdsNumElementsAlignedB: 49152 + LdsNumElementsAlignedMXSA: 256 + LdsNumElementsAlignedMXSB: 3072 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 7424 + LdsOffsetB_Blk: 72960 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMXSA: 4096 + LdsOffsetMXSA_Blk: 69632 + LdsOffsetMXSB: 4352 + LdsOffsetMXSB_Blk: 69888 + LdsOffsetMetadata: 7424 + LdsOffsetMetadata_Blk: 72960 + LdsPadA: 0 + LdsPadB: 0 + LdsPadMXSA: 0 + LdsPadMXSB: 0 + LdsPadMetadata: 0 + LocalReadVectorWidth: -1 + LocalReadVectorWidthA: 32 + LocalReadVectorWidthB: 32 + LocalReadVectorWidthMXSA: 1 + LocalReadVectorWidthMXSB: 1 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LocalWriteUseSgprMXSA: true + LocalWriteUseSgprMXSB: true + LoopIters: 2 + LoopUnroll: 256 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + 
MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMXSA: 1 + MIInputPerThreadMXSB: 1 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [1, 4] + MIWaveTile: [2, 6] + MIWaveTileA: 2 + MIWaveTileB: 6 + MIWaveTileMXSA: 2 + MIWaveTileMXSB: 6 + MIWaveTileMetadata: 0 + MacroTile0: 32 + MacroTile1: 384 + MacroTileA: 32 + MacroTileB: 384 + MacroTileMXSA: 32 + MacroTileMXSB: 384 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + MinGRIncPerMfma: 3 + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonDTLTailLoopMXSA: true + NonDTLTailLoopMXSB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 4 + NonTemporalC: 0 + NonTemporalD: 4 + NonTemporalE: 0 + NonTemporalMXSA: 0 + NonTemporalMXSB: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 48 + NumGlobalWriteVectorsPerThread: 48 + NumLdsBlk: 2 + NumLoadsA: 1 + NumLoadsB: 12 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsCoalescedMXSA: 1 + NumLoadsCoalescedMXSB: 1 + NumLoadsMXSA: 1 + NumLoadsMXSB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 12 + NumLoadsPerpendicularMXSA: 1 + NumLoadsPerpendicularMXSB: 1 + NumThreads: 256 + NumTotalPackedLoadsA: -1 + NumTotalPackedLoadsB: -1 + NumTotalPackedLoadsMXSA: -1 + NumTotalPackedLoadsMXSB: -1 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGROverBarrier: 0 + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 49 + SolutionNameMin: 
Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT32x384x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT2_6_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM16_WGMXCC2_WGMXCCGn1 + SourceSwap: false + SpaceFillingAlgo: [] + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 4 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 4 + SubGroup1: 64 + SubGroupA: 4 + SubGroupB: 64 + SubGroupMXSA: 4 + SubGroupMXSB: 64 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: 0 + TailloopInNll: false + ThreadTile: [1, 1] + ThreadTile0: 8 + ThreadTile1: 6 + ThreadTileA: 8 + ThreadTileB: 6 + ThreadTileMXSA: 8 + ThreadTileMXSB: 6 + TransposeLDS: 1 + TransposeLDSMetadata: -1 + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMXSA: true + UnrollMajorLDSMXSB: true + Use64bShadowLimit: false + Use64bShadowLimitMX: false + UseCustomMainLoopSchedule: 0 + UseDirect32XEmulation: false + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: false + UseGeneralizedNLCOneA: false + UseGeneralizedNLCOneB: false + UseGeneralizedNLCOneMXSA: false + UseGeneralizedNLCOneMXSB: false + UseGeneralizedNLCOneMetadata: false + UseInstOffsetForGRO: 0 + UseMFMAF32XEmulation: false + UsePLRPack: 0 + UseSgprForGRO: -1 + UseSubtileImpl: true + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 1 + 
VectorWidthMXSA: 1 + VectorWidthMXSB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMXSA: 0 + WaveSeparateGlobalReadMXSB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 16 + WorkGroupMappingXCC: 2 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 256 + _DepthUA: 256 + _DepthUB: 256 + _DepthUMXSA: 8 + _DepthUMXSB: 8 + _DepthUMetadata: 256 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + enableLDSTrMXSA: false + enableLDSTrMXSB: false + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + tailLoopOptMXSA: false + tailLoopOptMXSB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AdaptiveGemm: 0 + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1DivByMT1LowbitGT1: 0 + AssertFree1ElementMultiple: 1 + AssertKRingShiftTailWrapOnly: 0 + AssertSummationElementMultiple: 256 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BAddrInterleave: false + BaseName: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT32k1L8EqrYflL4kvotLP49gssYPXLs5aDdnaskyCeVRw4= + BufferLoad: true + BufferStore: 1 + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 256 + DirectToLds: 1 + DirectToLdsA: true + DirectToLdsB: true + DirectToLdsMXSA: true + DirectToLdsMXSB: true + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprMXSA: false + DirectToVgprMXSB: false + DirectToVgprSparseMetadata: false + DtlPlusLdsBuf: 0 
+ EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: false + ExpertSchedulingMode: 0 + ExtraLatencyForLR: 0 + ExtraMiLatencyLeft: -1 + ForceDisableShadowInit: false + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 32 + GlobalReadVectorWidthB: 32 + GlobalReadVectorWidthMXSA: 1 + GlobalReadVectorWidthMXSB: 16 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + KRingShift: false + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT32x512x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT2_8_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_16_1 + LDSTrInst: false + LSCA: 256 + LSCB: 256 + LSCMXSA: 8 + LSCMXSB: 8 + LSPA: 32 + LSPB: 32 + LSPMXSA: 32 + LSPMXSB: 512 + LVCA: 8 + LVCB: 8 + LVCMXSA: 8 + LVCMXSB: 1 + LVPA: 1 + LVPB: 1 + LVPMXSA: 32 + LVPMXSB: 32 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsBlockSizePerPadMXSA: 0 + LdsBlockSizePerPadMXSB: 0 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 147968 + LdsInitCVgprs: false + LdsNumBytes: 147968 + LdsNumElementsAlignedA: 4096 + 
LdsNumElementsAlignedB: 65536 + LdsNumElementsAlignedMXSA: 256 + LdsNumElementsAlignedMXSB: 4096 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 73984 + LdsOffsetB: 8448 + LdsOffsetB_Blk: 82432 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMXSA: 4096 + LdsOffsetMXSA_Blk: 78080 + LdsOffsetMXSB: 4352 + LdsOffsetMXSB_Blk: 78336 + LdsOffsetMetadata: 8448 + LdsOffsetMetadata_Blk: 82432 + LdsPadA: 0 + LdsPadB: 0 + LdsPadMXSA: 0 + LdsPadMXSB: 0 + LdsPadMetadata: 0 + LocalReadVectorWidth: -1 + LocalReadVectorWidthA: 32 + LocalReadVectorWidthB: 32 + LocalReadVectorWidthMXSA: 1 + LocalReadVectorWidthMXSB: 1 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LocalWriteUseSgprMXSA: true + LocalWriteUseSgprMXSB: true + LoopIters: 2 + LoopUnroll: 256 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMXSA: 1 + MIInputPerThreadMXSB: 1 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [1, 4] + MIWaveTile: [2, 8] + MIWaveTileA: 2 + MIWaveTileB: 8 + MIWaveTileMXSA: 2 + MIWaveTileMXSB: 8 + MIWaveTileMetadata: 0 + MacroTile0: 32 + MacroTile1: 512 + MacroTileA: 32 + MacroTileB: 512 + MacroTileMXSA: 32 + MacroTileMXSB: 512 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + MinGRIncPerMfma: 3 + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonDTLTailLoopMXSA: true + NonDTLTailLoopMXSB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 4 + NonTemporalC: 0 + NonTemporalD: 4 + NonTemporalE: 0 + NonTemporalMXSA: 0 + 
NonTemporalMXSB: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 64 + NumLdsBlk: 2 + NumLoadsA: 1 + NumLoadsB: 16 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsCoalescedMXSA: 1 + NumLoadsCoalescedMXSB: 1 + NumLoadsMXSA: 1 + NumLoadsMXSB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 16 + NumLoadsPerpendicularMXSA: 1 + NumLoadsPerpendicularMXSB: 1 + NumThreads: 256 + NumTotalPackedLoadsA: -1 + NumTotalPackedLoadsB: -1 + NumTotalPackedLoadsMXSA: -1 + NumTotalPackedLoadsMXSB: -1 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGROverBarrier: 0 + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 50 + SolutionNameMin: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT32x512x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT2_8_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM16_WGMXCC2_WGMXCCGn1 + SourceSwap: false + SpaceFillingAlgo: [] + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: true + StoreSyncOpt: 0 + StoreVectorWidth: 4 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 4 + SubGroup1: 64 + SubGroupA: 4 + SubGroupB: 64 + SubGroupMXSA: 4 + SubGroupMXSB: 64 
+ SuppressNoLoadLoop: false + SwapGlobalReadOrder: 0 + TailloopInNll: false + ThreadTile: [1, 1] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + ThreadTileMXSA: 8 + ThreadTileMXSB: 8 + TransposeLDS: 1 + TransposeLDSMetadata: -1 + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMXSA: true + UnrollMajorLDSMXSB: true + Use64bShadowLimit: false + Use64bShadowLimitMX: false + UseCustomMainLoopSchedule: 0 + UseDirect32XEmulation: false + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: false + UseGeneralizedNLCOneA: false + UseGeneralizedNLCOneB: false + UseGeneralizedNLCOneMXSA: false + UseGeneralizedNLCOneMXSB: false + UseGeneralizedNLCOneMetadata: false + UseInstOffsetForGRO: 0 + UseMFMAF32XEmulation: false + UsePLRPack: 0 + UseSgprForGRO: -1 + UseSubtileImpl: true + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 1 + VectorWidthMXSA: 1 + VectorWidthMXSB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMXSA: 0 + WaveSeparateGlobalReadMXSB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 16 + WorkGroupMappingXCC: 2 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 256 + _DepthUA: 256 + _DepthUB: 256 + _DepthUMXSA: 8 + _DepthUMXSB: 8 + _DepthUMetadata: 256 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + enableLDSTrMXSA: false + enableLDSTrMXSB: false + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + tailLoopOptMXSA: false + tailLoopOptMXSB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: false 
+ ActivationFused: true + AdaptiveGemm: 0 + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1DivByMT1LowbitGT1: 0 + AssertFree1ElementMultiple: 1 + AssertKRingShiftTailWrapOnly: 0 + AssertSummationElementMultiple: 256 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BAddrInterleave: false + BaseName: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT9680-vsketBKOwATFxdutsK5i_NcnvLOonACpwOS25BUY= + BufferLoad: true + BufferStore: 1 + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 256 + DirectToLds: 1 + DirectToLdsA: true + DirectToLdsB: true + DirectToLdsMXSA: true + DirectToLdsMXSB: true + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprMXSA: false + DirectToVgprMXSB: false + DirectToVgprSparseMetadata: false + DtlPlusLdsBuf: 0 + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: false + ExpertSchedulingMode: 0 + ExtraLatencyForLR: 0 + ExtraMiLatencyLeft: -1 + ForceDisableShadowInit: false + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 32 + GlobalReadVectorWidthB: 32 + GlobalReadVectorWidthMXSA: 3 + GlobalReadVectorWidthMXSB: 12 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + KRingShift: false + Kernel: true + KernelLanguage: Assembly + KernelNameMin: 
Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT96x384x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT6_6_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_16_1 + LDSTrInst: false + LSCA: 256 + LSCB: 256 + LSCMXSA: 8 + LSCMXSB: 8 + LSPA: 32 + LSPB: 32 + LSPMXSA: 96 + LSPMXSB: 384 + LVCA: 8 + LVCB: 8 + LVCMXSA: 3 + LVCMXSB: 1 + LVPA: 1 + LVPB: 1 + LVPMXSA: 32 + LVPMXSB: 32 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsBlockSizePerPadMXSA: 0 + LdsBlockSizePerPadMXSB: 0 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 130816 + LdsInitCVgprs: false + LdsNumBytes: 130816 + LdsNumElementsAlignedA: 12288 + LdsNumElementsAlignedB: 49152 + LdsNumElementsAlignedMXSA: 768 + LdsNumElementsAlignedMXSB: 3072 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 16128 + LdsOffsetB_Blk: 81664 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMXSA: 12288 + LdsOffsetMXSA_Blk: 77824 + LdsOffsetMXSB: 13056 + LdsOffsetMXSB_Blk: 78592 + LdsOffsetMetadata: 16128 + LdsOffsetMetadata_Blk: 81664 + LdsPadA: 0 + LdsPadB: 0 + LdsPadMXSA: 0 + LdsPadMXSB: 0 + LdsPadMetadata: 0 + LocalReadVectorWidth: -1 + LocalReadVectorWidthA: 32 + LocalReadVectorWidthB: 32 + LocalReadVectorWidthMXSA: 1 + LocalReadVectorWidthMXSB: 1 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LocalWriteUseSgprMXSA: true + LocalWriteUseSgprMXSB: true + LoopIters: 2 + LoopUnroll: 256 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + 
MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMXSA: 1 + MIInputPerThreadMXSB: 1 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [1, 4] + MIWaveTile: [6, 6] + MIWaveTileA: 6 + MIWaveTileB: 6 + MIWaveTileMXSA: 6 + MIWaveTileMXSB: 6 + MIWaveTileMetadata: 0 + MacroTile0: 96 + MacroTile1: 384 + MacroTileA: 96 + MacroTileB: 384 + MacroTileMXSA: 96 + MacroTileMXSB: 384 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + MinGRIncPerMfma: 3 + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonDTLTailLoopMXSA: true + NonDTLTailLoopMXSB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 4 + NonTemporalC: 0 + NonTemporalD: 4 + NonTemporalE: 0 + NonTemporalMXSA: 0 + NonTemporalMXSB: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 144 + NumGlobalWriteVectorsPerThread: 144 + NumLdsBlk: 2 + NumLoadsA: 3 + NumLoadsB: 12 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsCoalescedMXSA: 1 + NumLoadsCoalescedMXSB: 1 + NumLoadsMXSA: 1 + NumLoadsMXSB: 1 + NumLoadsPerpendicularA: 3 + NumLoadsPerpendicularB: 12 + NumLoadsPerpendicularMXSA: 1 + NumLoadsPerpendicularMXSB: 1 + NumThreads: 256 + NumTotalPackedLoadsA: -1 + NumTotalPackedLoadsB: -1 + NumTotalPackedLoadsMXSA: -1 + NumTotalPackedLoadsMXSB: -1 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGROverBarrier: 0 + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 51 + 
SolutionNameMin: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT96x384x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT6_6_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM16_WGMXCC2_WGMXCCGn1 + SourceSwap: false + SpaceFillingAlgo: [] + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 4 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 4 + SubGroup1: 64 + SubGroupA: 4 + SubGroupB: 64 + SubGroupMXSA: 4 + SubGroupMXSB: 64 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: 0 + TailloopInNll: false + ThreadTile: [1, 1] + ThreadTile0: 24 + ThreadTile1: 6 + ThreadTileA: 24 + ThreadTileB: 6 + ThreadTileMXSA: 24 + ThreadTileMXSB: 6 + TransposeLDS: 1 + TransposeLDSMetadata: -1 + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMXSA: true + UnrollMajorLDSMXSB: true + Use64bShadowLimit: false + Use64bShadowLimitMX: false + UseCustomMainLoopSchedule: 0 + UseDirect32XEmulation: false + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: false + UseGeneralizedNLCOneA: false + UseGeneralizedNLCOneB: false + UseGeneralizedNLCOneMXSA: false + UseGeneralizedNLCOneMXSB: false + UseGeneralizedNLCOneMetadata: false + UseInstOffsetForGRO: 0 + UseMFMAF32XEmulation: false + UsePLRPack: 0 + UseSgprForGRO: -1 + UseSubtileImpl: true + Valid: true + VectorStore: -1 + VectorWidthA: 1 + 
VectorWidthB: 1 + VectorWidthMXSA: 1 + VectorWidthMXSB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMXSA: 0 + WaveSeparateGlobalReadMXSB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 16 + WorkGroupMappingXCC: 2 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 256 + _DepthUA: 256 + _DepthUB: 256 + _DepthUMXSA: 8 + _DepthUMXSB: 8 + _DepthUMetadata: 256 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + enableLDSTrMXSA: false + enableLDSTrMXSB: false + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + tailLoopOptMXSA: false + tailLoopOptMXSB: false +- null +- null +- null +- null +- DeviceEfficiency +- Prediction diff --git a/projects/hipblaslt/library/src/amd_detail/rocblaslt/src/Tensile/Logic/asm_full/gfx950/gfx950/Origami/Origami_ntb4/gfx950_Cijk_Alik_Bljk_F4SS_MXA32_MXB32_BH_BiasS_HAS_SAB_SAV_UserArgs.yaml b/projects/hipblaslt/library/src/amd_detail/rocblaslt/src/Tensile/Logic/asm_full/gfx950/gfx950/Origami/Origami_ntb4/gfx950_Cijk_Alik_Bljk_F4SS_MXA32_MXB32_BH_BiasS_HAS_SAB_SAV_UserArgs.yaml new file mode 100644 index 00000000000..2d31944095d --- /dev/null +++ b/projects/hipblaslt/library/src/amd_detail/rocblaslt/src/Tensile/Logic/asm_full/gfx950/gfx950/Origami/Origami_ntb4/gfx950_Cijk_Alik_Bljk_F4SS_MXA32_MXB32_BH_BiasS_HAS_SAB_SAV_UserArgs.yaml @@ -0,0 +1,16950 @@ +- {MinimumRequiredVersion: 5.0.0} +- gfx950 +- gfx950 +- [Device 75a0] +- Activation: true + ActivationComputeDataType: 0 + ActivationNoGuard: false + ActivationType: hipblaslt_all + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + BetaOnlyUseBias: 
false + BiasDataTypeList: [0] + BiasSrc: D + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 21 + DataTypeA: 21 + DataTypeAmaxD: 0 + DataTypeB: 21 + DataTypeE: 0 + DataTypeMXSA: 22 + DataTypeMXSB: 22 + DestDataType: 0 + F32XdlMathOp: 0 + Gradient: false + GroupedGemm: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index01MXSA: 0 + Index01MXSB: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexAssignmentsMXSA: [3, 0, 2] + IndexAssignmentsMXSB: [3, 1, 2] + IndexAssignmentsMetadata: [3, 0, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndexUnrollM: 0 + IndexUnrollMXSA: 0 + IndexUnrollMXSB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MXBlockA: 32 + MXBlockB: 32 + MacDataTypeA: 21 + MacDataTypeB: 21 + MirrorDimsA: [] + MirrorDimsB: [] + MirrorDimsMXSA: [] + MirrorDimsMXSB: [] + MirrorDimsMetadata: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + OutputAmaxD: false + SetConstStrideA: [] + SetConstStrideB: [] + SetConstStrideBias: [] + SilentHighPrecisionAccumulate: false + Sparse: 0 + StochasticRounding: false + StridedBatched: true + SupportUserArgs: true + SwizzleTensorA: false + SwizzleTensorB: false + TLUA: false + TLUB: false + TLUMXSA: false + TLUMXSB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: 1 + TransposeB: 0 + UseBeta: true + UseBias: 1 + UseE: false + UseInitialStridesAB: false + UseInitialStridesCD: false + UseScaleAB: Scalar + UseScaleAlphaVec: 1 + UseScaleCD: false +- - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AdaptiveGemm: 0 + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1DivByMT1LowbitGT1: 0 + AssertFree1ElementMultiple: 1 + 
AssertKRingShiftTailWrapOnly: 0 + AssertSummationElementMultiple: 256 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BAddrInterleave: false + BaseName: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT64hIsnhFT2L5uw_T8ryWf3mri6PtKpwJ1E97MC3rNtxpw= + BufferLoad: true + BufferStore: 1 + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 256 + DirectToLds: 1 + DirectToLdsA: true + DirectToLdsB: true + DirectToLdsMXSA: true + DirectToLdsMXSB: true + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprMXSA: false + DirectToVgprMXSB: false + DirectToVgprSparseMetadata: false + DtlPlusLdsBuf: 0 + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: false + ExpertSchedulingMode: 0 + ExtraLatencyForLR: 0 + ExtraMiLatencyLeft: -1 + ForceDisableShadowInit: false + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 32 + GlobalReadVectorWidthB: 32 + GlobalReadVectorWidthMXSA: 2 + GlobalReadVectorWidthMXSB: 2 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + KRingShift: false + Kernel: true + KernelLanguage: Assembly + KernelNameMin: 
Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT64x64x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT2_2_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 256 + LSCB: 256 + LSCMXSA: 8 + LSCMXSB: 8 + LSPA: 32 + LSPB: 32 + LSPMXSA: 64 + LSPMXSB: 64 + LVCA: 8 + LVCB: 8 + LVCMXSA: 4 + LVCMXSB: 4 + LVPA: 1 + LVPB: 1 + LVPMXSA: 32 + LVPMXSB: 32 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsBlockSizePerPadMXSA: 0 + LdsBlockSizePerPadMXSB: 0 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 50176 + LdsInitCVgprs: false + LdsNumBytes: 50176 + LdsNumElementsAlignedA: 8192 + LdsNumElementsAlignedB: 8192 + LdsNumElementsAlignedMXSA: 512 + LdsNumElementsAlignedMXSB: 512 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 32768 + LdsOffsetB: 9216 + LdsOffsetB_Blk: 41984 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMXSA: 8192 + LdsOffsetMXSA_Blk: 40960 + LdsOffsetMXSB: 8704 + LdsOffsetMXSB_Blk: 41472 + LdsOffsetMetadata: 9216 + LdsOffsetMetadata_Blk: 41984 + LdsPadA: 0 + LdsPadB: 0 + LdsPadMXSA: 0 + LdsPadMXSB: 0 + LdsPadMetadata: 0 + LocalReadVectorWidth: -1 + LocalReadVectorWidthA: 32 + LocalReadVectorWidthB: 32 + LocalReadVectorWidthMXSA: 1 + LocalReadVectorWidthMXSB: 1 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LocalWriteUseSgprMXSA: true + LocalWriteUseSgprMXSB: true + LoopIters: 2 + LoopUnroll: 256 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 32 + 
MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMXSA: 1 + MIInputPerThreadMXSB: 1 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [2, 2] + MIWaveTileA: 2 + MIWaveTileB: 2 + MIWaveTileMXSA: 2 + MIWaveTileMXSB: 2 + MIWaveTileMetadata: 0 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileMXSA: 64 + MacroTileMXSB: 64 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + MinGRIncPerMfma: 3 + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonDTLTailLoopMXSA: true + NonDTLTailLoopMXSB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 4 + NonTemporalC: 0 + NonTemporalD: 4 + NonTemporalE: 0 + NonTemporalMXSA: 0 + NonTemporalMXSB: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 16 + NumLdsBlk: 2 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsCoalescedMXSA: 1 + NumLoadsCoalescedMXSB: 1 + NumLoadsMXSA: 1 + NumLoadsMXSB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularMXSA: 1 + NumLoadsPerpendicularMXSB: 1 + NumThreads: 256 + NumTotalPackedLoadsA: -1 + NumTotalPackedLoadsB: -1 + NumTotalPackedLoadsMXSA: -1 + NumTotalPackedLoadsMXSB: -1 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGROverBarrier: 0 + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 0 + SolutionNameMin: 
Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT64x64x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT2_2_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM16_WGMXCC2_WGMXCCGn1 + SourceSwap: false + SpaceFillingAlgo: [] + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 4 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SubGroupMXSA: 8 + SubGroupMXSB: 32 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: 0 + TailloopInNll: false + ThreadTile: [1, 1] + ThreadTile0: 8 + ThreadTile1: 2 + ThreadTileA: 8 + ThreadTileB: 2 + ThreadTileMXSA: 8 + ThreadTileMXSB: 2 + TransposeLDS: 1 + TransposeLDSMetadata: -1 + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMXSA: true + UnrollMajorLDSMXSB: true + Use64bShadowLimit: false + Use64bShadowLimitMX: false + UseCustomMainLoopSchedule: 0 + UseDirect32XEmulation: false + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: false + UseGeneralizedNLCOneA: false + UseGeneralizedNLCOneB: false + UseGeneralizedNLCOneMXSA: false + UseGeneralizedNLCOneMXSB: false + UseGeneralizedNLCOneMetadata: false + UseInstOffsetForGRO: 0 + UseMFMAF32XEmulation: false + UsePLRPack: 0 + UseSgprForGRO: -1 + UseSubtileImpl: true + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 1 + 
VectorWidthMXSA: 1 + VectorWidthMXSB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMXSA: 0 + WaveSeparateGlobalReadMXSB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 16 + WorkGroupMappingXCC: 2 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 256 + _DepthUA: 256 + _DepthUB: 256 + _DepthUMXSA: 8 + _DepthUMXSB: 8 + _DepthUMetadata: 256 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + enableLDSTrMXSA: false + enableLDSTrMXSB: false + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + tailLoopOptMXSA: false + tailLoopOptMXSB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AdaptiveGemm: 0 + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1DivByMT1LowbitGT1: 0 + AssertFree1ElementMultiple: 1 + AssertKRingShiftTailWrapOnly: 0 + AssertSummationElementMultiple: 256 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BAddrInterleave: false + BaseName: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT128AMjoHHGnVSJ1fjvuIJeKPVtejro7lpRL6TixtWnDac= + BufferLoad: true + BufferStore: 1 + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 256 + DirectToLds: 1 + DirectToLdsA: true + DirectToLdsB: true + DirectToLdsMXSA: true + DirectToLdsMXSB: true + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprMXSA: false + DirectToVgprMXSB: false + DirectToVgprSparseMetadata: false + DtlPlusLdsBuf: 0 
+ EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: false + ExpertSchedulingMode: 0 + ExtraLatencyForLR: 0 + ExtraMiLatencyLeft: -1 + ForceDisableShadowInit: false + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 32 + GlobalReadVectorWidthB: 32 + GlobalReadVectorWidthMXSA: 4 + GlobalReadVectorWidthMXSB: 4 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + KRingShift: false + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT128x128x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT4_4_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 256 + LSCB: 256 + LSCMXSA: 8 + LSCMXSB: 8 + LSPA: 32 + LSPB: 32 + LSPMXSA: 128 + LSPMXSB: 128 + LVCA: 8 + LVCB: 8 + LVCMXSA: 2 + LVCMXSB: 2 + LVPA: 1 + LVPB: 1 + LVPMXSA: 32 + LVPMXSB: 32 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsBlockSizePerPadMXSA: 0 + LdsBlockSizePerPadMXSB: 0 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 100352 + LdsInitCVgprs: false + LdsNumBytes: 100352 + LdsNumElementsAlignedA: 16384 + 
LdsNumElementsAlignedB: 16384 + LdsNumElementsAlignedMXSA: 1024 + LdsNumElementsAlignedMXSB: 1024 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 18432 + LdsOffsetB_Blk: 83968 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMXSA: 16384 + LdsOffsetMXSA_Blk: 81920 + LdsOffsetMXSB: 17408 + LdsOffsetMXSB_Blk: 82944 + LdsOffsetMetadata: 18432 + LdsOffsetMetadata_Blk: 83968 + LdsPadA: 0 + LdsPadB: 0 + LdsPadMXSA: 0 + LdsPadMXSB: 0 + LdsPadMetadata: 0 + LocalReadVectorWidth: -1 + LocalReadVectorWidthA: 32 + LocalReadVectorWidthB: 32 + LocalReadVectorWidthMXSA: 1 + LocalReadVectorWidthMXSB: 1 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LocalWriteUseSgprMXSA: true + LocalWriteUseSgprMXSB: true + LoopIters: 2 + LoopUnroll: 256 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMXSA: 1 + MIInputPerThreadMXSB: 1 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MIWaveTileMXSA: 4 + MIWaveTileMXSB: 4 + MIWaveTileMetadata: 0 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileMXSA: 128 + MacroTileMXSB: 128 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + MinGRIncPerMfma: 3 + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonDTLTailLoopMXSA: true + NonDTLTailLoopMXSB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 4 + NonTemporalC: 0 + NonTemporalD: 4 + NonTemporalE: 0 + NonTemporalMXSA: 0 + 
NonTemporalMXSB: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 64 + NumLdsBlk: 2 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsCoalescedMXSA: 1 + NumLoadsCoalescedMXSB: 1 + NumLoadsMXSA: 1 + NumLoadsMXSB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumLoadsPerpendicularMXSA: 1 + NumLoadsPerpendicularMXSB: 1 + NumThreads: 256 + NumTotalPackedLoadsA: -1 + NumTotalPackedLoadsB: -1 + NumTotalPackedLoadsMXSA: -1 + NumTotalPackedLoadsMXSB: -1 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGROverBarrier: 0 + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 1 + SolutionNameMin: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT128x128x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT4_4_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM16_WGMXCC2_WGMXCCGn1 + SourceSwap: false + SpaceFillingAlgo: [] + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 4 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SubGroupMXSA: 8 + SubGroupMXSB: 32 + 
SuppressNoLoadLoop: false + SwapGlobalReadOrder: 0 + TailloopInNll: false + ThreadTile: [1, 1] + ThreadTile0: 16 + ThreadTile1: 4 + ThreadTileA: 16 + ThreadTileB: 4 + ThreadTileMXSA: 16 + ThreadTileMXSB: 4 + TransposeLDS: 1 + TransposeLDSMetadata: -1 + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMXSA: true + UnrollMajorLDSMXSB: true + Use64bShadowLimit: false + Use64bShadowLimitMX: false + UseCustomMainLoopSchedule: 0 + UseDirect32XEmulation: false + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: false + UseGeneralizedNLCOneA: false + UseGeneralizedNLCOneB: false + UseGeneralizedNLCOneMXSA: false + UseGeneralizedNLCOneMXSB: false + UseGeneralizedNLCOneMetadata: false + UseInstOffsetForGRO: 0 + UseMFMAF32XEmulation: false + UsePLRPack: 0 + UseSgprForGRO: -1 + UseSubtileImpl: true + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 1 + VectorWidthMXSA: 1 + VectorWidthMXSB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMXSA: 0 + WaveSeparateGlobalReadMXSB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 16 + WorkGroupMappingXCC: 2 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 256 + _DepthUA: 256 + _DepthUB: 256 + _DepthUMXSA: 8 + _DepthUMXSB: 8 + _DepthUMetadata: 256 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + enableLDSTrMXSA: false + enableLDSTrMXSB: false + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + tailLoopOptMXSA: false + tailLoopOptMXSB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: false 
+ ActivationFused: true + AdaptiveGemm: 0 + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1DivByMT1LowbitGT1: 0 + AssertFree1ElementMultiple: 1 + AssertKRingShiftTailWrapOnly: 0 + AssertSummationElementMultiple: 256 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BAddrInterleave: false + BaseName: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT19tVFE7Mochg2Q71HzQS4tIGctt0Td3p2SmmxHjjWSYTc= + BufferLoad: true + BufferStore: 1 + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 256 + DirectToLds: 1 + DirectToLdsA: true + DirectToLdsB: true + DirectToLdsMXSA: true + DirectToLdsMXSB: true + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprMXSA: false + DirectToVgprMXSB: false + DirectToVgprSparseMetadata: false + DtlPlusLdsBuf: 0 + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: false + ExpertSchedulingMode: 0 + ExtraLatencyForLR: 0 + ExtraMiLatencyLeft: -1 + ForceDisableShadowInit: false + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 32 + GlobalReadVectorWidthB: 32 + GlobalReadVectorWidthMXSA: 6 + GlobalReadVectorWidthMXSB: 6 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + KRingShift: false + Kernel: true + KernelLanguage: Assembly + KernelNameMin: 
Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT192x192x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT6_6_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 256 + LSCB: 256 + LSCMXSA: 8 + LSCMXSB: 8 + LSPA: 32 + LSPB: 32 + LSPMXSA: 192 + LSPMXSB: 192 + LVCA: 8 + LVCB: 8 + LVCMXSA: 2 + LVCMXSB: 2 + LVPA: 1 + LVPB: 1 + LVPMXSA: 32 + LVPMXSB: 32 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsBlockSizePerPadMXSA: 0 + LdsBlockSizePerPadMXSB: 0 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 117760 + LdsInitCVgprs: false + LdsNumBytes: 117760 + LdsNumElementsAlignedA: 24576 + LdsNumElementsAlignedB: 24576 + LdsNumElementsAlignedMXSA: 1536 + LdsNumElementsAlignedMXSB: 1536 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 27648 + LdsOffsetB_Blk: 93184 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMXSA: 24576 + LdsOffsetMXSA_Blk: 90112 + LdsOffsetMXSB: 26112 + LdsOffsetMXSB_Blk: 91648 + LdsOffsetMetadata: 27648 + LdsOffsetMetadata_Blk: 93184 + LdsPadA: 0 + LdsPadB: 0 + LdsPadMXSA: 0 + LdsPadMXSB: 0 + LdsPadMetadata: 0 + LocalReadVectorWidth: -1 + LocalReadVectorWidthA: 32 + LocalReadVectorWidthB: 32 + LocalReadVectorWidthMXSA: 1 + LocalReadVectorWidthMXSB: 1 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LocalWriteUseSgprMXSA: true + LocalWriteUseSgprMXSB: true + LoopIters: 2 + LoopUnroll: 256 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + 
MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMXSA: 1 + MIInputPerThreadMXSB: 1 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [6, 6] + MIWaveTileA: 6 + MIWaveTileB: 6 + MIWaveTileMXSA: 6 + MIWaveTileMXSB: 6 + MIWaveTileMetadata: 0 + MacroTile0: 192 + MacroTile1: 192 + MacroTileA: 192 + MacroTileB: 192 + MacroTileMXSA: 192 + MacroTileMXSB: 192 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + MinGRIncPerMfma: 3 + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonDTLTailLoopMXSA: true + NonDTLTailLoopMXSB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 4 + NonTemporalC: 0 + NonTemporalD: 4 + NonTemporalE: 0 + NonTemporalMXSA: 0 + NonTemporalMXSB: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 144 + NumGlobalWriteVectorsPerThread: 144 + NumLdsBlk: 2 + NumLoadsA: 6 + NumLoadsB: 6 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsCoalescedMXSA: 1 + NumLoadsCoalescedMXSB: 1 + NumLoadsMXSA: 1 + NumLoadsMXSB: 1 + NumLoadsPerpendicularA: 6 + NumLoadsPerpendicularB: 6 + NumLoadsPerpendicularMXSA: 1 + NumLoadsPerpendicularMXSB: 1 + NumThreads: 256 + NumTotalPackedLoadsA: -1 + NumTotalPackedLoadsB: -1 + NumTotalPackedLoadsMXSA: -1 + NumTotalPackedLoadsMXSB: -1 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGROverBarrier: 0 + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 2 + 
SolutionNameMin: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT192x192x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT6_6_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM16_WGMXCC2_WGMXCCGn1 + SourceSwap: false + SpaceFillingAlgo: [] + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 4 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SubGroupMXSA: 8 + SubGroupMXSB: 32 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: 0 + TailloopInNll: false + ThreadTile: [1, 1] + ThreadTile0: 24 + ThreadTile1: 6 + ThreadTileA: 24 + ThreadTileB: 6 + ThreadTileMXSA: 24 + ThreadTileMXSB: 6 + TransposeLDS: 1 + TransposeLDSMetadata: -1 + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMXSA: true + UnrollMajorLDSMXSB: true + Use64bShadowLimit: false + Use64bShadowLimitMX: false + UseCustomMainLoopSchedule: 0 + UseDirect32XEmulation: false + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: false + UseGeneralizedNLCOneA: false + UseGeneralizedNLCOneB: false + UseGeneralizedNLCOneMXSA: false + UseGeneralizedNLCOneMXSB: false + UseGeneralizedNLCOneMetadata: false + UseInstOffsetForGRO: 0 + UseMFMAF32XEmulation: false + UsePLRPack: 0 + UseSgprForGRO: -1 + UseSubtileImpl: true + Valid: true + VectorStore: -1 + VectorWidthA: 1 + 
VectorWidthB: 1 + VectorWidthMXSA: 1 + VectorWidthMXSB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMXSA: 0 + WaveSeparateGlobalReadMXSB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 16 + WorkGroupMappingXCC: 2 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 256 + _DepthUA: 256 + _DepthUB: 256 + _DepthUMXSA: 8 + _DepthUMXSB: 8 + _DepthUMetadata: 256 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + enableLDSTrMXSA: false + enableLDSTrMXSB: false + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + tailLoopOptMXSA: false + tailLoopOptMXSB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AdaptiveGemm: 0 + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1DivByMT1LowbitGT1: 0 + AssertFree1ElementMultiple: 1 + AssertKRingShiftTailWrapOnly: 0 + AssertSummationElementMultiple: 256 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BAddrInterleave: false + BaseName: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT25YgPR4XPaMfFjkFVdEJNxrCtJ2m9IQRE3ggDN5hyNKfg= + BufferLoad: true + BufferStore: 1 + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 256 + DirectToLds: 1 + DirectToLdsA: true + DirectToLdsB: true + DirectToLdsMXSA: true + DirectToLdsMXSB: true + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprMXSA: false + DirectToVgprMXSB: false + DirectToVgprSparseMetadata: false + 
DtlPlusLdsBuf: 0 + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: false + ExpertSchedulingMode: 0 + ExtraLatencyForLR: 0 + ExtraMiLatencyLeft: -1 + ForceDisableShadowInit: false + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 32 + GlobalReadVectorWidthB: 32 + GlobalReadVectorWidthMXSA: 4 + GlobalReadVectorWidthMXSB: 4 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + KRingShift: false + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT256x256x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT8_8_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 256 + LSCB: 256 + LSCMXSA: 8 + LSCMXSB: 8 + LSPA: 32 + LSPB: 32 + LSPMXSA: 128 + LSPMXSB: 128 + LVCA: 8 + LVCB: 8 + LVCMXSA: 2 + LVCMXSB: 2 + LVPA: 1 + LVPB: 1 + LVPMXSA: 32 + LVPMXSB: 32 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsBlockSizePerPadMXSA: 0 + LdsBlockSizePerPadMXSB: 0 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 139264 + LdsInitCVgprs: false + LdsNumBytes: 139264 + 
LdsNumElementsAlignedA: 32768 + LdsNumElementsAlignedB: 32768 + LdsNumElementsAlignedMXSA: 2048 + LdsNumElementsAlignedMXSB: 2048 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 69632 + LdsOffsetB: 36864 + LdsOffsetB_Blk: 106496 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMXSA: 32768 + LdsOffsetMXSA_Blk: 102400 + LdsOffsetMXSB: 34816 + LdsOffsetMXSB_Blk: 104448 + LdsOffsetMetadata: 36864 + LdsOffsetMetadata_Blk: 106496 + LdsPadA: 0 + LdsPadB: 0 + LdsPadMXSA: 0 + LdsPadMXSB: 0 + LdsPadMetadata: 0 + LocalReadVectorWidth: -1 + LocalReadVectorWidthA: 32 + LocalReadVectorWidthB: 32 + LocalReadVectorWidthMXSA: 1 + LocalReadVectorWidthMXSB: 1 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LocalWriteUseSgprMXSA: true + LocalWriteUseSgprMXSB: true + LoopIters: 2 + LoopUnroll: 256 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMXSA: 1 + MIInputPerThreadMXSB: 1 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [8, 8] + MIWaveTileA: 8 + MIWaveTileB: 8 + MIWaveTileMXSA: 8 + MIWaveTileMXSB: 8 + MIWaveTileMetadata: 0 + MacroTile0: 256 + MacroTile1: 256 + MacroTileA: 256 + MacroTileB: 256 + MacroTileMXSA: 256 + MacroTileMXSB: 256 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + MinGRIncPerMfma: 3 + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonDTLTailLoopMXSA: true + NonDTLTailLoopMXSB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 4 + NonTemporalC: 0 + NonTemporalD: 4 + 
NonTemporalE: 0 + NonTemporalMXSA: 0 + NonTemporalMXSB: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 256 + NumGlobalWriteVectorsPerThread: 256 + NumLdsBlk: 2 + NumLoadsA: 8 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsCoalescedMXSA: 1 + NumLoadsCoalescedMXSB: 1 + NumLoadsMXSA: 2 + NumLoadsMXSB: 2 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 8 + NumLoadsPerpendicularMXSA: 2 + NumLoadsPerpendicularMXSB: 2 + NumThreads: 256 + NumTotalPackedLoadsA: -1 + NumTotalPackedLoadsB: -1 + NumTotalPackedLoadsMXSA: -1 + NumTotalPackedLoadsMXSB: -1 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGROverBarrier: 0 + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 3 + SolutionNameMin: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT256x256x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT8_8_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM16_WGMXCC2_WGMXCCGn1 + SourceSwap: false + SpaceFillingAlgo: [] + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: true + StoreSyncOpt: 0 + StoreVectorWidth: 4 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 
+ SubGroupMXSA: 8 + SubGroupMXSB: 32 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: 0 + TailloopInNll: false + ThreadTile: [1, 1] + ThreadTile0: 32 + ThreadTile1: 8 + ThreadTileA: 32 + ThreadTileB: 8 + ThreadTileMXSA: 32 + ThreadTileMXSB: 8 + TransposeLDS: 1 + TransposeLDSMetadata: -1 + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMXSA: true + UnrollMajorLDSMXSB: true + Use64bShadowLimit: false + Use64bShadowLimitMX: false + UseCustomMainLoopSchedule: 0 + UseDirect32XEmulation: false + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: false + UseGeneralizedNLCOneA: false + UseGeneralizedNLCOneB: false + UseGeneralizedNLCOneMXSA: false + UseGeneralizedNLCOneMXSB: false + UseGeneralizedNLCOneMetadata: false + UseInstOffsetForGRO: 0 + UseMFMAF32XEmulation: false + UsePLRPack: 0 + UseSgprForGRO: -1 + UseSubtileImpl: true + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 1 + VectorWidthMXSA: 1 + VectorWidthMXSB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMXSA: 0 + WaveSeparateGlobalReadMXSB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 16 + WorkGroupMappingXCC: 2 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 256 + _DepthUA: 256 + _DepthUB: 256 + _DepthUMXSA: 8 + _DepthUMXSB: 8 + _DepthUMetadata: 256 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + enableLDSTrMXSA: false + enableLDSTrMXSB: false + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + tailLoopOptMXSA: false + tailLoopOptMXSB: false + - 1LDSBuffer: 0 + 
ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AdaptiveGemm: 0 + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1DivByMT1LowbitGT1: 0 + AssertFree1ElementMultiple: 1 + AssertKRingShiftTailWrapOnly: 0 + AssertSummationElementMultiple: 256 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BAddrInterleave: false + BaseName: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT64yR3-UwlktRfU59ACgTfmpcg_Yk6f2Lb1knA4QU-BsnY= + BufferLoad: true + BufferStore: 1 + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 256 + DirectToLds: 1 + DirectToLdsA: true + DirectToLdsB: true + DirectToLdsMXSA: true + DirectToLdsMXSB: true + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprMXSA: false + DirectToVgprMXSB: false + DirectToVgprSparseMetadata: false + DtlPlusLdsBuf: 0 + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: false + ExpertSchedulingMode: 0 + ExtraLatencyForLR: 0 + ExtraMiLatencyLeft: -1 + ForceDisableShadowInit: false + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 32 + GlobalReadVectorWidthB: 32 + GlobalReadVectorWidthMXSA: 2 + GlobalReadVectorWidthMXSB: 4 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + KRingShift: false + Kernel: true + KernelLanguage: Assembly + KernelNameMin: 
Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT64x128x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT2_4_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 256 + LSCB: 256 + LSCMXSA: 8 + LSCMXSB: 8 + LSPA: 32 + LSPB: 32 + LSPMXSA: 64 + LSPMXSB: 128 + LVCA: 8 + LVCB: 8 + LVCMXSA: 4 + LVCMXSB: 2 + LVPA: 1 + LVPB: 1 + LVPMXSA: 32 + LVPMXSB: 32 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsBlockSizePerPadMXSA: 0 + LdsBlockSizePerPadMXSB: 0 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 58880 + LdsInitCVgprs: false + LdsNumBytes: 58880 + LdsNumElementsAlignedA: 8192 + LdsNumElementsAlignedB: 16384 + LdsNumElementsAlignedMXSA: 512 + LdsNumElementsAlignedMXSB: 1024 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 32768 + LdsOffsetB: 9728 + LdsOffsetB_Blk: 42496 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMXSA: 8192 + LdsOffsetMXSA_Blk: 40960 + LdsOffsetMXSB: 8704 + LdsOffsetMXSB_Blk: 41472 + LdsOffsetMetadata: 9728 + LdsOffsetMetadata_Blk: 42496 + LdsPadA: 0 + LdsPadB: 0 + LdsPadMXSA: 0 + LdsPadMXSB: 0 + LdsPadMetadata: 0 + LocalReadVectorWidth: -1 + LocalReadVectorWidthA: 32 + LocalReadVectorWidthB: 32 + LocalReadVectorWidthMXSA: 1 + LocalReadVectorWidthMXSB: 1 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LocalWriteUseSgprMXSA: true + LocalWriteUseSgprMXSB: true + LoopIters: 2 + LoopUnroll: 256 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 
32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMXSA: 1 + MIInputPerThreadMXSB: 1 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [2, 4] + MIWaveTileA: 2 + MIWaveTileB: 4 + MIWaveTileMXSA: 2 + MIWaveTileMXSB: 4 + MIWaveTileMetadata: 0 + MacroTile0: 64 + MacroTile1: 128 + MacroTileA: 64 + MacroTileB: 128 + MacroTileMXSA: 64 + MacroTileMXSB: 128 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + MinGRIncPerMfma: 3 + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonDTLTailLoopMXSA: true + NonDTLTailLoopMXSB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 4 + NonTemporalC: 0 + NonTemporalD: 4 + NonTemporalE: 0 + NonTemporalMXSA: 0 + NonTemporalMXSB: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 32 + NumLdsBlk: 2 + NumLoadsA: 2 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsCoalescedMXSA: 1 + NumLoadsCoalescedMXSB: 1 + NumLoadsMXSA: 1 + NumLoadsMXSB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 4 + NumLoadsPerpendicularMXSA: 1 + NumLoadsPerpendicularMXSB: 1 + NumThreads: 256 + NumTotalPackedLoadsA: -1 + NumTotalPackedLoadsB: -1 + NumTotalPackedLoadsMXSA: -1 + NumTotalPackedLoadsMXSB: -1 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGROverBarrier: 0 + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 4 + SolutionNameMin: 
Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT64x128x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT2_4_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM16_WGMXCC2_WGMXCCGn1 + SourceSwap: false + SpaceFillingAlgo: [] + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 4 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SubGroupMXSA: 8 + SubGroupMXSB: 32 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: 0 + TailloopInNll: false + ThreadTile: [1, 1] + ThreadTile0: 8 + ThreadTile1: 4 + ThreadTileA: 8 + ThreadTileB: 4 + ThreadTileMXSA: 8 + ThreadTileMXSB: 4 + TransposeLDS: 1 + TransposeLDSMetadata: -1 + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMXSA: true + UnrollMajorLDSMXSB: true + Use64bShadowLimit: false + Use64bShadowLimitMX: false + UseCustomMainLoopSchedule: 0 + UseDirect32XEmulation: false + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: false + UseGeneralizedNLCOneA: false + UseGeneralizedNLCOneB: false + UseGeneralizedNLCOneMXSA: false + UseGeneralizedNLCOneMXSB: false + UseGeneralizedNLCOneMetadata: false + UseInstOffsetForGRO: 0 + UseMFMAF32XEmulation: false + UsePLRPack: 0 + UseSgprForGRO: -1 + UseSubtileImpl: true + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 1 + 
VectorWidthMXSA: 1 + VectorWidthMXSB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMXSA: 0 + WaveSeparateGlobalReadMXSB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 16 + WorkGroupMappingXCC: 2 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 256 + _DepthUA: 256 + _DepthUB: 256 + _DepthUMXSA: 8 + _DepthUMXSB: 8 + _DepthUMetadata: 256 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + enableLDSTrMXSA: false + enableLDSTrMXSB: false + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + tailLoopOptMXSA: false + tailLoopOptMXSB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AdaptiveGemm: 0 + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1DivByMT1LowbitGT1: 0 + AssertFree1ElementMultiple: 1 + AssertKRingShiftTailWrapOnly: 0 + AssertSummationElementMultiple: 256 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BAddrInterleave: false + BaseName: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT12ZclQEHdRRO1sBJL3TwidM33yIt-WYnDtDgY9P9JYRX4= + BufferLoad: true + BufferStore: 1 + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 256 + DirectToLds: 1 + DirectToLdsA: true + DirectToLdsB: true + DirectToLdsMXSA: true + DirectToLdsMXSB: true + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprMXSA: false + DirectToVgprMXSB: false + DirectToVgprSparseMetadata: false + DtlPlusLdsBuf: 0 
+ EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: false + ExpertSchedulingMode: 0 + ExtraLatencyForLR: 0 + ExtraMiLatencyLeft: -1 + ForceDisableShadowInit: false + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 32 + GlobalReadVectorWidthB: 32 + GlobalReadVectorWidthMXSA: 4 + GlobalReadVectorWidthMXSB: 2 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + KRingShift: false + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT128x64x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT4_2_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 256 + LSCB: 256 + LSCMXSA: 8 + LSCMXSB: 8 + LSPA: 32 + LSPB: 32 + LSPMXSA: 128 + LSPMXSB: 64 + LVCA: 8 + LVCB: 8 + LVCMXSA: 2 + LVCMXSB: 4 + LVPA: 1 + LVPB: 1 + LVPMXSA: 32 + LVPMXSB: 32 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsBlockSizePerPadMXSA: 0 + LdsBlockSizePerPadMXSB: 0 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 58880 + LdsInitCVgprs: false + LdsNumBytes: 58880 + LdsNumElementsAlignedA: 16384 + 
LdsNumElementsAlignedB: 8192 + LdsNumElementsAlignedMXSA: 1024 + LdsNumElementsAlignedMXSB: 512 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 32768 + LdsOffsetB: 17920 + LdsOffsetB_Blk: 50688 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMXSA: 16384 + LdsOffsetMXSA_Blk: 49152 + LdsOffsetMXSB: 17408 + LdsOffsetMXSB_Blk: 50176 + LdsOffsetMetadata: 17920 + LdsOffsetMetadata_Blk: 50688 + LdsPadA: 0 + LdsPadB: 0 + LdsPadMXSA: 0 + LdsPadMXSB: 0 + LdsPadMetadata: 0 + LocalReadVectorWidth: -1 + LocalReadVectorWidthA: 32 + LocalReadVectorWidthB: 32 + LocalReadVectorWidthMXSA: 1 + LocalReadVectorWidthMXSB: 1 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LocalWriteUseSgprMXSA: true + LocalWriteUseSgprMXSB: true + LoopIters: 2 + LoopUnroll: 256 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMXSA: 1 + MIInputPerThreadMXSB: 1 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 2] + MIWaveTileA: 4 + MIWaveTileB: 2 + MIWaveTileMXSA: 4 + MIWaveTileMXSB: 2 + MIWaveTileMetadata: 0 + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileMXSA: 128 + MacroTileMXSB: 64 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + MinGRIncPerMfma: 3 + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonDTLTailLoopMXSA: true + NonDTLTailLoopMXSB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 4 + NonTemporalC: 0 + NonTemporalD: 4 + NonTemporalE: 0 + NonTemporalMXSA: 0 + 
NonTemporalMXSB: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 32 + NumLdsBlk: 2 + NumLoadsA: 4 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsCoalescedMXSA: 1 + NumLoadsCoalescedMXSB: 1 + NumLoadsMXSA: 1 + NumLoadsMXSB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularMXSA: 1 + NumLoadsPerpendicularMXSB: 1 + NumThreads: 256 + NumTotalPackedLoadsA: -1 + NumTotalPackedLoadsB: -1 + NumTotalPackedLoadsMXSA: -1 + NumTotalPackedLoadsMXSB: -1 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGROverBarrier: 0 + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 5 + SolutionNameMin: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT128x64x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT4_2_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM16_WGMXCC2_WGMXCCGn1 + SourceSwap: false + SpaceFillingAlgo: [] + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 4 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SubGroupMXSA: 8 + SubGroupMXSB: 32 + 
SuppressNoLoadLoop: false + SwapGlobalReadOrder: 0 + TailloopInNll: false + ThreadTile: [1, 1] + ThreadTile0: 16 + ThreadTile1: 2 + ThreadTileA: 16 + ThreadTileB: 2 + ThreadTileMXSA: 16 + ThreadTileMXSB: 2 + TransposeLDS: 1 + TransposeLDSMetadata: -1 + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMXSA: true + UnrollMajorLDSMXSB: true + Use64bShadowLimit: false + Use64bShadowLimitMX: false + UseCustomMainLoopSchedule: 0 + UseDirect32XEmulation: false + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: false + UseGeneralizedNLCOneA: false + UseGeneralizedNLCOneB: false + UseGeneralizedNLCOneMXSA: false + UseGeneralizedNLCOneMXSB: false + UseGeneralizedNLCOneMetadata: false + UseInstOffsetForGRO: 0 + UseMFMAF32XEmulation: false + UsePLRPack: 0 + UseSgprForGRO: -1 + UseSubtileImpl: true + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 1 + VectorWidthMXSA: 1 + VectorWidthMXSB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMXSA: 0 + WaveSeparateGlobalReadMXSB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 16 + WorkGroupMappingXCC: 2 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 256 + _DepthUA: 256 + _DepthUB: 256 + _DepthUMXSA: 8 + _DepthUMXSB: 8 + _DepthUMetadata: 256 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + enableLDSTrMXSA: false + enableLDSTrMXSB: false + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + tailLoopOptMXSA: false + tailLoopOptMXSB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: false 
+ ActivationFused: true + AdaptiveGemm: 0 + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1DivByMT1LowbitGT1: 0 + AssertFree1ElementMultiple: 1 + AssertKRingShiftTailWrapOnly: 0 + AssertSummationElementMultiple: 256 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BAddrInterleave: false + BaseName: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT64hJRTC6QzyytWkHaKB_tyuSMxvHfrHLHmDLGhNVVcILE= + BufferLoad: true + BufferStore: 1 + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 256 + DirectToLds: 1 + DirectToLdsA: true + DirectToLdsB: true + DirectToLdsMXSA: true + DirectToLdsMXSB: true + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprMXSA: false + DirectToVgprMXSB: false + DirectToVgprSparseMetadata: false + DtlPlusLdsBuf: 0 + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: false + ExpertSchedulingMode: 0 + ExtraLatencyForLR: 0 + ExtraMiLatencyLeft: -1 + ForceDisableShadowInit: false + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 32 + GlobalReadVectorWidthB: 32 + GlobalReadVectorWidthMXSA: 2 + GlobalReadVectorWidthMXSB: 6 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + KRingShift: false + Kernel: true + KernelLanguage: Assembly + KernelNameMin: 
Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT64x192x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT2_6_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 256 + LSCB: 256 + LSCMXSA: 8 + LSCMXSB: 8 + LSPA: 32 + LSPB: 32 + LSPMXSA: 64 + LSPMXSB: 192 + LVCA: 8 + LVCB: 8 + LVCMXSA: 4 + LVCMXSB: 2 + LVPA: 1 + LVPB: 1 + LVPMXSA: 32 + LVPMXSB: 32 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsBlockSizePerPadMXSA: 0 + LdsBlockSizePerPadMXSB: 0 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 100352 + LdsInitCVgprs: false + LdsNumBytes: 100352 + LdsNumElementsAlignedA: 8192 + LdsNumElementsAlignedB: 24576 + LdsNumElementsAlignedMXSA: 512 + LdsNumElementsAlignedMXSB: 1536 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 10240 + LdsOffsetB_Blk: 75776 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMXSA: 8192 + LdsOffsetMXSA_Blk: 73728 + LdsOffsetMXSB: 8704 + LdsOffsetMXSB_Blk: 74240 + LdsOffsetMetadata: 10240 + LdsOffsetMetadata_Blk: 75776 + LdsPadA: 0 + LdsPadB: 0 + LdsPadMXSA: 0 + LdsPadMXSB: 0 + LdsPadMetadata: 0 + LocalReadVectorWidth: -1 + LocalReadVectorWidthA: 32 + LocalReadVectorWidthB: 32 + LocalReadVectorWidthMXSA: 1 + LocalReadVectorWidthMXSB: 1 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LocalWriteUseSgprMXSA: true + LocalWriteUseSgprMXSB: true + LoopIters: 2 + LoopUnroll: 256 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + 
MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMXSA: 1 + MIInputPerThreadMXSB: 1 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [2, 6] + MIWaveTileA: 2 + MIWaveTileB: 6 + MIWaveTileMXSA: 2 + MIWaveTileMXSB: 6 + MIWaveTileMetadata: 0 + MacroTile0: 64 + MacroTile1: 192 + MacroTileA: 64 + MacroTileB: 192 + MacroTileMXSA: 64 + MacroTileMXSB: 192 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + MinGRIncPerMfma: 3 + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonDTLTailLoopMXSA: true + NonDTLTailLoopMXSB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 4 + NonTemporalC: 0 + NonTemporalD: 4 + NonTemporalE: 0 + NonTemporalMXSA: 0 + NonTemporalMXSB: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 48 + NumGlobalWriteVectorsPerThread: 48 + NumLdsBlk: 2 + NumLoadsA: 2 + NumLoadsB: 6 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsCoalescedMXSA: 1 + NumLoadsCoalescedMXSB: 1 + NumLoadsMXSA: 1 + NumLoadsMXSB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 6 + NumLoadsPerpendicularMXSA: 1 + NumLoadsPerpendicularMXSB: 1 + NumThreads: 256 + NumTotalPackedLoadsA: -1 + NumTotalPackedLoadsB: -1 + NumTotalPackedLoadsMXSA: -1 + NumTotalPackedLoadsMXSB: -1 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGROverBarrier: 0 + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 6 + SolutionNameMin: 
Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT64x192x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT2_6_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM16_WGMXCC2_WGMXCCGn1 + SourceSwap: false + SpaceFillingAlgo: [] + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 4 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SubGroupMXSA: 8 + SubGroupMXSB: 32 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: 0 + TailloopInNll: false + ThreadTile: [1, 1] + ThreadTile0: 8 + ThreadTile1: 6 + ThreadTileA: 8 + ThreadTileB: 6 + ThreadTileMXSA: 8 + ThreadTileMXSB: 6 + TransposeLDS: 1 + TransposeLDSMetadata: -1 + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMXSA: true + UnrollMajorLDSMXSB: true + Use64bShadowLimit: false + Use64bShadowLimitMX: false + UseCustomMainLoopSchedule: 0 + UseDirect32XEmulation: false + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: false + UseGeneralizedNLCOneA: false + UseGeneralizedNLCOneB: false + UseGeneralizedNLCOneMXSA: false + UseGeneralizedNLCOneMXSB: false + UseGeneralizedNLCOneMetadata: false + UseInstOffsetForGRO: 0 + UseMFMAF32XEmulation: false + UsePLRPack: 0 + UseSgprForGRO: -1 + UseSubtileImpl: true + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 1 + 
VectorWidthMXSA: 1 + VectorWidthMXSB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMXSA: 0 + WaveSeparateGlobalReadMXSB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 16 + WorkGroupMappingXCC: 2 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 256 + _DepthUA: 256 + _DepthUB: 256 + _DepthUMXSA: 8 + _DepthUMXSB: 8 + _DepthUMetadata: 256 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + enableLDSTrMXSA: false + enableLDSTrMXSB: false + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + tailLoopOptMXSA: false + tailLoopOptMXSB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AdaptiveGemm: 0 + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1DivByMT1LowbitGT1: 0 + AssertFree1ElementMultiple: 1 + AssertKRingShiftTailWrapOnly: 0 + AssertSummationElementMultiple: 256 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BAddrInterleave: false + BaseName: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT19UCUD2vak2YLHvH_Ztg5zXPDJUlX3NzBJWRuGhvKVzu8= + BufferLoad: true + BufferStore: 1 + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 256 + DirectToLds: 1 + DirectToLdsA: true + DirectToLdsB: true + DirectToLdsMXSA: true + DirectToLdsMXSB: true + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprMXSA: false + DirectToVgprMXSB: false + DirectToVgprSparseMetadata: false + DtlPlusLdsBuf: 0 
+ EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: false + ExpertSchedulingMode: 0 + ExtraLatencyForLR: 0 + ExtraMiLatencyLeft: -1 + ForceDisableShadowInit: false + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 32 + GlobalReadVectorWidthB: 32 + GlobalReadVectorWidthMXSA: 6 + GlobalReadVectorWidthMXSB: 2 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + KRingShift: false + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT192x64x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT6_2_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 256 + LSCB: 256 + LSCMXSA: 8 + LSCMXSB: 8 + LSPA: 32 + LSPB: 32 + LSPMXSA: 192 + LSPMXSB: 64 + LVCA: 8 + LVCB: 8 + LVCMXSA: 2 + LVCMXSB: 4 + LVPA: 1 + LVPB: 1 + LVPMXSA: 32 + LVPMXSB: 32 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsBlockSizePerPadMXSA: 0 + LdsBlockSizePerPadMXSB: 0 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 100352 + LdsInitCVgprs: false + LdsNumBytes: 100352 + LdsNumElementsAlignedA: 24576 + 
LdsNumElementsAlignedB: 8192 + LdsNumElementsAlignedMXSA: 1536 + LdsNumElementsAlignedMXSB: 512 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 26624 + LdsOffsetB_Blk: 92160 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMXSA: 24576 + LdsOffsetMXSA_Blk: 90112 + LdsOffsetMXSB: 26112 + LdsOffsetMXSB_Blk: 91648 + LdsOffsetMetadata: 26624 + LdsOffsetMetadata_Blk: 92160 + LdsPadA: 0 + LdsPadB: 0 + LdsPadMXSA: 0 + LdsPadMXSB: 0 + LdsPadMetadata: 0 + LocalReadVectorWidth: -1 + LocalReadVectorWidthA: 32 + LocalReadVectorWidthB: 32 + LocalReadVectorWidthMXSA: 1 + LocalReadVectorWidthMXSB: 1 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LocalWriteUseSgprMXSA: true + LocalWriteUseSgprMXSB: true + LoopIters: 2 + LoopUnroll: 256 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMXSA: 1 + MIInputPerThreadMXSB: 1 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [6, 2] + MIWaveTileA: 6 + MIWaveTileB: 2 + MIWaveTileMXSA: 6 + MIWaveTileMXSB: 2 + MIWaveTileMetadata: 0 + MacroTile0: 192 + MacroTile1: 64 + MacroTileA: 192 + MacroTileB: 64 + MacroTileMXSA: 192 + MacroTileMXSB: 64 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + MinGRIncPerMfma: 3 + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonDTLTailLoopMXSA: true + NonDTLTailLoopMXSB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 4 + NonTemporalC: 0 + NonTemporalD: 4 + NonTemporalE: 0 + NonTemporalMXSA: 0 + 
NonTemporalMXSB: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 48 + NumGlobalWriteVectorsPerThread: 48 + NumLdsBlk: 2 + NumLoadsA: 6 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsCoalescedMXSA: 1 + NumLoadsCoalescedMXSB: 1 + NumLoadsMXSA: 1 + NumLoadsMXSB: 1 + NumLoadsPerpendicularA: 6 + NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularMXSA: 1 + NumLoadsPerpendicularMXSB: 1 + NumThreads: 256 + NumTotalPackedLoadsA: -1 + NumTotalPackedLoadsB: -1 + NumTotalPackedLoadsMXSA: -1 + NumTotalPackedLoadsMXSB: -1 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGROverBarrier: 0 + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 7 + SolutionNameMin: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT192x64x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT6_2_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM16_WGMXCC2_WGMXCCGn1 + SourceSwap: false + SpaceFillingAlgo: [] + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 4 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SubGroupMXSA: 8 + SubGroupMXSB: 32 + 
SuppressNoLoadLoop: false + SwapGlobalReadOrder: 0 + TailloopInNll: false + ThreadTile: [1, 1] + ThreadTile0: 24 + ThreadTile1: 2 + ThreadTileA: 24 + ThreadTileB: 2 + ThreadTileMXSA: 24 + ThreadTileMXSB: 2 + TransposeLDS: 1 + TransposeLDSMetadata: -1 + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMXSA: true + UnrollMajorLDSMXSB: true + Use64bShadowLimit: false + Use64bShadowLimitMX: false + UseCustomMainLoopSchedule: 0 + UseDirect32XEmulation: false + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: false + UseGeneralizedNLCOneA: false + UseGeneralizedNLCOneB: false + UseGeneralizedNLCOneMXSA: false + UseGeneralizedNLCOneMXSB: false + UseGeneralizedNLCOneMetadata: false + UseInstOffsetForGRO: 0 + UseMFMAF32XEmulation: false + UsePLRPack: 0 + UseSgprForGRO: -1 + UseSubtileImpl: true + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 1 + VectorWidthMXSA: 1 + VectorWidthMXSB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMXSA: 0 + WaveSeparateGlobalReadMXSB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 16 + WorkGroupMappingXCC: 2 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 256 + _DepthUA: 256 + _DepthUB: 256 + _DepthUMXSA: 8 + _DepthUMXSB: 8 + _DepthUMetadata: 256 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + enableLDSTrMXSA: false + enableLDSTrMXSB: false + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + tailLoopOptMXSA: false + tailLoopOptMXSB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: false 
+ ActivationFused: true + AdaptiveGemm: 0 + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1DivByMT1LowbitGT1: 0 + AssertFree1ElementMultiple: 1 + AssertKRingShiftTailWrapOnly: 0 + AssertSummationElementMultiple: 256 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BAddrInterleave: false + BaseName: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT64sCK37FdA-O3yvoTtVtg3QFjEpjbAv0RNmVuI_Ry968A= + BufferLoad: true + BufferStore: 1 + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 256 + DirectToLds: 1 + DirectToLdsA: true + DirectToLdsB: true + DirectToLdsMXSA: true + DirectToLdsMXSB: true + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprMXSA: false + DirectToVgprMXSB: false + DirectToVgprSparseMetadata: false + DtlPlusLdsBuf: 0 + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: false + ExpertSchedulingMode: 0 + ExtraLatencyForLR: 0 + ExtraMiLatencyLeft: -1 + ForceDisableShadowInit: false + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 32 + GlobalReadVectorWidthB: 32 + GlobalReadVectorWidthMXSA: 2 + GlobalReadVectorWidthMXSB: 4 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + KRingShift: false + Kernel: true + KernelLanguage: Assembly + KernelNameMin: 
Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT64x256x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT2_8_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 256 + LSCB: 256 + LSCMXSA: 8 + LSCMXSB: 8 + LSPA: 32 + LSPB: 32 + LSPMXSA: 64 + LSPMXSB: 128 + LVCA: 8 + LVCB: 8 + LVCMXSA: 4 + LVCMXSB: 2 + LVPA: 1 + LVPB: 1 + LVPMXSA: 32 + LVPMXSB: 32 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsBlockSizePerPadMXSA: 0 + LdsBlockSizePerPadMXSB: 0 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 109056 + LdsInitCVgprs: false + LdsNumBytes: 109056 + LdsNumElementsAlignedA: 8192 + LdsNumElementsAlignedB: 32768 + LdsNumElementsAlignedMXSA: 512 + LdsNumElementsAlignedMXSB: 2048 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 10752 + LdsOffsetB_Blk: 76288 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMXSA: 8192 + LdsOffsetMXSA_Blk: 73728 + LdsOffsetMXSB: 8704 + LdsOffsetMXSB_Blk: 74240 + LdsOffsetMetadata: 10752 + LdsOffsetMetadata_Blk: 76288 + LdsPadA: 0 + LdsPadB: 0 + LdsPadMXSA: 0 + LdsPadMXSB: 0 + LdsPadMetadata: 0 + LocalReadVectorWidth: -1 + LocalReadVectorWidthA: 32 + LocalReadVectorWidthB: 32 + LocalReadVectorWidthMXSA: 1 + LocalReadVectorWidthMXSB: 1 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LocalWriteUseSgprMXSA: true + LocalWriteUseSgprMXSB: true + LoopIters: 2 + LoopUnroll: 256 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + 
MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMXSA: 1 + MIInputPerThreadMXSB: 1 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [2, 8] + MIWaveTileA: 2 + MIWaveTileB: 8 + MIWaveTileMXSA: 2 + MIWaveTileMXSB: 8 + MIWaveTileMetadata: 0 + MacroTile0: 64 + MacroTile1: 256 + MacroTileA: 64 + MacroTileB: 256 + MacroTileMXSA: 64 + MacroTileMXSB: 256 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + MinGRIncPerMfma: 3 + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonDTLTailLoopMXSA: true + NonDTLTailLoopMXSB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 4 + NonTemporalC: 0 + NonTemporalD: 4 + NonTemporalE: 0 + NonTemporalMXSA: 0 + NonTemporalMXSB: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 64 + NumLdsBlk: 2 + NumLoadsA: 2 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsCoalescedMXSA: 1 + NumLoadsCoalescedMXSB: 1 + NumLoadsMXSA: 1 + NumLoadsMXSB: 2 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 8 + NumLoadsPerpendicularMXSA: 1 + NumLoadsPerpendicularMXSB: 2 + NumThreads: 256 + NumTotalPackedLoadsA: -1 + NumTotalPackedLoadsB: -1 + NumTotalPackedLoadsMXSA: -1 + NumTotalPackedLoadsMXSB: -1 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGROverBarrier: 0 + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 8 + SolutionNameMin: 
Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT64x256x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT2_8_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM16_WGMXCC2_WGMXCCGn1 + SourceSwap: false + SpaceFillingAlgo: [] + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 4 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SubGroupMXSA: 8 + SubGroupMXSB: 32 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: 0 + TailloopInNll: false + ThreadTile: [1, 1] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + ThreadTileMXSA: 8 + ThreadTileMXSB: 8 + TransposeLDS: 1 + TransposeLDSMetadata: -1 + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMXSA: true + UnrollMajorLDSMXSB: true + Use64bShadowLimit: false + Use64bShadowLimitMX: false + UseCustomMainLoopSchedule: 0 + UseDirect32XEmulation: false + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: false + UseGeneralizedNLCOneA: false + UseGeneralizedNLCOneB: false + UseGeneralizedNLCOneMXSA: false + UseGeneralizedNLCOneMXSB: false + UseGeneralizedNLCOneMetadata: false + UseInstOffsetForGRO: 0 + UseMFMAF32XEmulation: false + UsePLRPack: 0 + UseSgprForGRO: -1 + UseSubtileImpl: true + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 1 + 
VectorWidthMXSA: 1 + VectorWidthMXSB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMXSA: 0 + WaveSeparateGlobalReadMXSB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 16 + WorkGroupMappingXCC: 2 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 256 + _DepthUA: 256 + _DepthUB: 256 + _DepthUMXSA: 8 + _DepthUMXSB: 8 + _DepthUMetadata: 256 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + enableLDSTrMXSA: false + enableLDSTrMXSB: false + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + tailLoopOptMXSA: false + tailLoopOptMXSB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AdaptiveGemm: 0 + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1DivByMT1LowbitGT1: 0 + AssertFree1ElementMultiple: 1 + AssertKRingShiftTailWrapOnly: 0 + AssertSummationElementMultiple: 256 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BAddrInterleave: false + BaseName: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT25Tmym63Cp1OVwmuklAuBYLzHtSt0oUezVbK-tmV8hqYs= + BufferLoad: true + BufferStore: 1 + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 256 + DirectToLds: 1 + DirectToLdsA: true + DirectToLdsB: true + DirectToLdsMXSA: true + DirectToLdsMXSB: true + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprMXSA: false + DirectToVgprMXSB: false + DirectToVgprSparseMetadata: false + DtlPlusLdsBuf: 0 
+ EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: false + ExpertSchedulingMode: 0 + ExtraLatencyForLR: 0 + ExtraMiLatencyLeft: -1 + ForceDisableShadowInit: false + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 32 + GlobalReadVectorWidthB: 32 + GlobalReadVectorWidthMXSA: 4 + GlobalReadVectorWidthMXSB: 2 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + KRingShift: false + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT256x64x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT8_2_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 256 + LSCB: 256 + LSCMXSA: 8 + LSCMXSB: 8 + LSPA: 32 + LSPB: 32 + LSPMXSA: 128 + LSPMXSB: 64 + LVCA: 8 + LVCB: 8 + LVCMXSA: 2 + LVCMXSB: 4 + LVPA: 1 + LVPB: 1 + LVPMXSA: 32 + LVPMXSB: 32 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsBlockSizePerPadMXSA: 0 + LdsBlockSizePerPadMXSB: 0 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 109056 + LdsInitCVgprs: false + LdsNumBytes: 109056 + LdsNumElementsAlignedA: 32768 + 
LdsNumElementsAlignedB: 8192 + LdsNumElementsAlignedMXSA: 2048 + LdsNumElementsAlignedMXSB: 512 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 35328 + LdsOffsetB_Blk: 100864 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMXSA: 32768 + LdsOffsetMXSA_Blk: 98304 + LdsOffsetMXSB: 34816 + LdsOffsetMXSB_Blk: 100352 + LdsOffsetMetadata: 35328 + LdsOffsetMetadata_Blk: 100864 + LdsPadA: 0 + LdsPadB: 0 + LdsPadMXSA: 0 + LdsPadMXSB: 0 + LdsPadMetadata: 0 + LocalReadVectorWidth: -1 + LocalReadVectorWidthA: 32 + LocalReadVectorWidthB: 32 + LocalReadVectorWidthMXSA: 1 + LocalReadVectorWidthMXSB: 1 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LocalWriteUseSgprMXSA: true + LocalWriteUseSgprMXSB: true + LoopIters: 2 + LoopUnroll: 256 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMXSA: 1 + MIInputPerThreadMXSB: 1 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [8, 2] + MIWaveTileA: 8 + MIWaveTileB: 2 + MIWaveTileMXSA: 8 + MIWaveTileMXSB: 2 + MIWaveTileMetadata: 0 + MacroTile0: 256 + MacroTile1: 64 + MacroTileA: 256 + MacroTileB: 64 + MacroTileMXSA: 256 + MacroTileMXSB: 64 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + MinGRIncPerMfma: 3 + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonDTLTailLoopMXSA: true + NonDTLTailLoopMXSB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 4 + NonTemporalC: 0 + NonTemporalD: 4 + NonTemporalE: 0 + NonTemporalMXSA: 0 + 
NonTemporalMXSB: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 64 + NumLdsBlk: 2 + NumLoadsA: 8 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsCoalescedMXSA: 1 + NumLoadsCoalescedMXSB: 1 + NumLoadsMXSA: 2 + NumLoadsMXSB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularMXSA: 2 + NumLoadsPerpendicularMXSB: 1 + NumThreads: 256 + NumTotalPackedLoadsA: -1 + NumTotalPackedLoadsB: -1 + NumTotalPackedLoadsMXSA: -1 + NumTotalPackedLoadsMXSB: -1 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGROverBarrier: 0 + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 9 + SolutionNameMin: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT256x64x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT8_2_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM16_WGMXCC2_WGMXCCGn1 + SourceSwap: false + SpaceFillingAlgo: [] + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 4 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SubGroupMXSA: 8 + SubGroupMXSB: 32 + 
SuppressNoLoadLoop: false + SwapGlobalReadOrder: 0 + TailloopInNll: false + ThreadTile: [1, 1] + ThreadTile0: 32 + ThreadTile1: 2 + ThreadTileA: 32 + ThreadTileB: 2 + ThreadTileMXSA: 32 + ThreadTileMXSB: 2 + TransposeLDS: 1 + TransposeLDSMetadata: -1 + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMXSA: true + UnrollMajorLDSMXSB: true + Use64bShadowLimit: false + Use64bShadowLimitMX: false + UseCustomMainLoopSchedule: 0 + UseDirect32XEmulation: false + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: false + UseGeneralizedNLCOneA: false + UseGeneralizedNLCOneB: false + UseGeneralizedNLCOneMXSA: false + UseGeneralizedNLCOneMXSB: false + UseGeneralizedNLCOneMetadata: false + UseInstOffsetForGRO: 0 + UseMFMAF32XEmulation: false + UsePLRPack: 0 + UseSgprForGRO: -1 + UseSubtileImpl: true + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 1 + VectorWidthMXSA: 1 + VectorWidthMXSB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMXSA: 0 + WaveSeparateGlobalReadMXSB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 16 + WorkGroupMappingXCC: 2 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 256 + _DepthUA: 256 + _DepthUB: 256 + _DepthUMXSA: 8 + _DepthUMXSB: 8 + _DepthUMetadata: 256 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + enableLDSTrMXSA: false + enableLDSTrMXSB: false + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + tailLoopOptMXSA: false + tailLoopOptMXSB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: false 
+ ActivationFused: true + AdaptiveGemm: 0 + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1DivByMT1LowbitGT1: 0 + AssertFree1ElementMultiple: 1 + AssertKRingShiftTailWrapOnly: 0 + AssertSummationElementMultiple: 256 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BAddrInterleave: false + BaseName: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT12Qs31lTSyFaUbDqXQHGpUViUgkCHLEWWfNifVlN3xPcA= + BufferLoad: true + BufferStore: 1 + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 256 + DirectToLds: 1 + DirectToLdsA: true + DirectToLdsB: true + DirectToLdsMXSA: true + DirectToLdsMXSB: true + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprMXSA: false + DirectToVgprMXSB: false + DirectToVgprSparseMetadata: false + DtlPlusLdsBuf: 0 + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: false + ExpertSchedulingMode: 0 + ExtraLatencyForLR: 0 + ExtraMiLatencyLeft: -1 + ForceDisableShadowInit: false + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 32 + GlobalReadVectorWidthB: 32 + GlobalReadVectorWidthMXSA: 4 + GlobalReadVectorWidthMXSB: 6 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + KRingShift: false + Kernel: true + KernelLanguage: Assembly + KernelNameMin: 
Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT128x192x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT4_6_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 256 + LSCB: 256 + LSCMXSA: 8 + LSCMXSB: 8 + LSPA: 32 + LSPB: 32 + LSPMXSA: 128 + LSPMXSB: 192 + LVCA: 8 + LVCB: 8 + LVCMXSA: 2 + LVCMXSB: 2 + LVPA: 1 + LVPB: 1 + LVPMXSA: 32 + LVPMXSB: 32 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsBlockSizePerPadMXSA: 0 + LdsBlockSizePerPadMXSB: 0 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 109056 + LdsInitCVgprs: false + LdsNumBytes: 109056 + LdsNumElementsAlignedA: 16384 + LdsNumElementsAlignedB: 24576 + LdsNumElementsAlignedMXSA: 1024 + LdsNumElementsAlignedMXSB: 1536 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 18944 + LdsOffsetB_Blk: 84480 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMXSA: 16384 + LdsOffsetMXSA_Blk: 81920 + LdsOffsetMXSB: 17408 + LdsOffsetMXSB_Blk: 82944 + LdsOffsetMetadata: 18944 + LdsOffsetMetadata_Blk: 84480 + LdsPadA: 0 + LdsPadB: 0 + LdsPadMXSA: 0 + LdsPadMXSB: 0 + LdsPadMetadata: 0 + LocalReadVectorWidth: -1 + LocalReadVectorWidthA: 32 + LocalReadVectorWidthB: 32 + LocalReadVectorWidthMXSA: 1 + LocalReadVectorWidthMXSB: 1 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LocalWriteUseSgprMXSA: true + LocalWriteUseSgprMXSB: true + LoopIters: 2 + LoopUnroll: 256 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + 
MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMXSA: 1 + MIInputPerThreadMXSB: 1 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 6] + MIWaveTileA: 4 + MIWaveTileB: 6 + MIWaveTileMXSA: 4 + MIWaveTileMXSB: 6 + MIWaveTileMetadata: 0 + MacroTile0: 128 + MacroTile1: 192 + MacroTileA: 128 + MacroTileB: 192 + MacroTileMXSA: 128 + MacroTileMXSB: 192 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + MinGRIncPerMfma: 3 + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonDTLTailLoopMXSA: true + NonDTLTailLoopMXSB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 4 + NonTemporalC: 0 + NonTemporalD: 4 + NonTemporalE: 0 + NonTemporalMXSA: 0 + NonTemporalMXSB: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 96 + NumGlobalWriteVectorsPerThread: 96 + NumLdsBlk: 2 + NumLoadsA: 4 + NumLoadsB: 6 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsCoalescedMXSA: 1 + NumLoadsCoalescedMXSB: 1 + NumLoadsMXSA: 1 + NumLoadsMXSB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 6 + NumLoadsPerpendicularMXSA: 1 + NumLoadsPerpendicularMXSB: 1 + NumThreads: 256 + NumTotalPackedLoadsA: -1 + NumTotalPackedLoadsB: -1 + NumTotalPackedLoadsMXSA: -1 + NumTotalPackedLoadsMXSB: -1 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGROverBarrier: 0 + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 10 + 
SolutionNameMin: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT128x192x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT4_6_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM16_WGMXCC2_WGMXCCGn1 + SourceSwap: false + SpaceFillingAlgo: [] + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 4 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SubGroupMXSA: 8 + SubGroupMXSB: 32 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: 0 + TailloopInNll: false + ThreadTile: [1, 1] + ThreadTile0: 16 + ThreadTile1: 6 + ThreadTileA: 16 + ThreadTileB: 6 + ThreadTileMXSA: 16 + ThreadTileMXSB: 6 + TransposeLDS: 1 + TransposeLDSMetadata: -1 + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMXSA: true + UnrollMajorLDSMXSB: true + Use64bShadowLimit: false + Use64bShadowLimitMX: false + UseCustomMainLoopSchedule: 0 + UseDirect32XEmulation: false + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: false + UseGeneralizedNLCOneA: false + UseGeneralizedNLCOneB: false + UseGeneralizedNLCOneMXSA: false + UseGeneralizedNLCOneMXSB: false + UseGeneralizedNLCOneMetadata: false + UseInstOffsetForGRO: 0 + UseMFMAF32XEmulation: false + UsePLRPack: 0 + UseSgprForGRO: -1 + UseSubtileImpl: true + Valid: true + VectorStore: -1 + VectorWidthA: 1 + 
VectorWidthB: 1 + VectorWidthMXSA: 1 + VectorWidthMXSB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMXSA: 0 + WaveSeparateGlobalReadMXSB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 16 + WorkGroupMappingXCC: 2 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 256 + _DepthUA: 256 + _DepthUB: 256 + _DepthUMXSA: 8 + _DepthUMXSB: 8 + _DepthUMetadata: 256 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + enableLDSTrMXSA: false + enableLDSTrMXSB: false + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + tailLoopOptMXSA: false + tailLoopOptMXSB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AdaptiveGemm: 0 + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1DivByMT1LowbitGT1: 0 + AssertFree1ElementMultiple: 1 + AssertKRingShiftTailWrapOnly: 0 + AssertSummationElementMultiple: 256 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BAddrInterleave: false + BaseName: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT191Q3CWl85SU-4agRfSbxDDRYJaTIDmYF_adG8G4wdfNo= + BufferLoad: true + BufferStore: 1 + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 256 + DirectToLds: 1 + DirectToLdsA: true + DirectToLdsB: true + DirectToLdsMXSA: true + DirectToLdsMXSB: true + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprMXSA: false + DirectToVgprMXSB: false + DirectToVgprSparseMetadata: false + 
DtlPlusLdsBuf: 0 + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: false + ExpertSchedulingMode: 0 + ExtraLatencyForLR: 0 + ExtraMiLatencyLeft: -1 + ForceDisableShadowInit: false + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 32 + GlobalReadVectorWidthB: 32 + GlobalReadVectorWidthMXSA: 6 + GlobalReadVectorWidthMXSB: 4 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + KRingShift: false + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT192x128x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT6_4_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 256 + LSCB: 256 + LSCMXSA: 8 + LSCMXSB: 8 + LSPA: 32 + LSPB: 32 + LSPMXSA: 192 + LSPMXSB: 128 + LVCA: 8 + LVCB: 8 + LVCMXSA: 2 + LVCMXSB: 2 + LVPA: 1 + LVPB: 1 + LVPMXSA: 32 + LVPMXSB: 32 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsBlockSizePerPadMXSA: 0 + LdsBlockSizePerPadMXSB: 0 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 109056 + LdsInitCVgprs: false + LdsNumBytes: 109056 + 
LdsNumElementsAlignedA: 24576 + LdsNumElementsAlignedB: 16384 + LdsNumElementsAlignedMXSA: 1536 + LdsNumElementsAlignedMXSB: 1024 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 27136 + LdsOffsetB_Blk: 92672 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMXSA: 24576 + LdsOffsetMXSA_Blk: 90112 + LdsOffsetMXSB: 26112 + LdsOffsetMXSB_Blk: 91648 + LdsOffsetMetadata: 27136 + LdsOffsetMetadata_Blk: 92672 + LdsPadA: 0 + LdsPadB: 0 + LdsPadMXSA: 0 + LdsPadMXSB: 0 + LdsPadMetadata: 0 + LocalReadVectorWidth: -1 + LocalReadVectorWidthA: 32 + LocalReadVectorWidthB: 32 + LocalReadVectorWidthMXSA: 1 + LocalReadVectorWidthMXSB: 1 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LocalWriteUseSgprMXSA: true + LocalWriteUseSgprMXSB: true + LoopIters: 2 + LoopUnroll: 256 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMXSA: 1 + MIInputPerThreadMXSB: 1 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [6, 4] + MIWaveTileA: 6 + MIWaveTileB: 4 + MIWaveTileMXSA: 6 + MIWaveTileMXSB: 4 + MIWaveTileMetadata: 0 + MacroTile0: 192 + MacroTile1: 128 + MacroTileA: 192 + MacroTileB: 128 + MacroTileMXSA: 192 + MacroTileMXSB: 128 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + MinGRIncPerMfma: 3 + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonDTLTailLoopMXSA: true + NonDTLTailLoopMXSB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 4 + NonTemporalC: 0 + NonTemporalD: 4 + NonTemporalE: 
0 + NonTemporalMXSA: 0 + NonTemporalMXSB: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 96 + NumGlobalWriteVectorsPerThread: 96 + NumLdsBlk: 2 + NumLoadsA: 6 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsCoalescedMXSA: 1 + NumLoadsCoalescedMXSB: 1 + NumLoadsMXSA: 1 + NumLoadsMXSB: 1 + NumLoadsPerpendicularA: 6 + NumLoadsPerpendicularB: 4 + NumLoadsPerpendicularMXSA: 1 + NumLoadsPerpendicularMXSB: 1 + NumThreads: 256 + NumTotalPackedLoadsA: -1 + NumTotalPackedLoadsB: -1 + NumTotalPackedLoadsMXSA: -1 + NumTotalPackedLoadsMXSB: -1 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGROverBarrier: 0 + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 11 + SolutionNameMin: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT192x128x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT6_4_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM16_WGMXCC2_WGMXCCGn1 + SourceSwap: false + SpaceFillingAlgo: [] + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 4 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + 
SubGroupMXSA: 8 + SubGroupMXSB: 32 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: 0 + TailloopInNll: false + ThreadTile: [1, 1] + ThreadTile0: 24 + ThreadTile1: 4 + ThreadTileA: 24 + ThreadTileB: 4 + ThreadTileMXSA: 24 + ThreadTileMXSB: 4 + TransposeLDS: 1 + TransposeLDSMetadata: -1 + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMXSA: true + UnrollMajorLDSMXSB: true + Use64bShadowLimit: false + Use64bShadowLimitMX: false + UseCustomMainLoopSchedule: 0 + UseDirect32XEmulation: false + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: false + UseGeneralizedNLCOneA: false + UseGeneralizedNLCOneB: false + UseGeneralizedNLCOneMXSA: false + UseGeneralizedNLCOneMXSB: false + UseGeneralizedNLCOneMetadata: false + UseInstOffsetForGRO: 0 + UseMFMAF32XEmulation: false + UsePLRPack: 0 + UseSgprForGRO: -1 + UseSubtileImpl: true + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 1 + VectorWidthMXSA: 1 + VectorWidthMXSB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMXSA: 0 + WaveSeparateGlobalReadMXSB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 16 + WorkGroupMappingXCC: 2 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 256 + _DepthUA: 256 + _DepthUB: 256 + _DepthUMXSA: 8 + _DepthUMXSB: 8 + _DepthUMetadata: 256 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + enableLDSTrMXSA: false + enableLDSTrMXSB: false + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + tailLoopOptMXSA: false + tailLoopOptMXSB: false + - 1LDSBuffer: 0 + 
ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AdaptiveGemm: 0 + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1DivByMT1LowbitGT1: 0 + AssertFree1ElementMultiple: 1 + AssertKRingShiftTailWrapOnly: 0 + AssertSummationElementMultiple: 256 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BAddrInterleave: false + BaseName: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT12kwdCHpnp9d41ufLMEwnWbNNXbmlKzUrCttuc0YSlgSY= + BufferLoad: true + BufferStore: 1 + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 256 + DirectToLds: 1 + DirectToLdsA: true + DirectToLdsB: true + DirectToLdsMXSA: true + DirectToLdsMXSB: true + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprMXSA: false + DirectToVgprMXSB: false + DirectToVgprSparseMetadata: false + DtlPlusLdsBuf: 0 + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: false + ExpertSchedulingMode: 0 + ExtraLatencyForLR: 0 + ExtraMiLatencyLeft: -1 + ForceDisableShadowInit: false + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 32 + GlobalReadVectorWidthB: 32 + GlobalReadVectorWidthMXSA: 4 + GlobalReadVectorWidthMXSB: 4 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + KRingShift: false + Kernel: true + KernelLanguage: Assembly + KernelNameMin: 
Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT128x256x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT4_8_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 256 + LSCB: 256 + LSCMXSA: 8 + LSCMXSB: 8 + LSPA: 32 + LSPB: 32 + LSPMXSA: 128 + LSPMXSB: 128 + LVCA: 8 + LVCB: 8 + LVCMXSA: 2 + LVCMXSB: 2 + LVPA: 1 + LVPB: 1 + LVPMXSA: 32 + LVPMXSB: 32 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsBlockSizePerPadMXSA: 0 + LdsBlockSizePerPadMXSB: 0 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 117760 + LdsInitCVgprs: false + LdsNumBytes: 117760 + LdsNumElementsAlignedA: 16384 + LdsNumElementsAlignedB: 32768 + LdsNumElementsAlignedMXSA: 1024 + LdsNumElementsAlignedMXSB: 2048 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 19456 + LdsOffsetB_Blk: 84992 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMXSA: 16384 + LdsOffsetMXSA_Blk: 81920 + LdsOffsetMXSB: 17408 + LdsOffsetMXSB_Blk: 82944 + LdsOffsetMetadata: 19456 + LdsOffsetMetadata_Blk: 84992 + LdsPadA: 0 + LdsPadB: 0 + LdsPadMXSA: 0 + LdsPadMXSB: 0 + LdsPadMetadata: 0 + LocalReadVectorWidth: -1 + LocalReadVectorWidthA: 32 + LocalReadVectorWidthB: 32 + LocalReadVectorWidthMXSA: 1 + LocalReadVectorWidthMXSB: 1 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LocalWriteUseSgprMXSA: true + LocalWriteUseSgprMXSB: true + LoopIters: 2 + LoopUnroll: 256 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + 
MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMXSA: 1 + MIInputPerThreadMXSB: 1 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 8] + MIWaveTileA: 4 + MIWaveTileB: 8 + MIWaveTileMXSA: 4 + MIWaveTileMXSB: 8 + MIWaveTileMetadata: 0 + MacroTile0: 128 + MacroTile1: 256 + MacroTileA: 128 + MacroTileB: 256 + MacroTileMXSA: 128 + MacroTileMXSB: 256 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + MinGRIncPerMfma: 3 + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonDTLTailLoopMXSA: true + NonDTLTailLoopMXSB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 4 + NonTemporalC: 0 + NonTemporalD: 4 + NonTemporalE: 0 + NonTemporalMXSA: 0 + NonTemporalMXSB: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 128 + NumLdsBlk: 2 + NumLoadsA: 4 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsCoalescedMXSA: 1 + NumLoadsCoalescedMXSB: 1 + NumLoadsMXSA: 1 + NumLoadsMXSB: 2 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 8 + NumLoadsPerpendicularMXSA: 1 + NumLoadsPerpendicularMXSB: 2 + NumThreads: 256 + NumTotalPackedLoadsA: -1 + NumTotalPackedLoadsB: -1 + NumTotalPackedLoadsMXSA: -1 + NumTotalPackedLoadsMXSB: -1 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGROverBarrier: 0 + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 12 + 
SolutionNameMin: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT128x256x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT4_8_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM16_WGMXCC2_WGMXCCGn1 + SourceSwap: false + SpaceFillingAlgo: [] + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 4 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SubGroupMXSA: 8 + SubGroupMXSB: 32 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: 0 + TailloopInNll: false + ThreadTile: [1, 1] + ThreadTile0: 16 + ThreadTile1: 8 + ThreadTileA: 16 + ThreadTileB: 8 + ThreadTileMXSA: 16 + ThreadTileMXSB: 8 + TransposeLDS: 1 + TransposeLDSMetadata: -1 + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMXSA: true + UnrollMajorLDSMXSB: true + Use64bShadowLimit: false + Use64bShadowLimitMX: false + UseCustomMainLoopSchedule: 0 + UseDirect32XEmulation: false + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: false + UseGeneralizedNLCOneA: false + UseGeneralizedNLCOneB: false + UseGeneralizedNLCOneMXSA: false + UseGeneralizedNLCOneMXSB: false + UseGeneralizedNLCOneMetadata: false + UseInstOffsetForGRO: 0 + UseMFMAF32XEmulation: false + UsePLRPack: 0 + UseSgprForGRO: -1 + UseSubtileImpl: true + Valid: true + VectorStore: -1 + VectorWidthA: 1 + 
VectorWidthB: 1 + VectorWidthMXSA: 1 + VectorWidthMXSB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMXSA: 0 + WaveSeparateGlobalReadMXSB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 16 + WorkGroupMappingXCC: 2 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 256 + _DepthUA: 256 + _DepthUB: 256 + _DepthUMXSA: 8 + _DepthUMXSB: 8 + _DepthUMetadata: 256 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + enableLDSTrMXSA: false + enableLDSTrMXSB: false + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + tailLoopOptMXSA: false + tailLoopOptMXSB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AdaptiveGemm: 0 + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1DivByMT1LowbitGT1: 0 + AssertFree1ElementMultiple: 1 + AssertKRingShiftTailWrapOnly: 0 + AssertSummationElementMultiple: 256 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BAddrInterleave: false + BaseName: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT25s2SqPqjqQTrutnl4Fjsck3utFxYLnrt2GxRGoMabbq0= + BufferLoad: true + BufferStore: 1 + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 256 + DirectToLds: 1 + DirectToLdsA: true + DirectToLdsB: true + DirectToLdsMXSA: true + DirectToLdsMXSB: true + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprMXSA: false + DirectToVgprMXSB: false + DirectToVgprSparseMetadata: false + 
DtlPlusLdsBuf: 0 + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: false + ExpertSchedulingMode: 0 + ExtraLatencyForLR: 0 + ExtraMiLatencyLeft: -1 + ForceDisableShadowInit: false + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 32 + GlobalReadVectorWidthB: 32 + GlobalReadVectorWidthMXSA: 4 + GlobalReadVectorWidthMXSB: 4 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + KRingShift: false + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT256x128x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT8_4_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 256 + LSCB: 256 + LSCMXSA: 8 + LSCMXSB: 8 + LSPA: 32 + LSPB: 32 + LSPMXSA: 128 + LSPMXSB: 128 + LVCA: 8 + LVCB: 8 + LVCMXSA: 2 + LVCMXSB: 2 + LVPA: 1 + LVPB: 1 + LVPMXSA: 32 + LVPMXSB: 32 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsBlockSizePerPadMXSA: 0 + LdsBlockSizePerPadMXSB: 0 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 117760 + LdsInitCVgprs: false + LdsNumBytes: 117760 + 
LdsNumElementsAlignedA: 32768 + LdsNumElementsAlignedB: 16384 + LdsNumElementsAlignedMXSA: 2048 + LdsNumElementsAlignedMXSB: 1024 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 35840 + LdsOffsetB_Blk: 101376 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMXSA: 32768 + LdsOffsetMXSA_Blk: 98304 + LdsOffsetMXSB: 34816 + LdsOffsetMXSB_Blk: 100352 + LdsOffsetMetadata: 35840 + LdsOffsetMetadata_Blk: 101376 + LdsPadA: 0 + LdsPadB: 0 + LdsPadMXSA: 0 + LdsPadMXSB: 0 + LdsPadMetadata: 0 + LocalReadVectorWidth: -1 + LocalReadVectorWidthA: 32 + LocalReadVectorWidthB: 32 + LocalReadVectorWidthMXSA: 1 + LocalReadVectorWidthMXSB: 1 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LocalWriteUseSgprMXSA: true + LocalWriteUseSgprMXSB: true + LoopIters: 2 + LoopUnroll: 256 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMXSA: 1 + MIInputPerThreadMXSB: 1 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [8, 4] + MIWaveTileA: 8 + MIWaveTileB: 4 + MIWaveTileMXSA: 8 + MIWaveTileMXSB: 4 + MIWaveTileMetadata: 0 + MacroTile0: 256 + MacroTile1: 128 + MacroTileA: 256 + MacroTileB: 128 + MacroTileMXSA: 256 + MacroTileMXSB: 128 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + MinGRIncPerMfma: 3 + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonDTLTailLoopMXSA: true + NonDTLTailLoopMXSB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 4 + NonTemporalC: 0 + NonTemporalD: 4 + 
NonTemporalE: 0 + NonTemporalMXSA: 0 + NonTemporalMXSB: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 128 + NumLdsBlk: 2 + NumLoadsA: 8 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsCoalescedMXSA: 1 + NumLoadsCoalescedMXSB: 1 + NumLoadsMXSA: 2 + NumLoadsMXSB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 4 + NumLoadsPerpendicularMXSA: 2 + NumLoadsPerpendicularMXSB: 1 + NumThreads: 256 + NumTotalPackedLoadsA: -1 + NumTotalPackedLoadsB: -1 + NumTotalPackedLoadsMXSA: -1 + NumTotalPackedLoadsMXSB: -1 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGROverBarrier: 0 + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 13 + SolutionNameMin: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT256x128x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT8_4_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM16_WGMXCC2_WGMXCCGn1 + SourceSwap: false + SpaceFillingAlgo: [] + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 4 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 
32 + SubGroupMXSA: 8 + SubGroupMXSB: 32 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: 0 + TailloopInNll: false + ThreadTile: [1, 1] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + ThreadTileMXSA: 32 + ThreadTileMXSB: 4 + TransposeLDS: 1 + TransposeLDSMetadata: -1 + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMXSA: true + UnrollMajorLDSMXSB: true + Use64bShadowLimit: false + Use64bShadowLimitMX: false + UseCustomMainLoopSchedule: 0 + UseDirect32XEmulation: false + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: false + UseGeneralizedNLCOneA: false + UseGeneralizedNLCOneB: false + UseGeneralizedNLCOneMXSA: false + UseGeneralizedNLCOneMXSB: false + UseGeneralizedNLCOneMetadata: false + UseInstOffsetForGRO: 0 + UseMFMAF32XEmulation: false + UsePLRPack: 0 + UseSgprForGRO: -1 + UseSubtileImpl: true + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 1 + VectorWidthMXSA: 1 + VectorWidthMXSB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMXSA: 0 + WaveSeparateGlobalReadMXSB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 16 + WorkGroupMappingXCC: 2 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 256 + _DepthUA: 256 + _DepthUB: 256 + _DepthUMXSA: 8 + _DepthUMXSB: 8 + _DepthUMetadata: 256 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + enableLDSTrMXSA: false + enableLDSTrMXSB: false + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + tailLoopOptMXSA: false + tailLoopOptMXSB: false + - 1LDSBuffer: 0 + 
ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AdaptiveGemm: 0 + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1DivByMT1LowbitGT1: 0 + AssertFree1ElementMultiple: 1 + AssertKRingShiftTailWrapOnly: 0 + AssertSummationElementMultiple: 256 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BAddrInterleave: false + BaseName: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT19--ULatdGkaEyMxugPoO0m7KB92W8g0H0S_IE47lq-mE= + BufferLoad: true + BufferStore: 1 + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 256 + DirectToLds: 1 + DirectToLdsA: true + DirectToLdsB: true + DirectToLdsMXSA: true + DirectToLdsMXSB: true + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprMXSA: false + DirectToVgprMXSB: false + DirectToVgprSparseMetadata: false + DtlPlusLdsBuf: 0 + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: false + ExpertSchedulingMode: 0 + ExtraLatencyForLR: 0 + ExtraMiLatencyLeft: -1 + ForceDisableShadowInit: false + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 32 + GlobalReadVectorWidthB: 32 + GlobalReadVectorWidthMXSA: 6 + GlobalReadVectorWidthMXSB: 4 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + KRingShift: false + Kernel: true + KernelLanguage: Assembly + KernelNameMin: 
Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT192x256x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT6_8_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 256 + LSCB: 256 + LSCMXSA: 8 + LSCMXSB: 8 + LSPA: 32 + LSPB: 32 + LSPMXSA: 192 + LSPMXSB: 128 + LVCA: 8 + LVCB: 8 + LVCMXSA: 2 + LVCMXSB: 2 + LVPA: 1 + LVPB: 1 + LVPMXSA: 32 + LVPMXSB: 32 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsBlockSizePerPadMXSA: 0 + LdsBlockSizePerPadMXSB: 0 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 126464 + LdsInitCVgprs: false + LdsNumBytes: 126464 + LdsNumElementsAlignedA: 24576 + LdsNumElementsAlignedB: 32768 + LdsNumElementsAlignedMXSA: 1536 + LdsNumElementsAlignedMXSB: 2048 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 28160 + LdsOffsetB_Blk: 93696 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMXSA: 24576 + LdsOffsetMXSA_Blk: 90112 + LdsOffsetMXSB: 26112 + LdsOffsetMXSB_Blk: 91648 + LdsOffsetMetadata: 28160 + LdsOffsetMetadata_Blk: 93696 + LdsPadA: 0 + LdsPadB: 0 + LdsPadMXSA: 0 + LdsPadMXSB: 0 + LdsPadMetadata: 0 + LocalReadVectorWidth: -1 + LocalReadVectorWidthA: 32 + LocalReadVectorWidthB: 32 + LocalReadVectorWidthMXSA: 1 + LocalReadVectorWidthMXSB: 1 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LocalWriteUseSgprMXSA: true + LocalWriteUseSgprMXSB: true + LoopIters: 2 + LoopUnroll: 256 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + 
MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMXSA: 1 + MIInputPerThreadMXSB: 1 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [6, 8] + MIWaveTileA: 6 + MIWaveTileB: 8 + MIWaveTileMXSA: 6 + MIWaveTileMXSB: 8 + MIWaveTileMetadata: 0 + MacroTile0: 192 + MacroTile1: 256 + MacroTileA: 192 + MacroTileB: 256 + MacroTileMXSA: 192 + MacroTileMXSB: 256 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + MinGRIncPerMfma: 3 + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonDTLTailLoopMXSA: true + NonDTLTailLoopMXSB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 4 + NonTemporalC: 0 + NonTemporalD: 4 + NonTemporalE: 0 + NonTemporalMXSA: 0 + NonTemporalMXSB: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 192 + NumGlobalWriteVectorsPerThread: 192 + NumLdsBlk: 2 + NumLoadsA: 6 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsCoalescedMXSA: 1 + NumLoadsCoalescedMXSB: 1 + NumLoadsMXSA: 1 + NumLoadsMXSB: 2 + NumLoadsPerpendicularA: 6 + NumLoadsPerpendicularB: 8 + NumLoadsPerpendicularMXSA: 1 + NumLoadsPerpendicularMXSB: 2 + NumThreads: 256 + NumTotalPackedLoadsA: -1 + NumTotalPackedLoadsB: -1 + NumTotalPackedLoadsMXSA: -1 + NumTotalPackedLoadsMXSB: -1 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGROverBarrier: 0 + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 14 + 
SolutionNameMin: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT192x256x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT6_8_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM16_WGMXCC2_WGMXCCGn1 + SourceSwap: false + SpaceFillingAlgo: [] + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 4 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SubGroupMXSA: 8 + SubGroupMXSB: 32 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: 0 + TailloopInNll: false + ThreadTile: [1, 1] + ThreadTile0: 24 + ThreadTile1: 8 + ThreadTileA: 24 + ThreadTileB: 8 + ThreadTileMXSA: 24 + ThreadTileMXSB: 8 + TransposeLDS: 1 + TransposeLDSMetadata: -1 + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMXSA: true + UnrollMajorLDSMXSB: true + Use64bShadowLimit: false + Use64bShadowLimitMX: false + UseCustomMainLoopSchedule: 0 + UseDirect32XEmulation: false + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: false + UseGeneralizedNLCOneA: false + UseGeneralizedNLCOneB: false + UseGeneralizedNLCOneMXSA: false + UseGeneralizedNLCOneMXSB: false + UseGeneralizedNLCOneMetadata: false + UseInstOffsetForGRO: 0 + UseMFMAF32XEmulation: false + UsePLRPack: 0 + UseSgprForGRO: -1 + UseSubtileImpl: true + Valid: true + VectorStore: -1 + VectorWidthA: 1 + 
VectorWidthB: 1 + VectorWidthMXSA: 1 + VectorWidthMXSB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMXSA: 0 + WaveSeparateGlobalReadMXSB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 16 + WorkGroupMappingXCC: 2 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 256 + _DepthUA: 256 + _DepthUB: 256 + _DepthUMXSA: 8 + _DepthUMXSB: 8 + _DepthUMetadata: 256 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + enableLDSTrMXSA: false + enableLDSTrMXSB: false + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + tailLoopOptMXSA: false + tailLoopOptMXSB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AdaptiveGemm: 0 + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1DivByMT1LowbitGT1: 0 + AssertFree1ElementMultiple: 1 + AssertKRingShiftTailWrapOnly: 0 + AssertSummationElementMultiple: 256 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BAddrInterleave: false + BaseName: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT25H-6yQcCGxSKwn5gmGJAHA93oK-ekIRJm5JqDOya5ASk= + BufferLoad: true + BufferStore: 1 + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 256 + DirectToLds: 1 + DirectToLdsA: true + DirectToLdsB: true + DirectToLdsMXSA: true + DirectToLdsMXSB: true + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprMXSA: false + DirectToVgprMXSB: false + DirectToVgprSparseMetadata: false + 
DtlPlusLdsBuf: 0 + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: false + ExpertSchedulingMode: 0 + ExtraLatencyForLR: 0 + ExtraMiLatencyLeft: -1 + ForceDisableShadowInit: false + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 32 + GlobalReadVectorWidthB: 32 + GlobalReadVectorWidthMXSA: 4 + GlobalReadVectorWidthMXSB: 6 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + KRingShift: false + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT256x192x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT8_6_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 256 + LSCB: 256 + LSCMXSA: 8 + LSCMXSB: 8 + LSPA: 32 + LSPB: 32 + LSPMXSA: 128 + LSPMXSB: 192 + LVCA: 8 + LVCB: 8 + LVCMXSA: 2 + LVCMXSB: 2 + LVPA: 1 + LVPB: 1 + LVPMXSA: 32 + LVPMXSB: 32 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsBlockSizePerPadMXSA: 0 + LdsBlockSizePerPadMXSB: 0 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 126464 + LdsInitCVgprs: false + LdsNumBytes: 126464 + 
LdsNumElementsAlignedA: 32768 + LdsNumElementsAlignedB: 24576 + LdsNumElementsAlignedMXSA: 2048 + LdsNumElementsAlignedMXSB: 1536 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 36352 + LdsOffsetB_Blk: 101888 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMXSA: 32768 + LdsOffsetMXSA_Blk: 98304 + LdsOffsetMXSB: 34816 + LdsOffsetMXSB_Blk: 100352 + LdsOffsetMetadata: 36352 + LdsOffsetMetadata_Blk: 101888 + LdsPadA: 0 + LdsPadB: 0 + LdsPadMXSA: 0 + LdsPadMXSB: 0 + LdsPadMetadata: 0 + LocalReadVectorWidth: -1 + LocalReadVectorWidthA: 32 + LocalReadVectorWidthB: 32 + LocalReadVectorWidthMXSA: 1 + LocalReadVectorWidthMXSB: 1 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LocalWriteUseSgprMXSA: true + LocalWriteUseSgprMXSB: true + LoopIters: 2 + LoopUnroll: 256 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMXSA: 1 + MIInputPerThreadMXSB: 1 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [8, 6] + MIWaveTileA: 8 + MIWaveTileB: 6 + MIWaveTileMXSA: 8 + MIWaveTileMXSB: 6 + MIWaveTileMetadata: 0 + MacroTile0: 256 + MacroTile1: 192 + MacroTileA: 256 + MacroTileB: 192 + MacroTileMXSA: 256 + MacroTileMXSB: 192 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + MinGRIncPerMfma: 3 + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonDTLTailLoopMXSA: true + NonDTLTailLoopMXSB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 4 + NonTemporalC: 0 + NonTemporalD: 4 + 
NonTemporalE: 0 + NonTemporalMXSA: 0 + NonTemporalMXSB: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 192 + NumGlobalWriteVectorsPerThread: 192 + NumLdsBlk: 2 + NumLoadsA: 8 + NumLoadsB: 6 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsCoalescedMXSA: 1 + NumLoadsCoalescedMXSB: 1 + NumLoadsMXSA: 2 + NumLoadsMXSB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 6 + NumLoadsPerpendicularMXSA: 2 + NumLoadsPerpendicularMXSB: 1 + NumThreads: 256 + NumTotalPackedLoadsA: -1 + NumTotalPackedLoadsB: -1 + NumTotalPackedLoadsMXSA: -1 + NumTotalPackedLoadsMXSB: -1 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGROverBarrier: 0 + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 15 + SolutionNameMin: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT256x192x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT8_6_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM16_WGMXCC2_WGMXCCGn1 + SourceSwap: false + SpaceFillingAlgo: [] + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 4 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 
32 + SubGroupMXSA: 8 + SubGroupMXSB: 32 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: 0 + TailloopInNll: false + ThreadTile: [1, 1] + ThreadTile0: 32 + ThreadTile1: 6 + ThreadTileA: 32 + ThreadTileB: 6 + ThreadTileMXSA: 32 + ThreadTileMXSB: 6 + TransposeLDS: 1 + TransposeLDSMetadata: -1 + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMXSA: true + UnrollMajorLDSMXSB: true + Use64bShadowLimit: false + Use64bShadowLimitMX: false + UseCustomMainLoopSchedule: 0 + UseDirect32XEmulation: false + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: false + UseGeneralizedNLCOneA: false + UseGeneralizedNLCOneB: false + UseGeneralizedNLCOneMXSA: false + UseGeneralizedNLCOneMXSB: false + UseGeneralizedNLCOneMetadata: false + UseInstOffsetForGRO: 0 + UseMFMAF32XEmulation: false + UsePLRPack: 0 + UseSgprForGRO: -1 + UseSubtileImpl: true + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 1 + VectorWidthMXSA: 1 + VectorWidthMXSB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMXSA: 0 + WaveSeparateGlobalReadMXSB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 16 + WorkGroupMappingXCC: 2 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 256 + _DepthUA: 256 + _DepthUB: 256 + _DepthUMXSA: 8 + _DepthUMXSB: 8 + _DepthUMetadata: 256 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + enableLDSTrMXSA: false + enableLDSTrMXSB: false + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + tailLoopOptMXSA: false + tailLoopOptMXSB: false + - 1LDSBuffer: 0 + 
ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AdaptiveGemm: 0 + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1DivByMT1LowbitGT1: 0 + AssertFree1ElementMultiple: 1 + AssertKRingShiftTailWrapOnly: 0 + AssertSummationElementMultiple: 256 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BAddrInterleave: false + BaseName: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT649Xx58paO0aNamHRyhpgRWxjDG0kBNJ5sAnNiMy2USyI= + BufferLoad: true + BufferStore: 1 + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 256 + DirectToLds: 1 + DirectToLdsA: true + DirectToLdsB: true + DirectToLdsMXSA: true + DirectToLdsMXSB: true + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprMXSA: false + DirectToVgprMXSB: false + DirectToVgprSparseMetadata: false + DtlPlusLdsBuf: 0 + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: false + ExpertSchedulingMode: 0 + ExtraLatencyForLR: 0 + ExtraMiLatencyLeft: -1 + ForceDisableShadowInit: false + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 32 + GlobalReadVectorWidthB: 32 + GlobalReadVectorWidthMXSA: 2 + GlobalReadVectorWidthMXSB: 10 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + KRingShift: false + Kernel: true + KernelLanguage: Assembly + KernelNameMin: 
Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT64x320x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT2_10_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 256 + LSCB: 256 + LSCMXSA: 8 + LSCMXSB: 8 + LSPA: 32 + LSPB: 32 + LSPMXSA: 64 + LSPMXSB: 320 + LVCA: 8 + LVCB: 8 + LVCMXSA: 4 + LVCMXSB: 1 + LVPA: 1 + LVPB: 1 + LVPMXSA: 32 + LVPMXSB: 32 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsBlockSizePerPadMXSA: 0 + LdsBlockSizePerPadMXSB: 0 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 117760 + LdsInitCVgprs: false + LdsNumBytes: 117760 + LdsNumElementsAlignedA: 8192 + LdsNumElementsAlignedB: 40960 + LdsNumElementsAlignedMXSA: 512 + LdsNumElementsAlignedMXSB: 2560 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 11264 + LdsOffsetB_Blk: 76800 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMXSA: 8192 + LdsOffsetMXSA_Blk: 73728 + LdsOffsetMXSB: 8704 + LdsOffsetMXSB_Blk: 74240 + LdsOffsetMetadata: 11264 + LdsOffsetMetadata_Blk: 76800 + LdsPadA: 0 + LdsPadB: 0 + LdsPadMXSA: 0 + LdsPadMXSB: 0 + LdsPadMetadata: 0 + LocalReadVectorWidth: -1 + LocalReadVectorWidthA: 32 + LocalReadVectorWidthB: 32 + LocalReadVectorWidthMXSA: 1 + LocalReadVectorWidthMXSB: 1 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LocalWriteUseSgprMXSA: true + LocalWriteUseSgprMXSB: true + LoopIters: 2 + LoopUnroll: 256 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + 
MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMXSA: 1 + MIInputPerThreadMXSB: 1 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [2, 10] + MIWaveTileA: 2 + MIWaveTileB: 10 + MIWaveTileMXSA: 2 + MIWaveTileMXSB: 10 + MIWaveTileMetadata: 0 + MacroTile0: 64 + MacroTile1: 320 + MacroTileA: 64 + MacroTileB: 320 + MacroTileMXSA: 64 + MacroTileMXSB: 320 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + MinGRIncPerMfma: 3 + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonDTLTailLoopMXSA: true + NonDTLTailLoopMXSB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 4 + NonTemporalC: 0 + NonTemporalD: 4 + NonTemporalE: 0 + NonTemporalMXSA: 0 + NonTemporalMXSB: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 80 + NumGlobalWriteVectorsPerThread: 80 + NumLdsBlk: 2 + NumLoadsA: 2 + NumLoadsB: 10 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsCoalescedMXSA: 1 + NumLoadsCoalescedMXSB: 1 + NumLoadsMXSA: 1 + NumLoadsMXSB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 10 + NumLoadsPerpendicularMXSA: 1 + NumLoadsPerpendicularMXSB: 1 + NumThreads: 256 + NumTotalPackedLoadsA: -1 + NumTotalPackedLoadsB: -1 + NumTotalPackedLoadsMXSA: -1 + NumTotalPackedLoadsMXSB: -1 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGROverBarrier: 0 + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 16 + 
SolutionNameMin: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT64x320x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT2_10_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM16_WGMXCC2_WGMXCCGn1 + SourceSwap: false + SpaceFillingAlgo: [] + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 4 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SubGroupMXSA: 8 + SubGroupMXSB: 32 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: 0 + TailloopInNll: false + ThreadTile: [1, 1] + ThreadTile0: 8 + ThreadTile1: 10 + ThreadTileA: 8 + ThreadTileB: 10 + ThreadTileMXSA: 8 + ThreadTileMXSB: 10 + TransposeLDS: 1 + TransposeLDSMetadata: -1 + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMXSA: true + UnrollMajorLDSMXSB: true + Use64bShadowLimit: false + Use64bShadowLimitMX: false + UseCustomMainLoopSchedule: 0 + UseDirect32XEmulation: false + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: false + UseGeneralizedNLCOneA: false + UseGeneralizedNLCOneB: false + UseGeneralizedNLCOneMXSA: false + UseGeneralizedNLCOneMXSB: false + UseGeneralizedNLCOneMetadata: false + UseInstOffsetForGRO: 0 + UseMFMAF32XEmulation: false + UsePLRPack: 0 + UseSgprForGRO: -1 + UseSubtileImpl: true + Valid: true + VectorStore: -1 + VectorWidthA: 1 + 
VectorWidthB: 1 + VectorWidthMXSA: 1 + VectorWidthMXSB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMXSA: 0 + WaveSeparateGlobalReadMXSB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 16 + WorkGroupMappingXCC: 2 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 256 + _DepthUA: 256 + _DepthUB: 256 + _DepthUMXSA: 8 + _DepthUMXSB: 8 + _DepthUMetadata: 256 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + enableLDSTrMXSA: false + enableLDSTrMXSB: false + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + tailLoopOptMXSA: false + tailLoopOptMXSB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AdaptiveGemm: 0 + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1DivByMT1LowbitGT1: 0 + AssertFree1ElementMultiple: 1 + AssertKRingShiftTailWrapOnly: 0 + AssertSummationElementMultiple: 256 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BAddrInterleave: false + BaseName: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT32yn2lSZhgGG39-O0tN9XB-c6Nr5kcbdLKPNrKxqSEhVE= + BufferLoad: true + BufferStore: 1 + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 256 + DirectToLds: 1 + DirectToLdsA: true + DirectToLdsB: true + DirectToLdsMXSA: true + DirectToLdsMXSB: true + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprMXSA: false + DirectToVgprMXSB: false + DirectToVgprSparseMetadata: false + 
DtlPlusLdsBuf: 0 + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: false + ExpertSchedulingMode: 0 + ExtraLatencyForLR: 0 + ExtraMiLatencyLeft: -1 + ForceDisableShadowInit: false + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 32 + GlobalReadVectorWidthB: 32 + GlobalReadVectorWidthMXSA: 10 + GlobalReadVectorWidthMXSB: 2 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + KRingShift: false + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT320x64x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT10_2_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 256 + LSCB: 256 + LSCMXSA: 8 + LSCMXSB: 8 + LSPA: 32 + LSPB: 32 + LSPMXSA: 320 + LSPMXSB: 64 + LVCA: 8 + LVCB: 8 + LVCMXSA: 1 + LVCMXSB: 4 + LVPA: 1 + LVPB: 1 + LVPMXSA: 32 + LVPMXSB: 32 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsBlockSizePerPadMXSA: 0 + LdsBlockSizePerPadMXSB: 0 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 117760 + LdsInitCVgprs: false + LdsNumBytes: 117760 + 
LdsNumElementsAlignedA: 40960 + LdsNumElementsAlignedB: 8192 + LdsNumElementsAlignedMXSA: 2560 + LdsNumElementsAlignedMXSB: 512 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 44032 + LdsOffsetB_Blk: 109568 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMXSA: 40960 + LdsOffsetMXSA_Blk: 106496 + LdsOffsetMXSB: 43520 + LdsOffsetMXSB_Blk: 109056 + LdsOffsetMetadata: 44032 + LdsOffsetMetadata_Blk: 109568 + LdsPadA: 0 + LdsPadB: 0 + LdsPadMXSA: 0 + LdsPadMXSB: 0 + LdsPadMetadata: 0 + LocalReadVectorWidth: -1 + LocalReadVectorWidthA: 32 + LocalReadVectorWidthB: 32 + LocalReadVectorWidthMXSA: 1 + LocalReadVectorWidthMXSB: 1 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LocalWriteUseSgprMXSA: true + LocalWriteUseSgprMXSB: true + LoopIters: 2 + LoopUnroll: 256 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMXSA: 1 + MIInputPerThreadMXSB: 1 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [10, 2] + MIWaveTileA: 10 + MIWaveTileB: 2 + MIWaveTileMXSA: 10 + MIWaveTileMXSB: 2 + MIWaveTileMetadata: 0 + MacroTile0: 320 + MacroTile1: 64 + MacroTileA: 320 + MacroTileB: 64 + MacroTileMXSA: 320 + MacroTileMXSB: 64 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + MinGRIncPerMfma: 3 + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonDTLTailLoopMXSA: true + NonDTLTailLoopMXSB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 4 + NonTemporalC: 0 + NonTemporalD: 4 + 
NonTemporalE: 0 + NonTemporalMXSA: 0 + NonTemporalMXSB: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 80 + NumGlobalWriteVectorsPerThread: 80 + NumLdsBlk: 2 + NumLoadsA: 10 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsCoalescedMXSA: 1 + NumLoadsCoalescedMXSB: 1 + NumLoadsMXSA: 1 + NumLoadsMXSB: 1 + NumLoadsPerpendicularA: 10 + NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularMXSA: 1 + NumLoadsPerpendicularMXSB: 1 + NumThreads: 256 + NumTotalPackedLoadsA: -1 + NumTotalPackedLoadsB: -1 + NumTotalPackedLoadsMXSA: -1 + NumTotalPackedLoadsMXSB: -1 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGROverBarrier: 0 + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 17 + SolutionNameMin: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT320x64x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT10_2_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM16_WGMXCC2_WGMXCCGn1 + SourceSwap: false + SpaceFillingAlgo: [] + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 4 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 
32 + SubGroupMXSA: 8 + SubGroupMXSB: 32 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: 0 + TailloopInNll: false + ThreadTile: [1, 1] + ThreadTile0: 40 + ThreadTile1: 2 + ThreadTileA: 40 + ThreadTileB: 2 + ThreadTileMXSA: 40 + ThreadTileMXSB: 2 + TransposeLDS: 1 + TransposeLDSMetadata: -1 + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMXSA: true + UnrollMajorLDSMXSB: true + Use64bShadowLimit: false + Use64bShadowLimitMX: false + UseCustomMainLoopSchedule: 0 + UseDirect32XEmulation: false + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: false + UseGeneralizedNLCOneA: false + UseGeneralizedNLCOneB: false + UseGeneralizedNLCOneMXSA: false + UseGeneralizedNLCOneMXSB: false + UseGeneralizedNLCOneMetadata: false + UseInstOffsetForGRO: 0 + UseMFMAF32XEmulation: false + UsePLRPack: 0 + UseSgprForGRO: -1 + UseSubtileImpl: true + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 1 + VectorWidthMXSA: 1 + VectorWidthMXSB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMXSA: 0 + WaveSeparateGlobalReadMXSB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 16 + WorkGroupMappingXCC: 2 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 256 + _DepthUA: 256 + _DepthUB: 256 + _DepthUMXSA: 8 + _DepthUMXSB: 8 + _DepthUMetadata: 256 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + enableLDSTrMXSA: false + enableLDSTrMXSB: false + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + tailLoopOptMXSA: false + tailLoopOptMXSB: false + - 1LDSBuffer: 0 + 
ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AdaptiveGemm: 0 + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1DivByMT1LowbitGT1: 0 + AssertFree1ElementMultiple: 1 + AssertKRingShiftTailWrapOnly: 0 + AssertSummationElementMultiple: 256 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BAddrInterleave: false + BaseName: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT64cx8jpYzIAVvV24Gqg4hoX8QdixxbIDmUlvaleHLuajs= + BufferLoad: true + BufferStore: 1 + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 256 + DirectToLds: 1 + DirectToLdsA: true + DirectToLdsB: true + DirectToLdsMXSA: true + DirectToLdsMXSB: true + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprMXSA: false + DirectToVgprMXSB: false + DirectToVgprSparseMetadata: false + DtlPlusLdsBuf: 0 + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: false + ExpertSchedulingMode: 0 + ExtraLatencyForLR: 0 + ExtraMiLatencyLeft: -1 + ForceDisableShadowInit: false + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 32 + GlobalReadVectorWidthB: 32 + GlobalReadVectorWidthMXSA: 2 + GlobalReadVectorWidthMXSB: 12 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + KRingShift: false + Kernel: true + KernelLanguage: Assembly + KernelNameMin: 
Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT64x384x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT2_12_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 256 + LSCB: 256 + LSCMXSA: 8 + LSCMXSB: 8 + LSPA: 32 + LSPB: 32 + LSPMXSA: 64 + LSPMXSB: 384 + LVCA: 8 + LVCB: 8 + LVCMXSA: 4 + LVCMXSB: 1 + LVPA: 1 + LVPB: 1 + LVPMXSA: 32 + LVPMXSB: 32 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsBlockSizePerPadMXSA: 0 + LdsBlockSizePerPadMXSB: 0 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 126464 + LdsInitCVgprs: false + LdsNumBytes: 126464 + LdsNumElementsAlignedA: 8192 + LdsNumElementsAlignedB: 49152 + LdsNumElementsAlignedMXSA: 512 + LdsNumElementsAlignedMXSB: 3072 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 11776 + LdsOffsetB_Blk: 77312 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMXSA: 8192 + LdsOffsetMXSA_Blk: 73728 + LdsOffsetMXSB: 8704 + LdsOffsetMXSB_Blk: 74240 + LdsOffsetMetadata: 11776 + LdsOffsetMetadata_Blk: 77312 + LdsPadA: 0 + LdsPadB: 0 + LdsPadMXSA: 0 + LdsPadMXSB: 0 + LdsPadMetadata: 0 + LocalReadVectorWidth: -1 + LocalReadVectorWidthA: 32 + LocalReadVectorWidthB: 32 + LocalReadVectorWidthMXSA: 1 + LocalReadVectorWidthMXSB: 1 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LocalWriteUseSgprMXSA: true + LocalWriteUseSgprMXSB: true + LoopIters: 2 + LoopUnroll: 256 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + 
MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMXSA: 1 + MIInputPerThreadMXSB: 1 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [2, 12] + MIWaveTileA: 2 + MIWaveTileB: 12 + MIWaveTileMXSA: 2 + MIWaveTileMXSB: 12 + MIWaveTileMetadata: 0 + MacroTile0: 64 + MacroTile1: 384 + MacroTileA: 64 + MacroTileB: 384 + MacroTileMXSA: 64 + MacroTileMXSB: 384 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + MinGRIncPerMfma: 3 + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonDTLTailLoopMXSA: true + NonDTLTailLoopMXSB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 4 + NonTemporalC: 0 + NonTemporalD: 4 + NonTemporalE: 0 + NonTemporalMXSA: 0 + NonTemporalMXSB: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 96 + NumGlobalWriteVectorsPerThread: 96 + NumLdsBlk: 2 + NumLoadsA: 2 + NumLoadsB: 12 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsCoalescedMXSA: 1 + NumLoadsCoalescedMXSB: 1 + NumLoadsMXSA: 1 + NumLoadsMXSB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 12 + NumLoadsPerpendicularMXSA: 1 + NumLoadsPerpendicularMXSB: 1 + NumThreads: 256 + NumTotalPackedLoadsA: -1 + NumTotalPackedLoadsB: -1 + NumTotalPackedLoadsMXSA: -1 + NumTotalPackedLoadsMXSB: -1 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGROverBarrier: 0 + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 18 + 
SolutionNameMin: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT64x384x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT2_12_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM16_WGMXCC2_WGMXCCGn1 + SourceSwap: false + SpaceFillingAlgo: [] + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 4 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SubGroupMXSA: 8 + SubGroupMXSB: 32 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: 0 + TailloopInNll: false + ThreadTile: [1, 1] + ThreadTile0: 8 + ThreadTile1: 12 + ThreadTileA: 8 + ThreadTileB: 12 + ThreadTileMXSA: 8 + ThreadTileMXSB: 12 + TransposeLDS: 1 + TransposeLDSMetadata: -1 + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMXSA: true + UnrollMajorLDSMXSB: true + Use64bShadowLimit: false + Use64bShadowLimitMX: false + UseCustomMainLoopSchedule: 0 + UseDirect32XEmulation: false + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: false + UseGeneralizedNLCOneA: false + UseGeneralizedNLCOneB: false + UseGeneralizedNLCOneMXSA: false + UseGeneralizedNLCOneMXSB: false + UseGeneralizedNLCOneMetadata: false + UseInstOffsetForGRO: 0 + UseMFMAF32XEmulation: false + UsePLRPack: 0 + UseSgprForGRO: -1 + UseSubtileImpl: true + Valid: true + VectorStore: -1 + VectorWidthA: 1 + 
VectorWidthB: 1 + VectorWidthMXSA: 1 + VectorWidthMXSB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMXSA: 0 + WaveSeparateGlobalReadMXSB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 16 + WorkGroupMappingXCC: 2 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 256 + _DepthUA: 256 + _DepthUB: 256 + _DepthUMXSA: 8 + _DepthUMXSB: 8 + _DepthUMetadata: 256 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + enableLDSTrMXSA: false + enableLDSTrMXSB: false + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + tailLoopOptMXSA: false + tailLoopOptMXSB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AdaptiveGemm: 0 + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1DivByMT1LowbitGT1: 0 + AssertFree1ElementMultiple: 1 + AssertKRingShiftTailWrapOnly: 0 + AssertSummationElementMultiple: 256 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BAddrInterleave: false + BaseName: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT38m0Bm4hH2y_fb6-eo4XZRKb5dkkBhjv6q0Rd-CAzTq-Q= + BufferLoad: true + BufferStore: 1 + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 256 + DirectToLds: 1 + DirectToLdsA: true + DirectToLdsB: true + DirectToLdsMXSA: true + DirectToLdsMXSB: true + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprMXSA: false + DirectToVgprMXSB: false + DirectToVgprSparseMetadata: false + 
DtlPlusLdsBuf: 0 + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: false + ExpertSchedulingMode: 0 + ExtraLatencyForLR: 0 + ExtraMiLatencyLeft: -1 + ForceDisableShadowInit: false + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 32 + GlobalReadVectorWidthB: 32 + GlobalReadVectorWidthMXSA: 12 + GlobalReadVectorWidthMXSB: 2 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + KRingShift: false + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT384x64x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT12_2_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 256 + LSCB: 256 + LSCMXSA: 8 + LSCMXSB: 8 + LSPA: 32 + LSPB: 32 + LSPMXSA: 384 + LSPMXSB: 64 + LVCA: 8 + LVCB: 8 + LVCMXSA: 1 + LVCMXSB: 4 + LVPA: 1 + LVPB: 1 + LVPMXSA: 32 + LVPMXSB: 32 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsBlockSizePerPadMXSA: 0 + LdsBlockSizePerPadMXSB: 0 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 126464 + LdsInitCVgprs: false + LdsNumBytes: 126464 + 
LdsNumElementsAlignedA: 49152 + LdsNumElementsAlignedB: 8192 + LdsNumElementsAlignedMXSA: 3072 + LdsNumElementsAlignedMXSB: 512 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 52736 + LdsOffsetB_Blk: 118272 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMXSA: 49152 + LdsOffsetMXSA_Blk: 114688 + LdsOffsetMXSB: 52224 + LdsOffsetMXSB_Blk: 117760 + LdsOffsetMetadata: 52736 + LdsOffsetMetadata_Blk: 118272 + LdsPadA: 0 + LdsPadB: 0 + LdsPadMXSA: 0 + LdsPadMXSB: 0 + LdsPadMetadata: 0 + LocalReadVectorWidth: -1 + LocalReadVectorWidthA: 32 + LocalReadVectorWidthB: 32 + LocalReadVectorWidthMXSA: 1 + LocalReadVectorWidthMXSB: 1 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LocalWriteUseSgprMXSA: true + LocalWriteUseSgprMXSB: true + LoopIters: 2 + LoopUnroll: 256 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMXSA: 1 + MIInputPerThreadMXSB: 1 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [12, 2] + MIWaveTileA: 12 + MIWaveTileB: 2 + MIWaveTileMXSA: 12 + MIWaveTileMXSB: 2 + MIWaveTileMetadata: 0 + MacroTile0: 384 + MacroTile1: 64 + MacroTileA: 384 + MacroTileB: 64 + MacroTileMXSA: 384 + MacroTileMXSB: 64 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + MinGRIncPerMfma: 3 + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonDTLTailLoopMXSA: true + NonDTLTailLoopMXSB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 4 + NonTemporalC: 0 + NonTemporalD: 4 + 
NonTemporalE: 0 + NonTemporalMXSA: 0 + NonTemporalMXSB: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 96 + NumGlobalWriteVectorsPerThread: 96 + NumLdsBlk: 2 + NumLoadsA: 12 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsCoalescedMXSA: 1 + NumLoadsCoalescedMXSB: 1 + NumLoadsMXSA: 1 + NumLoadsMXSB: 1 + NumLoadsPerpendicularA: 12 + NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularMXSA: 1 + NumLoadsPerpendicularMXSB: 1 + NumThreads: 256 + NumTotalPackedLoadsA: -1 + NumTotalPackedLoadsB: -1 + NumTotalPackedLoadsMXSA: -1 + NumTotalPackedLoadsMXSB: -1 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGROverBarrier: 0 + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 19 + SolutionNameMin: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT384x64x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT12_2_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM16_WGMXCC2_WGMXCCGn1 + SourceSwap: false + SpaceFillingAlgo: [] + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 4 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 
32 + SubGroupMXSA: 8 + SubGroupMXSB: 32 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: 0 + TailloopInNll: false + ThreadTile: [1, 1] + ThreadTile0: 48 + ThreadTile1: 2 + ThreadTileA: 48 + ThreadTileB: 2 + ThreadTileMXSA: 48 + ThreadTileMXSB: 2 + TransposeLDS: 1 + TransposeLDSMetadata: -1 + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMXSA: true + UnrollMajorLDSMXSB: true + Use64bShadowLimit: false + Use64bShadowLimitMX: false + UseCustomMainLoopSchedule: 0 + UseDirect32XEmulation: false + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: false + UseGeneralizedNLCOneA: false + UseGeneralizedNLCOneB: false + UseGeneralizedNLCOneMXSA: false + UseGeneralizedNLCOneMXSB: false + UseGeneralizedNLCOneMetadata: false + UseInstOffsetForGRO: 0 + UseMFMAF32XEmulation: false + UsePLRPack: 0 + UseSgprForGRO: -1 + UseSubtileImpl: true + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 1 + VectorWidthMXSA: 1 + VectorWidthMXSB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMXSA: 0 + WaveSeparateGlobalReadMXSB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 16 + WorkGroupMappingXCC: 2 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 256 + _DepthUA: 256 + _DepthUB: 256 + _DepthUMXSA: 8 + _DepthUMXSB: 8 + _DepthUMetadata: 256 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + enableLDSTrMXSA: false + enableLDSTrMXSB: false + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + tailLoopOptMXSA: false + tailLoopOptMXSB: false + - 1LDSBuffer: 0 + 
ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AdaptiveGemm: 0 + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1DivByMT1LowbitGT1: 0 + AssertFree1ElementMultiple: 1 + AssertKRingShiftTailWrapOnly: 0 + AssertSummationElementMultiple: 256 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BAddrInterleave: false + BaseName: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT64ghBzLJyXgx9oTwgzYUUty2EUDAM9Q1qjBKNRjSDH1Qg= + BufferLoad: true + BufferStore: 1 + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 256 + DirectToLds: 1 + DirectToLdsA: true + DirectToLdsB: true + DirectToLdsMXSA: true + DirectToLdsMXSB: true + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprMXSA: false + DirectToVgprMXSB: false + DirectToVgprSparseMetadata: false + DtlPlusLdsBuf: 0 + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: false + ExpertSchedulingMode: 0 + ExtraLatencyForLR: 0 + ExtraMiLatencyLeft: -1 + ForceDisableShadowInit: false + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 32 + GlobalReadVectorWidthB: 32 + GlobalReadVectorWidthMXSA: 2 + GlobalReadVectorWidthMXSB: 16 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + KRingShift: false + Kernel: true + KernelLanguage: Assembly + KernelNameMin: 
Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT64x512x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT2_16_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 256 + LSCB: 256 + LSCMXSA: 8 + LSCMXSB: 8 + LSPA: 32 + LSPB: 32 + LSPMXSA: 64 + LSPMXSB: 512 + LVCA: 8 + LVCB: 8 + LVCMXSA: 4 + LVCMXSB: 1 + LVPA: 1 + LVPB: 1 + LVPMXSA: 32 + LVPMXSB: 32 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsBlockSizePerPadMXSA: 0 + LdsBlockSizePerPadMXSB: 0 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 156672 + LdsInitCVgprs: false + LdsNumBytes: 156672 + LdsNumElementsAlignedA: 8192 + LdsNumElementsAlignedB: 65536 + LdsNumElementsAlignedMXSA: 512 + LdsNumElementsAlignedMXSB: 4096 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 78336 + LdsOffsetB: 12800 + LdsOffsetB_Blk: 91136 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMXSA: 8192 + LdsOffsetMXSA_Blk: 86528 + LdsOffsetMXSB: 8704 + LdsOffsetMXSB_Blk: 87040 + LdsOffsetMetadata: 12800 + LdsOffsetMetadata_Blk: 91136 + LdsPadA: 0 + LdsPadB: 0 + LdsPadMXSA: 0 + LdsPadMXSB: 0 + LdsPadMetadata: 0 + LocalReadVectorWidth: -1 + LocalReadVectorWidthA: 32 + LocalReadVectorWidthB: 32 + LocalReadVectorWidthMXSA: 1 + LocalReadVectorWidthMXSB: 1 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LocalWriteUseSgprMXSA: true + LocalWriteUseSgprMXSB: true + LoopIters: 2 + LoopUnroll: 256 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + 
MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMXSA: 1 + MIInputPerThreadMXSB: 1 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [2, 16] + MIWaveTileA: 2 + MIWaveTileB: 16 + MIWaveTileMXSA: 2 + MIWaveTileMXSB: 16 + MIWaveTileMetadata: 0 + MacroTile0: 64 + MacroTile1: 512 + MacroTileA: 64 + MacroTileB: 512 + MacroTileMXSA: 64 + MacroTileMXSB: 512 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + MinGRIncPerMfma: 3 + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonDTLTailLoopMXSA: true + NonDTLTailLoopMXSB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 4 + NonTemporalC: 0 + NonTemporalD: 4 + NonTemporalE: 0 + NonTemporalMXSA: 0 + NonTemporalMXSB: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 128 + NumLdsBlk: 2 + NumLoadsA: 2 + NumLoadsB: 16 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsCoalescedMXSA: 1 + NumLoadsCoalescedMXSB: 1 + NumLoadsMXSA: 1 + NumLoadsMXSB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 16 + NumLoadsPerpendicularMXSA: 1 + NumLoadsPerpendicularMXSB: 1 + NumThreads: 256 + NumTotalPackedLoadsA: -1 + NumTotalPackedLoadsB: -1 + NumTotalPackedLoadsMXSA: -1 + NumTotalPackedLoadsMXSB: -1 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGROverBarrier: 0 + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 20 + 
SolutionNameMin: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT64x512x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT2_16_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM16_WGMXCC2_WGMXCCGn1 + SourceSwap: false + SpaceFillingAlgo: [] + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: true + StoreSyncOpt: 0 + StoreVectorWidth: 4 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SubGroupMXSA: 8 + SubGroupMXSB: 32 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: 0 + TailloopInNll: false + ThreadTile: [1, 1] + ThreadTile0: 8 + ThreadTile1: 16 + ThreadTileA: 8 + ThreadTileB: 16 + ThreadTileMXSA: 8 + ThreadTileMXSB: 16 + TransposeLDS: 1 + TransposeLDSMetadata: -1 + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMXSA: true + UnrollMajorLDSMXSB: true + Use64bShadowLimit: false + Use64bShadowLimitMX: false + UseCustomMainLoopSchedule: 0 + UseDirect32XEmulation: false + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: false + UseGeneralizedNLCOneA: false + UseGeneralizedNLCOneB: false + UseGeneralizedNLCOneMXSA: false + UseGeneralizedNLCOneMXSB: false + UseGeneralizedNLCOneMetadata: false + UseInstOffsetForGRO: 0 + UseMFMAF32XEmulation: false + UsePLRPack: 0 + UseSgprForGRO: -1 + UseSubtileImpl: true + Valid: true + VectorStore: -1 + VectorWidthA: 1 + 
VectorWidthB: 1 + VectorWidthMXSA: 1 + VectorWidthMXSB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMXSA: 0 + WaveSeparateGlobalReadMXSB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 16 + WorkGroupMappingXCC: 2 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 256 + _DepthUA: 256 + _DepthUB: 256 + _DepthUMXSA: 8 + _DepthUMXSB: 8 + _DepthUMetadata: 256 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + enableLDSTrMXSA: false + enableLDSTrMXSB: false + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + tailLoopOptMXSA: false + tailLoopOptMXSB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AdaptiveGemm: 0 + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1DivByMT1LowbitGT1: 0 + AssertFree1ElementMultiple: 1 + AssertKRingShiftTailWrapOnly: 0 + AssertSummationElementMultiple: 256 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BAddrInterleave: false + BaseName: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT51N7xP8nenZXqX1PoWvPtuxa4rTpP69DwZBgRtHN8axrQ= + BufferLoad: true + BufferStore: 1 + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 256 + DirectToLds: 1 + DirectToLdsA: true + DirectToLdsB: true + DirectToLdsMXSA: true + DirectToLdsMXSB: true + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprMXSA: false + DirectToVgprMXSB: false + DirectToVgprSparseMetadata: false + 
DtlPlusLdsBuf: 0 + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: false + ExpertSchedulingMode: 0 + ExtraLatencyForLR: 0 + ExtraMiLatencyLeft: -1 + ForceDisableShadowInit: false + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 32 + GlobalReadVectorWidthB: 32 + GlobalReadVectorWidthMXSA: 16 + GlobalReadVectorWidthMXSB: 2 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + KRingShift: false + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT512x64x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT16_2_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 256 + LSCB: 256 + LSCMXSA: 8 + LSCMXSB: 8 + LSPA: 32 + LSPB: 32 + LSPMXSA: 512 + LSPMXSB: 64 + LVCA: 8 + LVCB: 8 + LVCMXSA: 1 + LVCMXSB: 4 + LVPA: 1 + LVPB: 1 + LVPMXSA: 32 + LVPMXSB: 32 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsBlockSizePerPadMXSA: 0 + LdsBlockSizePerPadMXSB: 0 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 156672 + LdsInitCVgprs: false + LdsNumBytes: 156672 + 
LdsNumElementsAlignedA: 65536 + LdsNumElementsAlignedB: 8192 + LdsNumElementsAlignedMXSA: 4096 + LdsNumElementsAlignedMXSB: 512 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 78336 + LdsOffsetB: 70144 + LdsOffsetB_Blk: 148480 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMXSA: 65536 + LdsOffsetMXSA_Blk: 143872 + LdsOffsetMXSB: 69632 + LdsOffsetMXSB_Blk: 147968 + LdsOffsetMetadata: 70144 + LdsOffsetMetadata_Blk: 148480 + LdsPadA: 0 + LdsPadB: 0 + LdsPadMXSA: 0 + LdsPadMXSB: 0 + LdsPadMetadata: 0 + LocalReadVectorWidth: -1 + LocalReadVectorWidthA: 32 + LocalReadVectorWidthB: 32 + LocalReadVectorWidthMXSA: 1 + LocalReadVectorWidthMXSB: 1 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LocalWriteUseSgprMXSA: true + LocalWriteUseSgprMXSB: true + LoopIters: 2 + LoopUnroll: 256 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMXSA: 1 + MIInputPerThreadMXSB: 1 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [16, 2] + MIWaveTileA: 16 + MIWaveTileB: 2 + MIWaveTileMXSA: 16 + MIWaveTileMXSB: 2 + MIWaveTileMetadata: 0 + MacroTile0: 512 + MacroTile1: 64 + MacroTileA: 512 + MacroTileB: 64 + MacroTileMXSA: 512 + MacroTileMXSB: 64 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + MinGRIncPerMfma: 3 + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonDTLTailLoopMXSA: true + NonDTLTailLoopMXSB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 4 + NonTemporalC: 0 + NonTemporalD: 4 + 
NonTemporalE: 0 + NonTemporalMXSA: 0 + NonTemporalMXSB: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 128 + NumLdsBlk: 2 + NumLoadsA: 16 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsCoalescedMXSA: 1 + NumLoadsCoalescedMXSB: 1 + NumLoadsMXSA: 1 + NumLoadsMXSB: 1 + NumLoadsPerpendicularA: 16 + NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularMXSA: 1 + NumLoadsPerpendicularMXSB: 1 + NumThreads: 256 + NumTotalPackedLoadsA: -1 + NumTotalPackedLoadsB: -1 + NumTotalPackedLoadsMXSA: -1 + NumTotalPackedLoadsMXSB: -1 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGROverBarrier: 0 + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 21 + SolutionNameMin: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT512x64x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT16_2_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM16_WGMXCC2_WGMXCCGn1 + SourceSwap: false + SpaceFillingAlgo: [] + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: true + StoreSyncOpt: 0 + StoreVectorWidth: 4 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 
32 + SubGroupMXSA: 8 + SubGroupMXSB: 32 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: 0 + TailloopInNll: false + ThreadTile: [1, 1] + ThreadTile0: 64 + ThreadTile1: 2 + ThreadTileA: 64 + ThreadTileB: 2 + ThreadTileMXSA: 64 + ThreadTileMXSB: 2 + TransposeLDS: 1 + TransposeLDSMetadata: -1 + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMXSA: true + UnrollMajorLDSMXSB: true + Use64bShadowLimit: false + Use64bShadowLimitMX: false + UseCustomMainLoopSchedule: 0 + UseDirect32XEmulation: false + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: false + UseGeneralizedNLCOneA: false + UseGeneralizedNLCOneB: false + UseGeneralizedNLCOneMXSA: false + UseGeneralizedNLCOneMXSB: false + UseGeneralizedNLCOneMetadata: false + UseInstOffsetForGRO: 0 + UseMFMAF32XEmulation: false + UsePLRPack: 0 + UseSgprForGRO: -1 + UseSubtileImpl: true + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 1 + VectorWidthMXSA: 1 + VectorWidthMXSB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMXSA: 0 + WaveSeparateGlobalReadMXSB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 16 + WorkGroupMappingXCC: 2 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 256 + _DepthUA: 256 + _DepthUB: 256 + _DepthUMXSA: 8 + _DepthUMXSB: 8 + _DepthUMetadata: 256 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + enableLDSTrMXSA: false + enableLDSTrMXSB: false + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + tailLoopOptMXSA: false + tailLoopOptMXSB: false + - 1LDSBuffer: 0 + 
ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AdaptiveGemm: 0 + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1DivByMT1LowbitGT1: 0 + AssertFree1ElementMultiple: 1 + AssertKRingShiftTailWrapOnly: 0 + AssertSummationElementMultiple: 256 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BAddrInterleave: false + BaseName: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT12Tzx41lFmgkHO5_GDlXlBjYPnuHfWnU6iLujoAD27WHo= + BufferLoad: true + BufferStore: 1 + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 256 + DirectToLds: 1 + DirectToLdsA: true + DirectToLdsB: true + DirectToLdsMXSA: true + DirectToLdsMXSB: true + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprMXSA: false + DirectToVgprMXSB: false + DirectToVgprSparseMetadata: false + DtlPlusLdsBuf: 0 + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: false + ExpertSchedulingMode: 0 + ExtraLatencyForLR: 0 + ExtraMiLatencyLeft: -1 + ForceDisableShadowInit: false + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 32 + GlobalReadVectorWidthB: 32 + GlobalReadVectorWidthMXSA: 4 + GlobalReadVectorWidthMXSB: 10 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + KRingShift: false + Kernel: true + KernelLanguage: Assembly + KernelNameMin: 
Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT128x320x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT4_10_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 256 + LSCB: 256 + LSCMXSA: 8 + LSCMXSB: 8 + LSPA: 32 + LSPB: 32 + LSPMXSA: 128 + LSPMXSB: 320 + LVCA: 8 + LVCB: 8 + LVCMXSA: 2 + LVCMXSB: 1 + LVPA: 1 + LVPB: 1 + LVPMXSA: 32 + LVPMXSB: 32 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsBlockSizePerPadMXSA: 0 + LdsBlockSizePerPadMXSB: 0 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 126464 + LdsInitCVgprs: false + LdsNumBytes: 126464 + LdsNumElementsAlignedA: 16384 + LdsNumElementsAlignedB: 40960 + LdsNumElementsAlignedMXSA: 1024 + LdsNumElementsAlignedMXSB: 2560 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 19968 + LdsOffsetB_Blk: 85504 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMXSA: 16384 + LdsOffsetMXSA_Blk: 81920 + LdsOffsetMXSB: 17408 + LdsOffsetMXSB_Blk: 82944 + LdsOffsetMetadata: 19968 + LdsOffsetMetadata_Blk: 85504 + LdsPadA: 0 + LdsPadB: 0 + LdsPadMXSA: 0 + LdsPadMXSB: 0 + LdsPadMetadata: 0 + LocalReadVectorWidth: -1 + LocalReadVectorWidthA: 32 + LocalReadVectorWidthB: 32 + LocalReadVectorWidthMXSA: 1 + LocalReadVectorWidthMXSB: 1 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LocalWriteUseSgprMXSA: true + LocalWriteUseSgprMXSB: true + LoopIters: 2 + LoopUnroll: 256 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + 
MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMXSA: 1 + MIInputPerThreadMXSB: 1 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 10] + MIWaveTileA: 4 + MIWaveTileB: 10 + MIWaveTileMXSA: 4 + MIWaveTileMXSB: 10 + MIWaveTileMetadata: 0 + MacroTile0: 128 + MacroTile1: 320 + MacroTileA: 128 + MacroTileB: 320 + MacroTileMXSA: 128 + MacroTileMXSB: 320 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + MinGRIncPerMfma: 3 + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonDTLTailLoopMXSA: true + NonDTLTailLoopMXSB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 4 + NonTemporalC: 0 + NonTemporalD: 4 + NonTemporalE: 0 + NonTemporalMXSA: 0 + NonTemporalMXSB: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 160 + NumGlobalWriteVectorsPerThread: 160 + NumLdsBlk: 2 + NumLoadsA: 4 + NumLoadsB: 10 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsCoalescedMXSA: 1 + NumLoadsCoalescedMXSB: 1 + NumLoadsMXSA: 1 + NumLoadsMXSB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 10 + NumLoadsPerpendicularMXSA: 1 + NumLoadsPerpendicularMXSB: 1 + NumThreads: 256 + NumTotalPackedLoadsA: -1 + NumTotalPackedLoadsB: -1 + NumTotalPackedLoadsMXSA: -1 + NumTotalPackedLoadsMXSB: -1 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGROverBarrier: 0 + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 22 + 
SolutionNameMin: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT128x320x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT4_10_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM16_WGMXCC2_WGMXCCGn1 + SourceSwap: false + SpaceFillingAlgo: [] + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 4 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SubGroupMXSA: 8 + SubGroupMXSB: 32 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: 0 + TailloopInNll: false + ThreadTile: [1, 1] + ThreadTile0: 16 + ThreadTile1: 10 + ThreadTileA: 16 + ThreadTileB: 10 + ThreadTileMXSA: 16 + ThreadTileMXSB: 10 + TransposeLDS: 1 + TransposeLDSMetadata: -1 + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMXSA: true + UnrollMajorLDSMXSB: true + Use64bShadowLimit: false + Use64bShadowLimitMX: false + UseCustomMainLoopSchedule: 0 + UseDirect32XEmulation: false + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: false + UseGeneralizedNLCOneA: false + UseGeneralizedNLCOneB: false + UseGeneralizedNLCOneMXSA: false + UseGeneralizedNLCOneMXSB: false + UseGeneralizedNLCOneMetadata: false + UseInstOffsetForGRO: 0 + UseMFMAF32XEmulation: false + UsePLRPack: 0 + UseSgprForGRO: -1 + UseSubtileImpl: true + Valid: true + VectorStore: -1 + VectorWidthA: 
1 + VectorWidthB: 1 + VectorWidthMXSA: 1 + VectorWidthMXSB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMXSA: 0 + WaveSeparateGlobalReadMXSB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 16 + WorkGroupMappingXCC: 2 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 256 + _DepthUA: 256 + _DepthUB: 256 + _DepthUMXSA: 8 + _DepthUMXSB: 8 + _DepthUMetadata: 256 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + enableLDSTrMXSA: false + enableLDSTrMXSB: false + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + tailLoopOptMXSA: false + tailLoopOptMXSB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AdaptiveGemm: 0 + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1DivByMT1LowbitGT1: 0 + AssertFree1ElementMultiple: 1 + AssertKRingShiftTailWrapOnly: 0 + AssertSummationElementMultiple: 256 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BAddrInterleave: false + BaseName: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT329TK1O9xqAG2cUdkJ_6Jkg-kbMKNtFWKGnhC0gVHbf3c= + BufferLoad: true + BufferStore: 1 + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 256 + DirectToLds: 1 + DirectToLdsA: true + DirectToLdsB: true + DirectToLdsMXSA: true + DirectToLdsMXSB: true + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprMXSA: false + DirectToVgprMXSB: false + DirectToVgprSparseMetadata: 
false + DtlPlusLdsBuf: 0 + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: false + ExpertSchedulingMode: 0 + ExtraLatencyForLR: 0 + ExtraMiLatencyLeft: -1 + ForceDisableShadowInit: false + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 32 + GlobalReadVectorWidthB: 32 + GlobalReadVectorWidthMXSA: 10 + GlobalReadVectorWidthMXSB: 4 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + KRingShift: false + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT320x128x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT10_4_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 256 + LSCB: 256 + LSCMXSA: 8 + LSCMXSB: 8 + LSPA: 32 + LSPB: 32 + LSPMXSA: 320 + LSPMXSB: 128 + LVCA: 8 + LVCB: 8 + LVCMXSA: 1 + LVCMXSB: 2 + LVPA: 1 + LVPB: 1 + LVPMXSA: 32 + LVPMXSB: 32 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsBlockSizePerPadMXSA: 0 + LdsBlockSizePerPadMXSB: 0 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 126464 + LdsInitCVgprs: false + LdsNumBytes: 126464 + 
LdsNumElementsAlignedA: 40960 + LdsNumElementsAlignedB: 16384 + LdsNumElementsAlignedMXSA: 2560 + LdsNumElementsAlignedMXSB: 1024 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 44544 + LdsOffsetB_Blk: 110080 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMXSA: 40960 + LdsOffsetMXSA_Blk: 106496 + LdsOffsetMXSB: 43520 + LdsOffsetMXSB_Blk: 109056 + LdsOffsetMetadata: 44544 + LdsOffsetMetadata_Blk: 110080 + LdsPadA: 0 + LdsPadB: 0 + LdsPadMXSA: 0 + LdsPadMXSB: 0 + LdsPadMetadata: 0 + LocalReadVectorWidth: -1 + LocalReadVectorWidthA: 32 + LocalReadVectorWidthB: 32 + LocalReadVectorWidthMXSA: 1 + LocalReadVectorWidthMXSB: 1 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LocalWriteUseSgprMXSA: true + LocalWriteUseSgprMXSB: true + LoopIters: 2 + LoopUnroll: 256 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMXSA: 1 + MIInputPerThreadMXSB: 1 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [10, 4] + MIWaveTileA: 10 + MIWaveTileB: 4 + MIWaveTileMXSA: 10 + MIWaveTileMXSB: 4 + MIWaveTileMetadata: 0 + MacroTile0: 320 + MacroTile1: 128 + MacroTileA: 320 + MacroTileB: 128 + MacroTileMXSA: 320 + MacroTileMXSB: 128 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + MinGRIncPerMfma: 3 + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonDTLTailLoopMXSA: true + NonDTLTailLoopMXSB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 4 + NonTemporalC: 0 + NonTemporalD: 4 + 
NonTemporalE: 0 + NonTemporalMXSA: 0 + NonTemporalMXSB: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 160 + NumGlobalWriteVectorsPerThread: 160 + NumLdsBlk: 2 + NumLoadsA: 10 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsCoalescedMXSA: 1 + NumLoadsCoalescedMXSB: 1 + NumLoadsMXSA: 1 + NumLoadsMXSB: 1 + NumLoadsPerpendicularA: 10 + NumLoadsPerpendicularB: 4 + NumLoadsPerpendicularMXSA: 1 + NumLoadsPerpendicularMXSB: 1 + NumThreads: 256 + NumTotalPackedLoadsA: -1 + NumTotalPackedLoadsB: -1 + NumTotalPackedLoadsMXSA: -1 + NumTotalPackedLoadsMXSB: -1 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGROverBarrier: 0 + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 23 + SolutionNameMin: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT320x128x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT10_4_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM16_WGMXCC2_WGMXCCGn1 + SourceSwap: false + SpaceFillingAlgo: [] + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 4 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + 
SubGroupB: 32 + SubGroupMXSA: 8 + SubGroupMXSB: 32 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: 0 + TailloopInNll: false + ThreadTile: [1, 1] + ThreadTile0: 40 + ThreadTile1: 4 + ThreadTileA: 40 + ThreadTileB: 4 + ThreadTileMXSA: 40 + ThreadTileMXSB: 4 + TransposeLDS: 1 + TransposeLDSMetadata: -1 + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMXSA: true + UnrollMajorLDSMXSB: true + Use64bShadowLimit: false + Use64bShadowLimitMX: false + UseCustomMainLoopSchedule: 0 + UseDirect32XEmulation: false + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: false + UseGeneralizedNLCOneA: false + UseGeneralizedNLCOneB: false + UseGeneralizedNLCOneMXSA: false + UseGeneralizedNLCOneMXSB: false + UseGeneralizedNLCOneMetadata: false + UseInstOffsetForGRO: 0 + UseMFMAF32XEmulation: false + UsePLRPack: 0 + UseSgprForGRO: -1 + UseSubtileImpl: true + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 1 + VectorWidthMXSA: 1 + VectorWidthMXSB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMXSA: 0 + WaveSeparateGlobalReadMXSB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 16 + WorkGroupMappingXCC: 2 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 256 + _DepthUA: 256 + _DepthUB: 256 + _DepthUMXSA: 8 + _DepthUMXSB: 8 + _DepthUMetadata: 256 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + enableLDSTrMXSA: false + enableLDSTrMXSB: false + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + tailLoopOptMXSA: false + tailLoopOptMXSB: false + - 1LDSBuffer: 
0 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AdaptiveGemm: 0 + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1DivByMT1LowbitGT1: 0 + AssertFree1ElementMultiple: 1 + AssertKRingShiftTailWrapOnly: 0 + AssertSummationElementMultiple: 256 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BAddrInterleave: false + BaseName: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT12Ko14x9Bm17skxIK2J-9eSMhxZf1WQTws5W6lZ4ij_RU= + BufferLoad: true + BufferStore: 1 + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 256 + DirectToLds: 1 + DirectToLdsA: true + DirectToLdsB: true + DirectToLdsMXSA: true + DirectToLdsMXSB: true + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprMXSA: false + DirectToVgprMXSB: false + DirectToVgprSparseMetadata: false + DtlPlusLdsBuf: 0 + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: false + ExpertSchedulingMode: 0 + ExtraLatencyForLR: 0 + ExtraMiLatencyLeft: -1 + ForceDisableShadowInit: false + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 32 + GlobalReadVectorWidthB: 32 + GlobalReadVectorWidthMXSA: 4 + GlobalReadVectorWidthMXSB: 12 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + KRingShift: false + Kernel: true + KernelLanguage: Assembly + KernelNameMin: 
Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT128x384x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT4_12_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 256 + LSCB: 256 + LSCMXSA: 8 + LSCMXSB: 8 + LSPA: 32 + LSPB: 32 + LSPMXSA: 128 + LSPMXSB: 384 + LVCA: 8 + LVCB: 8 + LVCMXSA: 2 + LVCMXSB: 1 + LVPA: 1 + LVPB: 1 + LVPMXSA: 32 + LVPMXSB: 32 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsBlockSizePerPadMXSA: 0 + LdsBlockSizePerPadMXSB: 0 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 139264 + LdsInitCVgprs: false + LdsNumBytes: 139264 + LdsNumElementsAlignedA: 16384 + LdsNumElementsAlignedB: 49152 + LdsNumElementsAlignedMXSA: 1024 + LdsNumElementsAlignedMXSB: 3072 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 69632 + LdsOffsetB: 20480 + LdsOffsetB_Blk: 90112 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMXSA: 16384 + LdsOffsetMXSA_Blk: 86016 + LdsOffsetMXSB: 17408 + LdsOffsetMXSB_Blk: 87040 + LdsOffsetMetadata: 20480 + LdsOffsetMetadata_Blk: 90112 + LdsPadA: 0 + LdsPadB: 0 + LdsPadMXSA: 0 + LdsPadMXSB: 0 + LdsPadMetadata: 0 + LocalReadVectorWidth: -1 + LocalReadVectorWidthA: 32 + LocalReadVectorWidthB: 32 + LocalReadVectorWidthMXSA: 1 + LocalReadVectorWidthMXSB: 1 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LocalWriteUseSgprMXSA: true + LocalWriteUseSgprMXSB: true + LoopIters: 2 + LoopUnroll: 256 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + 
MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMXSA: 1 + MIInputPerThreadMXSB: 1 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 12] + MIWaveTileA: 4 + MIWaveTileB: 12 + MIWaveTileMXSA: 4 + MIWaveTileMXSB: 12 + MIWaveTileMetadata: 0 + MacroTile0: 128 + MacroTile1: 384 + MacroTileA: 128 + MacroTileB: 384 + MacroTileMXSA: 128 + MacroTileMXSB: 384 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + MinGRIncPerMfma: 3 + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonDTLTailLoopMXSA: true + NonDTLTailLoopMXSB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 4 + NonTemporalC: 0 + NonTemporalD: 4 + NonTemporalE: 0 + NonTemporalMXSA: 0 + NonTemporalMXSB: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 192 + NumGlobalWriteVectorsPerThread: 192 + NumLdsBlk: 2 + NumLoadsA: 4 + NumLoadsB: 12 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsCoalescedMXSA: 1 + NumLoadsCoalescedMXSB: 1 + NumLoadsMXSA: 1 + NumLoadsMXSB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 12 + NumLoadsPerpendicularMXSA: 1 + NumLoadsPerpendicularMXSB: 1 + NumThreads: 256 + NumTotalPackedLoadsA: -1 + NumTotalPackedLoadsB: -1 + NumTotalPackedLoadsMXSA: -1 + NumTotalPackedLoadsMXSB: -1 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGROverBarrier: 0 + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 24 + 
SolutionNameMin: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT128x384x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT4_12_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM16_WGMXCC2_WGMXCCGn1 + SourceSwap: false + SpaceFillingAlgo: [] + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: true + StoreSyncOpt: 0 + StoreVectorWidth: 4 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SubGroupMXSA: 8 + SubGroupMXSB: 32 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: 0 + TailloopInNll: false + ThreadTile: [1, 1] + ThreadTile0: 16 + ThreadTile1: 12 + ThreadTileA: 16 + ThreadTileB: 12 + ThreadTileMXSA: 16 + ThreadTileMXSB: 12 + TransposeLDS: 1 + TransposeLDSMetadata: -1 + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMXSA: true + UnrollMajorLDSMXSB: true + Use64bShadowLimit: false + Use64bShadowLimitMX: false + UseCustomMainLoopSchedule: 0 + UseDirect32XEmulation: false + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: false + UseGeneralizedNLCOneA: false + UseGeneralizedNLCOneB: false + UseGeneralizedNLCOneMXSA: false + UseGeneralizedNLCOneMXSB: false + UseGeneralizedNLCOneMetadata: false + UseInstOffsetForGRO: 0 + UseMFMAF32XEmulation: false + UsePLRPack: 0 + UseSgprForGRO: -1 + UseSubtileImpl: true + Valid: true + VectorStore: -1 + VectorWidthA: 1 
+ VectorWidthB: 1 + VectorWidthMXSA: 1 + VectorWidthMXSB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMXSA: 0 + WaveSeparateGlobalReadMXSB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 16 + WorkGroupMappingXCC: 2 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 256 + _DepthUA: 256 + _DepthUB: 256 + _DepthUMXSA: 8 + _DepthUMXSB: 8 + _DepthUMetadata: 256 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + enableLDSTrMXSA: false + enableLDSTrMXSB: false + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + tailLoopOptMXSA: false + tailLoopOptMXSB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AdaptiveGemm: 0 + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1DivByMT1LowbitGT1: 0 + AssertFree1ElementMultiple: 1 + AssertKRingShiftTailWrapOnly: 0 + AssertSummationElementMultiple: 256 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BAddrInterleave: false + BaseName: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT38PfCBP4cNoa67MDHKbRKQKP0baWWAor2QL1_1-ioSnxM= + BufferLoad: true + BufferStore: 1 + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 256 + DirectToLds: 1 + DirectToLdsA: true + DirectToLdsB: true + DirectToLdsMXSA: true + DirectToLdsMXSB: true + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprMXSA: false + DirectToVgprMXSB: false + DirectToVgprSparseMetadata: false 
+ DtlPlusLdsBuf: 0 + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: false + ExpertSchedulingMode: 0 + ExtraLatencyForLR: 0 + ExtraMiLatencyLeft: -1 + ForceDisableShadowInit: false + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 32 + GlobalReadVectorWidthB: 32 + GlobalReadVectorWidthMXSA: 12 + GlobalReadVectorWidthMXSB: 4 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + KRingShift: false + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT384x128x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT12_4_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 256 + LSCB: 256 + LSCMXSA: 8 + LSCMXSB: 8 + LSPA: 32 + LSPB: 32 + LSPMXSA: 384 + LSPMXSB: 128 + LVCA: 8 + LVCB: 8 + LVCMXSA: 1 + LVCMXSB: 2 + LVPA: 1 + LVPB: 1 + LVPMXSA: 32 + LVPMXSB: 32 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsBlockSizePerPadMXSA: 0 + LdsBlockSizePerPadMXSB: 0 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 139264 + LdsInitCVgprs: false + LdsNumBytes: 139264 + 
LdsNumElementsAlignedA: 49152 + LdsNumElementsAlignedB: 16384 + LdsNumElementsAlignedMXSA: 3072 + LdsNumElementsAlignedMXSB: 1024 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 69632 + LdsOffsetB: 53248 + LdsOffsetB_Blk: 122880 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMXSA: 49152 + LdsOffsetMXSA_Blk: 118784 + LdsOffsetMXSB: 52224 + LdsOffsetMXSB_Blk: 121856 + LdsOffsetMetadata: 53248 + LdsOffsetMetadata_Blk: 122880 + LdsPadA: 0 + LdsPadB: 0 + LdsPadMXSA: 0 + LdsPadMXSB: 0 + LdsPadMetadata: 0 + LocalReadVectorWidth: -1 + LocalReadVectorWidthA: 32 + LocalReadVectorWidthB: 32 + LocalReadVectorWidthMXSA: 1 + LocalReadVectorWidthMXSB: 1 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LocalWriteUseSgprMXSA: true + LocalWriteUseSgprMXSB: true + LoopIters: 2 + LoopUnroll: 256 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMXSA: 1 + MIInputPerThreadMXSB: 1 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [12, 4] + MIWaveTileA: 12 + MIWaveTileB: 4 + MIWaveTileMXSA: 12 + MIWaveTileMXSB: 4 + MIWaveTileMetadata: 0 + MacroTile0: 384 + MacroTile1: 128 + MacroTileA: 384 + MacroTileB: 128 + MacroTileMXSA: 384 + MacroTileMXSB: 128 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + MinGRIncPerMfma: 3 + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonDTLTailLoopMXSA: true + NonDTLTailLoopMXSB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 4 + NonTemporalC: 0 + NonTemporalD: 4 + 
NonTemporalE: 0 + NonTemporalMXSA: 0 + NonTemporalMXSB: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 192 + NumGlobalWriteVectorsPerThread: 192 + NumLdsBlk: 2 + NumLoadsA: 12 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsCoalescedMXSA: 1 + NumLoadsCoalescedMXSB: 1 + NumLoadsMXSA: 1 + NumLoadsMXSB: 1 + NumLoadsPerpendicularA: 12 + NumLoadsPerpendicularB: 4 + NumLoadsPerpendicularMXSA: 1 + NumLoadsPerpendicularMXSB: 1 + NumThreads: 256 + NumTotalPackedLoadsA: -1 + NumTotalPackedLoadsB: -1 + NumTotalPackedLoadsMXSA: -1 + NumTotalPackedLoadsMXSB: -1 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGROverBarrier: 0 + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 25 + SolutionNameMin: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT384x128x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT12_4_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM16_WGMXCC2_WGMXCCGn1 + SourceSwap: false + SpaceFillingAlgo: [] + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: true + StoreSyncOpt: 0 + StoreVectorWidth: 4 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + 
SubGroupB: 32 + SubGroupMXSA: 8 + SubGroupMXSB: 32 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: 0 + TailloopInNll: false + ThreadTile: [1, 1] + ThreadTile0: 48 + ThreadTile1: 4 + ThreadTileA: 48 + ThreadTileB: 4 + ThreadTileMXSA: 48 + ThreadTileMXSB: 4 + TransposeLDS: 1 + TransposeLDSMetadata: -1 + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMXSA: true + UnrollMajorLDSMXSB: true + Use64bShadowLimit: false + Use64bShadowLimitMX: false + UseCustomMainLoopSchedule: 0 + UseDirect32XEmulation: false + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: false + UseGeneralizedNLCOneA: false + UseGeneralizedNLCOneB: false + UseGeneralizedNLCOneMXSA: false + UseGeneralizedNLCOneMXSB: false + UseGeneralizedNLCOneMetadata: false + UseInstOffsetForGRO: 0 + UseMFMAF32XEmulation: false + UsePLRPack: 0 + UseSgprForGRO: -1 + UseSubtileImpl: true + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 1 + VectorWidthMXSA: 1 + VectorWidthMXSB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMXSA: 0 + WaveSeparateGlobalReadMXSB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 16 + WorkGroupMappingXCC: 2 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 256 + _DepthUA: 256 + _DepthUB: 256 + _DepthUMXSA: 8 + _DepthUMXSB: 8 + _DepthUMetadata: 256 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + enableLDSTrMXSA: false + enableLDSTrMXSB: false + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + tailLoopOptMXSA: false + tailLoopOptMXSB: false + - 1LDSBuffer: 
0 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AdaptiveGemm: 0 + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1DivByMT1LowbitGT1: 0 + AssertFree1ElementMultiple: 1 + AssertKRingShiftTailWrapOnly: 0 + AssertSummationElementMultiple: 256 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BAddrInterleave: false + BaseName: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT32GHMtx7cclUEZq8HXp3EV83TsDigTCOy4HpQzcwTdNNQ= + BufferLoad: true + BufferStore: 1 + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 256 + DirectToLds: 1 + DirectToLdsA: true + DirectToLdsB: true + DirectToLdsMXSA: true + DirectToLdsMXSB: true + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprMXSA: false + DirectToVgprMXSB: false + DirectToVgprSparseMetadata: false + DtlPlusLdsBuf: 0 + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: false + ExpertSchedulingMode: 0 + ExtraLatencyForLR: 0 + ExtraMiLatencyLeft: -1 + ForceDisableShadowInit: false + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 32 + GlobalReadVectorWidthB: 32 + GlobalReadVectorWidthMXSA: 10 + GlobalReadVectorWidthMXSB: 4 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + KRingShift: false + Kernel: true + KernelLanguage: Assembly + KernelNameMin: 
Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT320x256x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT10_8_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 256 + LSCB: 256 + LSCMXSA: 8 + LSCMXSB: 8 + LSPA: 32 + LSPB: 32 + LSPMXSA: 320 + LSPMXSB: 128 + LVCA: 8 + LVCB: 8 + LVCMXSA: 1 + LVCMXSB: 2 + LVPA: 1 + LVPB: 1 + LVPMXSA: 32 + LVPMXSB: 32 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsBlockSizePerPadMXSA: 0 + LdsBlockSizePerPadMXSB: 0 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 156672 + LdsInitCVgprs: false + LdsNumBytes: 156672 + LdsNumElementsAlignedA: 40960 + LdsNumElementsAlignedB: 32768 + LdsNumElementsAlignedMXSA: 2560 + LdsNumElementsAlignedMXSB: 2048 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 78336 + LdsOffsetB: 45568 + LdsOffsetB_Blk: 123904 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMXSA: 40960 + LdsOffsetMXSA_Blk: 119296 + LdsOffsetMXSB: 43520 + LdsOffsetMXSB_Blk: 121856 + LdsOffsetMetadata: 45568 + LdsOffsetMetadata_Blk: 123904 + LdsPadA: 0 + LdsPadB: 0 + LdsPadMXSA: 0 + LdsPadMXSB: 0 + LdsPadMetadata: 0 + LocalReadVectorWidth: -1 + LocalReadVectorWidthA: 32 + LocalReadVectorWidthB: 32 + LocalReadVectorWidthMXSA: 1 + LocalReadVectorWidthMXSB: 1 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LocalWriteUseSgprMXSA: true + LocalWriteUseSgprMXSB: true + LoopIters: 2 + LoopUnroll: 256 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + 
MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMXSA: 1 + MIInputPerThreadMXSB: 1 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [10, 8] + MIWaveTileA: 10 + MIWaveTileB: 8 + MIWaveTileMXSA: 10 + MIWaveTileMXSB: 8 + MIWaveTileMetadata: 0 + MacroTile0: 320 + MacroTile1: 256 + MacroTileA: 320 + MacroTileB: 256 + MacroTileMXSA: 320 + MacroTileMXSB: 256 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + MinGRIncPerMfma: 3 + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonDTLTailLoopMXSA: true + NonDTLTailLoopMXSB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 4 + NonTemporalC: 0 + NonTemporalD: 4 + NonTemporalE: 0 + NonTemporalMXSA: 0 + NonTemporalMXSB: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 320 + NumGlobalWriteVectorsPerThread: 320 + NumLdsBlk: 2 + NumLoadsA: 10 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsCoalescedMXSA: 1 + NumLoadsCoalescedMXSB: 1 + NumLoadsMXSA: 1 + NumLoadsMXSB: 2 + NumLoadsPerpendicularA: 10 + NumLoadsPerpendicularB: 8 + NumLoadsPerpendicularMXSA: 1 + NumLoadsPerpendicularMXSB: 2 + NumThreads: 256 + NumTotalPackedLoadsA: -1 + NumTotalPackedLoadsB: -1 + NumTotalPackedLoadsMXSA: -1 + NumTotalPackedLoadsMXSB: -1 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGROverBarrier: 0 + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 26 + 
SolutionNameMin: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT320x256x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT10_8_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM16_WGMXCC2_WGMXCCGn1 + SourceSwap: false + SpaceFillingAlgo: [] + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: true + StoreSyncOpt: 0 + StoreVectorWidth: 4 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SubGroupMXSA: 8 + SubGroupMXSB: 32 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: 0 + TailloopInNll: false + ThreadTile: [1, 1] + ThreadTile0: 40 + ThreadTile1: 8 + ThreadTileA: 40 + ThreadTileB: 8 + ThreadTileMXSA: 40 + ThreadTileMXSB: 8 + TransposeLDS: 1 + TransposeLDSMetadata: -1 + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMXSA: true + UnrollMajorLDSMXSB: true + Use64bShadowLimit: false + Use64bShadowLimitMX: false + UseCustomMainLoopSchedule: 0 + UseDirect32XEmulation: false + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: false + UseGeneralizedNLCOneA: false + UseGeneralizedNLCOneB: false + UseGeneralizedNLCOneMXSA: false + UseGeneralizedNLCOneMXSB: false + UseGeneralizedNLCOneMetadata: false + UseInstOffsetForGRO: 0 + UseMFMAF32XEmulation: false + UsePLRPack: 0 + UseSgprForGRO: -1 + UseSubtileImpl: true + Valid: true + VectorStore: -1 + VectorWidthA: 1 + 
VectorWidthB: 1 + VectorWidthMXSA: 1 + VectorWidthMXSB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMXSA: 0 + WaveSeparateGlobalReadMXSB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 16 + WorkGroupMappingXCC: 2 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 256 + _DepthUA: 256 + _DepthUB: 256 + _DepthUMXSA: 8 + _DepthUMXSB: 8 + _DepthUMetadata: 256 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + enableLDSTrMXSA: false + enableLDSTrMXSB: false + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + tailLoopOptMXSA: false + tailLoopOptMXSB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AdaptiveGemm: 0 + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1DivByMT1LowbitGT1: 0 + AssertFree1ElementMultiple: 1 + AssertKRingShiftTailWrapOnly: 0 + AssertSummationElementMultiple: 256 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BAddrInterleave: false + BaseName: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT25yENsNJL4B9Vb5GlLD7ofBX9eYUJtt3JL_VzKdaPOs5g= + BufferLoad: true + BufferStore: 1 + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 256 + DirectToLds: 1 + DirectToLdsA: true + DirectToLdsB: true + DirectToLdsMXSA: true + DirectToLdsMXSB: true + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprMXSA: false + DirectToVgprMXSB: false + DirectToVgprSparseMetadata: false + 
DtlPlusLdsBuf: 0 + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: false + ExpertSchedulingMode: 0 + ExtraLatencyForLR: 0 + ExtraMiLatencyLeft: -1 + ForceDisableShadowInit: false + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 32 + GlobalReadVectorWidthB: 32 + GlobalReadVectorWidthMXSA: 4 + GlobalReadVectorWidthMXSB: 10 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + KRingShift: false + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT256x320x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT8_10_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 256 + LSCB: 256 + LSCMXSA: 8 + LSCMXSB: 8 + LSPA: 32 + LSPB: 32 + LSPMXSA: 128 + LSPMXSB: 320 + LVCA: 8 + LVCB: 8 + LVCMXSA: 2 + LVCMXSB: 1 + LVPA: 1 + LVPB: 1 + LVPMXSA: 32 + LVPMXSB: 32 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsBlockSizePerPadMXSA: 0 + LdsBlockSizePerPadMXSB: 0 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 156672 + LdsInitCVgprs: false + LdsNumBytes: 156672 + 
LdsNumElementsAlignedA: 32768 + LdsNumElementsAlignedB: 40960 + LdsNumElementsAlignedMXSA: 2048 + LdsNumElementsAlignedMXSB: 2560 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 78336 + LdsOffsetB: 37376 + LdsOffsetB_Blk: 115712 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMXSA: 32768 + LdsOffsetMXSA_Blk: 111104 + LdsOffsetMXSB: 34816 + LdsOffsetMXSB_Blk: 113152 + LdsOffsetMetadata: 37376 + LdsOffsetMetadata_Blk: 115712 + LdsPadA: 0 + LdsPadB: 0 + LdsPadMXSA: 0 + LdsPadMXSB: 0 + LdsPadMetadata: 0 + LocalReadVectorWidth: -1 + LocalReadVectorWidthA: 32 + LocalReadVectorWidthB: 32 + LocalReadVectorWidthMXSA: 1 + LocalReadVectorWidthMXSB: 1 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LocalWriteUseSgprMXSA: true + LocalWriteUseSgprMXSB: true + LoopIters: 2 + LoopUnroll: 256 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMXSA: 1 + MIInputPerThreadMXSB: 1 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [8, 10] + MIWaveTileA: 8 + MIWaveTileB: 10 + MIWaveTileMXSA: 8 + MIWaveTileMXSB: 10 + MIWaveTileMetadata: 0 + MacroTile0: 256 + MacroTile1: 320 + MacroTileA: 256 + MacroTileB: 320 + MacroTileMXSA: 256 + MacroTileMXSB: 320 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + MinGRIncPerMfma: 3 + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonDTLTailLoopMXSA: true + NonDTLTailLoopMXSB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 4 + NonTemporalC: 0 + NonTemporalD: 4 + 
NonTemporalE: 0 + NonTemporalMXSA: 0 + NonTemporalMXSB: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 320 + NumGlobalWriteVectorsPerThread: 320 + NumLdsBlk: 2 + NumLoadsA: 8 + NumLoadsB: 10 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsCoalescedMXSA: 1 + NumLoadsCoalescedMXSB: 1 + NumLoadsMXSA: 2 + NumLoadsMXSB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 10 + NumLoadsPerpendicularMXSA: 2 + NumLoadsPerpendicularMXSB: 1 + NumThreads: 256 + NumTotalPackedLoadsA: -1 + NumTotalPackedLoadsB: -1 + NumTotalPackedLoadsMXSA: -1 + NumTotalPackedLoadsMXSB: -1 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGROverBarrier: 0 + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 27 + SolutionNameMin: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT256x320x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT8_10_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM16_WGMXCC2_WGMXCCGn1 + SourceSwap: false + SpaceFillingAlgo: [] + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: true + StoreSyncOpt: 0 + StoreVectorWidth: 4 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + 
SubGroupB: 32 + SubGroupMXSA: 8 + SubGroupMXSB: 32 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: 0 + TailloopInNll: false + ThreadTile: [1, 1] + ThreadTile0: 32 + ThreadTile1: 10 + ThreadTileA: 32 + ThreadTileB: 10 + ThreadTileMXSA: 32 + ThreadTileMXSB: 10 + TransposeLDS: 1 + TransposeLDSMetadata: -1 + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMXSA: true + UnrollMajorLDSMXSB: true + Use64bShadowLimit: false + Use64bShadowLimitMX: false + UseCustomMainLoopSchedule: 0 + UseDirect32XEmulation: false + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: false + UseGeneralizedNLCOneA: false + UseGeneralizedNLCOneB: false + UseGeneralizedNLCOneMXSA: false + UseGeneralizedNLCOneMXSB: false + UseGeneralizedNLCOneMetadata: false + UseInstOffsetForGRO: 0 + UseMFMAF32XEmulation: false + UsePLRPack: 0 + UseSgprForGRO: -1 + UseSubtileImpl: true + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 1 + VectorWidthMXSA: 1 + VectorWidthMXSB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMXSA: 0 + WaveSeparateGlobalReadMXSB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 16 + WorkGroupMappingXCC: 2 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 256 + _DepthUA: 256 + _DepthUB: 256 + _DepthUMXSA: 8 + _DepthUMXSB: 8 + _DepthUMetadata: 256 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + enableLDSTrMXSA: false + enableLDSTrMXSB: false + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + tailLoopOptMXSA: false + tailLoopOptMXSB: false + - 
1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AdaptiveGemm: 0 + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1DivByMT1LowbitGT1: 0 + AssertFree1ElementMultiple: 1 + AssertKRingShiftTailWrapOnly: 0 + AssertSummationElementMultiple: 256 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BAddrInterleave: false + BaseName: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT64IqbWxZkPcZWy3vA8g9HsDHfQ7n72vJptvmQ79eUQ7kk= + BufferLoad: true + BufferStore: 1 + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 256 + DirectToLds: 1 + DirectToLdsA: true + DirectToLdsB: true + DirectToLdsMXSA: true + DirectToLdsMXSB: true + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprMXSA: false + DirectToVgprMXSB: false + DirectToVgprSparseMetadata: false + DtlPlusLdsBuf: 0 + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: false + ExpertSchedulingMode: 0 + ExtraLatencyForLR: 0 + ExtraMiLatencyLeft: -1 + ForceDisableShadowInit: false + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 32 + GlobalReadVectorWidthB: 32 + GlobalReadVectorWidthMXSA: 2 + GlobalReadVectorWidthMXSB: 14 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + KRingShift: false + Kernel: true + KernelLanguage: Assembly + KernelNameMin: 
Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT64x448x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT2_14_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 256 + LSCB: 256 + LSCMXSA: 8 + LSCMXSB: 8 + LSPA: 32 + LSPB: 32 + LSPMXSA: 64 + LSPMXSB: 448 + LVCA: 8 + LVCB: 8 + LVCMXSA: 4 + LVCMXSB: 1 + LVPA: 1 + LVPB: 1 + LVPMXSA: 32 + LVPMXSB: 32 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsBlockSizePerPadMXSA: 0 + LdsBlockSizePerPadMXSB: 0 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 139264 + LdsInitCVgprs: false + LdsNumBytes: 139264 + LdsNumElementsAlignedA: 8192 + LdsNumElementsAlignedB: 57344 + LdsNumElementsAlignedMXSA: 512 + LdsNumElementsAlignedMXSB: 3584 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 69632 + LdsOffsetB: 12288 + LdsOffsetB_Blk: 81920 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMXSA: 8192 + LdsOffsetMXSA_Blk: 77824 + LdsOffsetMXSB: 8704 + LdsOffsetMXSB_Blk: 78336 + LdsOffsetMetadata: 12288 + LdsOffsetMetadata_Blk: 81920 + LdsPadA: 0 + LdsPadB: 0 + LdsPadMXSA: 0 + LdsPadMXSB: 0 + LdsPadMetadata: 0 + LocalReadVectorWidth: -1 + LocalReadVectorWidthA: 32 + LocalReadVectorWidthB: 32 + LocalReadVectorWidthMXSA: 1 + LocalReadVectorWidthMXSB: 1 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LocalWriteUseSgprMXSA: true + LocalWriteUseSgprMXSB: true + LoopIters: 2 + LoopUnroll: 256 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + 
MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMXSA: 1 + MIInputPerThreadMXSB: 1 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [2, 14] + MIWaveTileA: 2 + MIWaveTileB: 14 + MIWaveTileMXSA: 2 + MIWaveTileMXSB: 14 + MIWaveTileMetadata: 0 + MacroTile0: 64 + MacroTile1: 448 + MacroTileA: 64 + MacroTileB: 448 + MacroTileMXSA: 64 + MacroTileMXSB: 448 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + MinGRIncPerMfma: 3 + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonDTLTailLoopMXSA: true + NonDTLTailLoopMXSB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 4 + NonTemporalC: 0 + NonTemporalD: 4 + NonTemporalE: 0 + NonTemporalMXSA: 0 + NonTemporalMXSB: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 112 + NumGlobalWriteVectorsPerThread: 112 + NumLdsBlk: 2 + NumLoadsA: 2 + NumLoadsB: 14 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsCoalescedMXSA: 1 + NumLoadsCoalescedMXSB: 1 + NumLoadsMXSA: 1 + NumLoadsMXSB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 14 + NumLoadsPerpendicularMXSA: 1 + NumLoadsPerpendicularMXSB: 1 + NumThreads: 256 + NumTotalPackedLoadsA: -1 + NumTotalPackedLoadsB: -1 + NumTotalPackedLoadsMXSA: -1 + NumTotalPackedLoadsMXSB: -1 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGROverBarrier: 0 + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 28 + 
SolutionNameMin: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT64x448x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT2_14_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM16_WGMXCC2_WGMXCCGn1 + SourceSwap: false + SpaceFillingAlgo: [] + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: true + StoreSyncOpt: 0 + StoreVectorWidth: 4 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SubGroupMXSA: 8 + SubGroupMXSB: 32 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: 0 + TailloopInNll: false + ThreadTile: [1, 1] + ThreadTile0: 8 + ThreadTile1: 14 + ThreadTileA: 8 + ThreadTileB: 14 + ThreadTileMXSA: 8 + ThreadTileMXSB: 14 + TransposeLDS: 1 + TransposeLDSMetadata: -1 + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMXSA: true + UnrollMajorLDSMXSB: true + Use64bShadowLimit: false + Use64bShadowLimitMX: false + UseCustomMainLoopSchedule: 0 + UseDirect32XEmulation: false + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: false + UseGeneralizedNLCOneA: false + UseGeneralizedNLCOneB: false + UseGeneralizedNLCOneMXSA: false + UseGeneralizedNLCOneMXSB: false + UseGeneralizedNLCOneMetadata: false + UseInstOffsetForGRO: 0 + UseMFMAF32XEmulation: false + UsePLRPack: 0 + UseSgprForGRO: -1 + UseSubtileImpl: true + Valid: true + VectorStore: -1 + VectorWidthA: 1 + 
VectorWidthB: 1 + VectorWidthMXSA: 1 + VectorWidthMXSB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMXSA: 0 + WaveSeparateGlobalReadMXSB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 16 + WorkGroupMappingXCC: 2 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 256 + _DepthUA: 256 + _DepthUB: 256 + _DepthUMXSA: 8 + _DepthUMXSB: 8 + _DepthUMetadata: 256 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + enableLDSTrMXSA: false + enableLDSTrMXSB: false + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + tailLoopOptMXSA: false + tailLoopOptMXSB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AdaptiveGemm: 0 + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1DivByMT1LowbitGT1: 0 + AssertFree1ElementMultiple: 1 + AssertKRingShiftTailWrapOnly: 0 + AssertSummationElementMultiple: 256 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BAddrInterleave: false + BaseName: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT44JIOkXLbpjhS22SFIEupmgY9bhtzcXu69kfhwBJBO5wo= + BufferLoad: true + BufferStore: 1 + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 256 + DirectToLds: 1 + DirectToLdsA: true + DirectToLdsB: true + DirectToLdsMXSA: true + DirectToLdsMXSB: true + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprMXSA: false + DirectToVgprMXSB: false + DirectToVgprSparseMetadata: false + 
DtlPlusLdsBuf: 0 + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: false + ExpertSchedulingMode: 0 + ExtraLatencyForLR: 0 + ExtraMiLatencyLeft: -1 + ForceDisableShadowInit: false + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 32 + GlobalReadVectorWidthB: 32 + GlobalReadVectorWidthMXSA: 14 + GlobalReadVectorWidthMXSB: 2 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + KRingShift: false + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT448x64x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT14_2_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 256 + LSCB: 256 + LSCMXSA: 8 + LSCMXSB: 8 + LSPA: 32 + LSPB: 32 + LSPMXSA: 448 + LSPMXSB: 64 + LVCA: 8 + LVCB: 8 + LVCMXSA: 1 + LVCMXSB: 4 + LVPA: 1 + LVPB: 1 + LVPMXSA: 32 + LVPMXSB: 32 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsBlockSizePerPadMXSA: 0 + LdsBlockSizePerPadMXSB: 0 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 139264 + LdsInitCVgprs: false + LdsNumBytes: 139264 + 
LdsNumElementsAlignedA: 57344 + LdsNumElementsAlignedB: 8192 + LdsNumElementsAlignedMXSA: 3584 + LdsNumElementsAlignedMXSB: 512 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 69632 + LdsOffsetB: 61440 + LdsOffsetB_Blk: 131072 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMXSA: 57344 + LdsOffsetMXSA_Blk: 126976 + LdsOffsetMXSB: 60928 + LdsOffsetMXSB_Blk: 130560 + LdsOffsetMetadata: 61440 + LdsOffsetMetadata_Blk: 131072 + LdsPadA: 0 + LdsPadB: 0 + LdsPadMXSA: 0 + LdsPadMXSB: 0 + LdsPadMetadata: 0 + LocalReadVectorWidth: -1 + LocalReadVectorWidthA: 32 + LocalReadVectorWidthB: 32 + LocalReadVectorWidthMXSA: 1 + LocalReadVectorWidthMXSB: 1 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LocalWriteUseSgprMXSA: true + LocalWriteUseSgprMXSB: true + LoopIters: 2 + LoopUnroll: 256 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMXSA: 1 + MIInputPerThreadMXSB: 1 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [14, 2] + MIWaveTileA: 14 + MIWaveTileB: 2 + MIWaveTileMXSA: 14 + MIWaveTileMXSB: 2 + MIWaveTileMetadata: 0 + MacroTile0: 448 + MacroTile1: 64 + MacroTileA: 448 + MacroTileB: 64 + MacroTileMXSA: 448 + MacroTileMXSB: 64 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + MinGRIncPerMfma: 3 + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonDTLTailLoopMXSA: true + NonDTLTailLoopMXSB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 4 + NonTemporalC: 0 + NonTemporalD: 4 + 
NonTemporalE: 0 + NonTemporalMXSA: 0 + NonTemporalMXSB: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 112 + NumGlobalWriteVectorsPerThread: 112 + NumLdsBlk: 2 + NumLoadsA: 14 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsCoalescedMXSA: 1 + NumLoadsCoalescedMXSB: 1 + NumLoadsMXSA: 1 + NumLoadsMXSB: 1 + NumLoadsPerpendicularA: 14 + NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularMXSA: 1 + NumLoadsPerpendicularMXSB: 1 + NumThreads: 256 + NumTotalPackedLoadsA: -1 + NumTotalPackedLoadsB: -1 + NumTotalPackedLoadsMXSA: -1 + NumTotalPackedLoadsMXSB: -1 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGROverBarrier: 0 + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 29 + SolutionNameMin: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT448x64x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT14_2_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM16_WGMXCC2_WGMXCCGn1 + SourceSwap: false + SpaceFillingAlgo: [] + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: true + StoreSyncOpt: 0 + StoreVectorWidth: 4 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 
32 + SubGroupMXSA: 8 + SubGroupMXSB: 32 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: 0 + TailloopInNll: false + ThreadTile: [1, 1] + ThreadTile0: 56 + ThreadTile1: 2 + ThreadTileA: 56 + ThreadTileB: 2 + ThreadTileMXSA: 56 + ThreadTileMXSB: 2 + TransposeLDS: 1 + TransposeLDSMetadata: -1 + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMXSA: true + UnrollMajorLDSMXSB: true + Use64bShadowLimit: false + Use64bShadowLimitMX: false + UseCustomMainLoopSchedule: 0 + UseDirect32XEmulation: false + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: false + UseGeneralizedNLCOneA: false + UseGeneralizedNLCOneB: false + UseGeneralizedNLCOneMXSA: false + UseGeneralizedNLCOneMXSB: false + UseGeneralizedNLCOneMetadata: false + UseInstOffsetForGRO: 0 + UseMFMAF32XEmulation: false + UsePLRPack: 0 + UseSgprForGRO: -1 + UseSubtileImpl: true + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 1 + VectorWidthMXSA: 1 + VectorWidthMXSB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMXSA: 0 + WaveSeparateGlobalReadMXSB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 16 + WorkGroupMappingXCC: 2 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 256 + _DepthUA: 256 + _DepthUB: 256 + _DepthUMXSA: 8 + _DepthUMXSB: 8 + _DepthUMetadata: 256 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + enableLDSTrMXSA: false + enableLDSTrMXSB: false + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + tailLoopOptMXSA: false + tailLoopOptMXSB: false + - 1LDSBuffer: 0 + 
ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AdaptiveGemm: 0 + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1DivByMT1LowbitGT1: 0 + AssertFree1ElementMultiple: 1 + AssertKRingShiftTailWrapOnly: 0 + AssertSummationElementMultiple: 256 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BAddrInterleave: false + BaseName: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT12NCUb78G7QzN6ng2b3-8rphTRtJXo20S1KvmwH6k1FLQ= + BufferLoad: true + BufferStore: 1 + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 256 + DirectToLds: 1 + DirectToLdsA: true + DirectToLdsB: true + DirectToLdsMXSA: true + DirectToLdsMXSB: true + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprMXSA: false + DirectToVgprMXSB: false + DirectToVgprSparseMetadata: false + DtlPlusLdsBuf: 0 + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: false + ExpertSchedulingMode: 0 + ExtraLatencyForLR: 0 + ExtraMiLatencyLeft: -1 + ForceDisableShadowInit: false + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 32 + GlobalReadVectorWidthB: 32 + GlobalReadVectorWidthMXSA: 4 + GlobalReadVectorWidthMXSB: 14 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + KRingShift: false + Kernel: true + KernelLanguage: Assembly + KernelNameMin: 
Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT128x448x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT4_14_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 256 + LSCB: 256 + LSCMXSA: 8 + LSCMXSB: 8 + LSPA: 32 + LSPB: 32 + LSPMXSA: 128 + LSPMXSB: 448 + LVCA: 8 + LVCB: 8 + LVCMXSA: 2 + LVCMXSB: 1 + LVPA: 1 + LVPB: 1 + LVPMXSA: 32 + LVPMXSB: 32 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsBlockSizePerPadMXSA: 0 + LdsBlockSizePerPadMXSB: 0 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 156672 + LdsInitCVgprs: false + LdsNumBytes: 156672 + LdsNumElementsAlignedA: 16384 + LdsNumElementsAlignedB: 57344 + LdsNumElementsAlignedMXSA: 1024 + LdsNumElementsAlignedMXSB: 3584 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 78336 + LdsOffsetB: 20992 + LdsOffsetB_Blk: 99328 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMXSA: 16384 + LdsOffsetMXSA_Blk: 94720 + LdsOffsetMXSB: 17408 + LdsOffsetMXSB_Blk: 95744 + LdsOffsetMetadata: 20992 + LdsOffsetMetadata_Blk: 99328 + LdsPadA: 0 + LdsPadB: 0 + LdsPadMXSA: 0 + LdsPadMXSB: 0 + LdsPadMetadata: 0 + LocalReadVectorWidth: -1 + LocalReadVectorWidthA: 32 + LocalReadVectorWidthB: 32 + LocalReadVectorWidthMXSA: 1 + LocalReadVectorWidthMXSB: 1 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LocalWriteUseSgprMXSA: true + LocalWriteUseSgprMXSB: true + LoopIters: 2 + LoopUnroll: 256 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + 
MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMXSA: 1 + MIInputPerThreadMXSB: 1 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 14] + MIWaveTileA: 4 + MIWaveTileB: 14 + MIWaveTileMXSA: 4 + MIWaveTileMXSB: 14 + MIWaveTileMetadata: 0 + MacroTile0: 128 + MacroTile1: 448 + MacroTileA: 128 + MacroTileB: 448 + MacroTileMXSA: 128 + MacroTileMXSB: 448 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + MinGRIncPerMfma: 3 + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonDTLTailLoopMXSA: true + NonDTLTailLoopMXSB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 4 + NonTemporalC: 0 + NonTemporalD: 4 + NonTemporalE: 0 + NonTemporalMXSA: 0 + NonTemporalMXSB: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 224 + NumGlobalWriteVectorsPerThread: 224 + NumLdsBlk: 2 + NumLoadsA: 4 + NumLoadsB: 14 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsCoalescedMXSA: 1 + NumLoadsCoalescedMXSB: 1 + NumLoadsMXSA: 1 + NumLoadsMXSB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 14 + NumLoadsPerpendicularMXSA: 1 + NumLoadsPerpendicularMXSB: 1 + NumThreads: 256 + NumTotalPackedLoadsA: -1 + NumTotalPackedLoadsB: -1 + NumTotalPackedLoadsMXSA: -1 + NumTotalPackedLoadsMXSB: -1 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGROverBarrier: 0 + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 30 + 
SolutionNameMin: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT128x448x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT4_14_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM16_WGMXCC2_WGMXCCGn1 + SourceSwap: false + SpaceFillingAlgo: [] + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: true + StoreSyncOpt: 0 + StoreVectorWidth: 4 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SubGroupMXSA: 8 + SubGroupMXSB: 32 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: 0 + TailloopInNll: false + ThreadTile: [1, 1] + ThreadTile0: 16 + ThreadTile1: 14 + ThreadTileA: 16 + ThreadTileB: 14 + ThreadTileMXSA: 16 + ThreadTileMXSB: 14 + TransposeLDS: 1 + TransposeLDSMetadata: -1 + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMXSA: true + UnrollMajorLDSMXSB: true + Use64bShadowLimit: false + Use64bShadowLimitMX: false + UseCustomMainLoopSchedule: 0 + UseDirect32XEmulation: false + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: false + UseGeneralizedNLCOneA: false + UseGeneralizedNLCOneB: false + UseGeneralizedNLCOneMXSA: false + UseGeneralizedNLCOneMXSB: false + UseGeneralizedNLCOneMetadata: false + UseInstOffsetForGRO: 0 + UseMFMAF32XEmulation: false + UsePLRPack: 0 + UseSgprForGRO: -1 + UseSubtileImpl: true + Valid: true + VectorStore: -1 + VectorWidthA: 1 
+ VectorWidthB: 1 + VectorWidthMXSA: 1 + VectorWidthMXSB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMXSA: 0 + WaveSeparateGlobalReadMXSB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 16 + WorkGroupMappingXCC: 2 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 256 + _DepthUA: 256 + _DepthUB: 256 + _DepthUMXSA: 8 + _DepthUMXSB: 8 + _DepthUMetadata: 256 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + enableLDSTrMXSA: false + enableLDSTrMXSB: false + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + tailLoopOptMXSA: false + tailLoopOptMXSB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AdaptiveGemm: 0 + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1DivByMT1LowbitGT1: 0 + AssertFree1ElementMultiple: 1 + AssertKRingShiftTailWrapOnly: 0 + AssertSummationElementMultiple: 256 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BAddrInterleave: false + BaseName: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT44K2rcrzW2gkNghqcoHIxk3lU0xSXGNO6q0t-lzqdm54M= + BufferLoad: true + BufferStore: 1 + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 256 + DirectToLds: 1 + DirectToLdsA: true + DirectToLdsB: true + DirectToLdsMXSA: true + DirectToLdsMXSB: true + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprMXSA: false + DirectToVgprMXSB: false + DirectToVgprSparseMetadata: false 
+ DtlPlusLdsBuf: 0 + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: false + ExpertSchedulingMode: 0 + ExtraLatencyForLR: 0 + ExtraMiLatencyLeft: -1 + ForceDisableShadowInit: false + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 32 + GlobalReadVectorWidthB: 32 + GlobalReadVectorWidthMXSA: 14 + GlobalReadVectorWidthMXSB: 4 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + KRingShift: false + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT448x128x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT14_4_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 256 + LSCB: 256 + LSCMXSA: 8 + LSCMXSB: 8 + LSPA: 32 + LSPB: 32 + LSPMXSA: 448 + LSPMXSB: 128 + LVCA: 8 + LVCB: 8 + LVCMXSA: 1 + LVCMXSB: 2 + LVPA: 1 + LVPB: 1 + LVPMXSA: 32 + LVPMXSB: 32 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsBlockSizePerPadMXSA: 0 + LdsBlockSizePerPadMXSB: 0 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 156672 + LdsInitCVgprs: false + LdsNumBytes: 156672 + 
LdsNumElementsAlignedA: 57344 + LdsNumElementsAlignedB: 16384 + LdsNumElementsAlignedMXSA: 3584 + LdsNumElementsAlignedMXSB: 1024 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 78336 + LdsOffsetB: 61952 + LdsOffsetB_Blk: 140288 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMXSA: 57344 + LdsOffsetMXSA_Blk: 135680 + LdsOffsetMXSB: 60928 + LdsOffsetMXSB_Blk: 139264 + LdsOffsetMetadata: 61952 + LdsOffsetMetadata_Blk: 140288 + LdsPadA: 0 + LdsPadB: 0 + LdsPadMXSA: 0 + LdsPadMXSB: 0 + LdsPadMetadata: 0 + LocalReadVectorWidth: -1 + LocalReadVectorWidthA: 32 + LocalReadVectorWidthB: 32 + LocalReadVectorWidthMXSA: 1 + LocalReadVectorWidthMXSB: 1 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LocalWriteUseSgprMXSA: true + LocalWriteUseSgprMXSB: true + LoopIters: 2 + LoopUnroll: 256 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMXSA: 1 + MIInputPerThreadMXSB: 1 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [14, 4] + MIWaveTileA: 14 + MIWaveTileB: 4 + MIWaveTileMXSA: 14 + MIWaveTileMXSB: 4 + MIWaveTileMetadata: 0 + MacroTile0: 448 + MacroTile1: 128 + MacroTileA: 448 + MacroTileB: 128 + MacroTileMXSA: 448 + MacroTileMXSB: 128 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + MinGRIncPerMfma: 3 + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonDTLTailLoopMXSA: true + NonDTLTailLoopMXSB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 4 + NonTemporalC: 0 + NonTemporalD: 4 + 
NonTemporalE: 0 + NonTemporalMXSA: 0 + NonTemporalMXSB: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 224 + NumGlobalWriteVectorsPerThread: 224 + NumLdsBlk: 2 + NumLoadsA: 14 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsCoalescedMXSA: 1 + NumLoadsCoalescedMXSB: 1 + NumLoadsMXSA: 1 + NumLoadsMXSB: 1 + NumLoadsPerpendicularA: 14 + NumLoadsPerpendicularB: 4 + NumLoadsPerpendicularMXSA: 1 + NumLoadsPerpendicularMXSB: 1 + NumThreads: 256 + NumTotalPackedLoadsA: -1 + NumTotalPackedLoadsB: -1 + NumTotalPackedLoadsMXSA: -1 + NumTotalPackedLoadsMXSB: -1 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGROverBarrier: 0 + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 31 + SolutionNameMin: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT448x128x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT14_4_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM16_WGMXCC2_WGMXCCGn1 + SourceSwap: false + SpaceFillingAlgo: [] + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: true + StoreSyncOpt: 0 + StoreVectorWidth: 4 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + 
SubGroupB: 32 + SubGroupMXSA: 8 + SubGroupMXSB: 32 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: 0 + TailloopInNll: false + ThreadTile: [1, 1] + ThreadTile0: 56 + ThreadTile1: 4 + ThreadTileA: 56 + ThreadTileB: 4 + ThreadTileMXSA: 56 + ThreadTileMXSB: 4 + TransposeLDS: 1 + TransposeLDSMetadata: -1 + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMXSA: true + UnrollMajorLDSMXSB: true + Use64bShadowLimit: false + Use64bShadowLimitMX: false + UseCustomMainLoopSchedule: 0 + UseDirect32XEmulation: false + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: false + UseGeneralizedNLCOneA: false + UseGeneralizedNLCOneB: false + UseGeneralizedNLCOneMXSA: false + UseGeneralizedNLCOneMXSB: false + UseGeneralizedNLCOneMetadata: false + UseInstOffsetForGRO: 0 + UseMFMAF32XEmulation: false + UsePLRPack: 0 + UseSgprForGRO: -1 + UseSubtileImpl: true + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 1 + VectorWidthMXSA: 1 + VectorWidthMXSB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMXSA: 0 + WaveSeparateGlobalReadMXSB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 16 + WorkGroupMappingXCC: 2 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 256 + _DepthUA: 256 + _DepthUB: 256 + _DepthUMXSA: 8 + _DepthUMXSB: 8 + _DepthUMetadata: 256 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + enableLDSTrMXSA: false + enableLDSTrMXSB: false + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + tailLoopOptMXSA: false + tailLoopOptMXSB: false + - 1LDSBuffer: 
0 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AdaptiveGemm: 0 + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1DivByMT1LowbitGT1: 0 + AssertFree1ElementMultiple: 1 + AssertKRingShiftTailWrapOnly: 0 + AssertSummationElementMultiple: 256 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BAddrInterleave: false + BaseName: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT197rqXtfRTrgckooWIHV9QRZkAUvBYDajyR7fyXPSaOfI= + BufferLoad: true + BufferStore: 1 + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 256 + DirectToLds: 1 + DirectToLdsA: true + DirectToLdsB: true + DirectToLdsMXSA: true + DirectToLdsMXSB: true + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprMXSA: false + DirectToVgprMXSB: false + DirectToVgprSparseMetadata: false + DtlPlusLdsBuf: 0 + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: false + ExpertSchedulingMode: 0 + ExtraLatencyForLR: 0 + ExtraMiLatencyLeft: -1 + ForceDisableShadowInit: false + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 32 + GlobalReadVectorWidthB: 32 + GlobalReadVectorWidthMXSA: 6 + GlobalReadVectorWidthMXSB: 12 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + KRingShift: false + Kernel: true + KernelLanguage: Assembly + KernelNameMin: 
Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT192x384x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT6_12_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 256 + LSCB: 256 + LSCMXSA: 8 + LSCMXSB: 8 + LSPA: 32 + LSPB: 32 + LSPMXSA: 192 + LSPMXSB: 384 + LVCA: 8 + LVCB: 8 + LVCMXSA: 2 + LVCMXSB: 1 + LVPA: 1 + LVPB: 1 + LVPMXSA: 32 + LVPMXSB: 32 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsBlockSizePerPadMXSA: 0 + LdsBlockSizePerPadMXSB: 0 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 156672 + LdsInitCVgprs: false + LdsNumBytes: 156672 + LdsNumElementsAlignedA: 24576 + LdsNumElementsAlignedB: 49152 + LdsNumElementsAlignedMXSA: 1536 + LdsNumElementsAlignedMXSB: 3072 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 78336 + LdsOffsetB: 29184 + LdsOffsetB_Blk: 107520 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMXSA: 24576 + LdsOffsetMXSA_Blk: 102912 + LdsOffsetMXSB: 26112 + LdsOffsetMXSB_Blk: 104448 + LdsOffsetMetadata: 29184 + LdsOffsetMetadata_Blk: 107520 + LdsPadA: 0 + LdsPadB: 0 + LdsPadMXSA: 0 + LdsPadMXSB: 0 + LdsPadMetadata: 0 + LocalReadVectorWidth: -1 + LocalReadVectorWidthA: 32 + LocalReadVectorWidthB: 32 + LocalReadVectorWidthMXSA: 1 + LocalReadVectorWidthMXSB: 1 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LocalWriteUseSgprMXSA: true + LocalWriteUseSgprMXSB: true + LoopIters: 2 + LoopUnroll: 256 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + 
MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMXSA: 1 + MIInputPerThreadMXSB: 1 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [6, 12] + MIWaveTileA: 6 + MIWaveTileB: 12 + MIWaveTileMXSA: 6 + MIWaveTileMXSB: 12 + MIWaveTileMetadata: 0 + MacroTile0: 192 + MacroTile1: 384 + MacroTileA: 192 + MacroTileB: 384 + MacroTileMXSA: 192 + MacroTileMXSB: 384 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + MinGRIncPerMfma: 3 + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonDTLTailLoopMXSA: true + NonDTLTailLoopMXSB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 4 + NonTemporalC: 0 + NonTemporalD: 4 + NonTemporalE: 0 + NonTemporalMXSA: 0 + NonTemporalMXSB: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 288 + NumGlobalWriteVectorsPerThread: 288 + NumLdsBlk: 2 + NumLoadsA: 6 + NumLoadsB: 12 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsCoalescedMXSA: 1 + NumLoadsCoalescedMXSB: 1 + NumLoadsMXSA: 1 + NumLoadsMXSB: 1 + NumLoadsPerpendicularA: 6 + NumLoadsPerpendicularB: 12 + NumLoadsPerpendicularMXSA: 1 + NumLoadsPerpendicularMXSB: 1 + NumThreads: 256 + NumTotalPackedLoadsA: -1 + NumTotalPackedLoadsB: -1 + NumTotalPackedLoadsMXSA: -1 + NumTotalPackedLoadsMXSB: -1 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGROverBarrier: 0 + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 32 + 
SolutionNameMin: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT192x384x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT6_12_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM16_WGMXCC2_WGMXCCGn1 + SourceSwap: false + SpaceFillingAlgo: [] + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: true + StoreSyncOpt: 0 + StoreVectorWidth: 4 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SubGroupMXSA: 8 + SubGroupMXSB: 32 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: 0 + TailloopInNll: false + ThreadTile: [1, 1] + ThreadTile0: 24 + ThreadTile1: 12 + ThreadTileA: 24 + ThreadTileB: 12 + ThreadTileMXSA: 24 + ThreadTileMXSB: 12 + TransposeLDS: 1 + TransposeLDSMetadata: -1 + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMXSA: true + UnrollMajorLDSMXSB: true + Use64bShadowLimit: false + Use64bShadowLimitMX: false + UseCustomMainLoopSchedule: 0 + UseDirect32XEmulation: false + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: false + UseGeneralizedNLCOneA: false + UseGeneralizedNLCOneB: false + UseGeneralizedNLCOneMXSA: false + UseGeneralizedNLCOneMXSB: false + UseGeneralizedNLCOneMetadata: false + UseInstOffsetForGRO: 0 + UseMFMAF32XEmulation: false + UsePLRPack: 0 + UseSgprForGRO: -1 + UseSubtileImpl: true + Valid: true + VectorStore: -1 + VectorWidthA: 1 
+ VectorWidthB: 1 + VectorWidthMXSA: 1 + VectorWidthMXSB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMXSA: 0 + WaveSeparateGlobalReadMXSB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 16 + WorkGroupMappingXCC: 2 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 256 + _DepthUA: 256 + _DepthUB: 256 + _DepthUMXSA: 8 + _DepthUMXSB: 8 + _DepthUMetadata: 256 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + enableLDSTrMXSA: false + enableLDSTrMXSB: false + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + tailLoopOptMXSA: false + tailLoopOptMXSB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AdaptiveGemm: 0 + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1DivByMT1LowbitGT1: 0 + AssertFree1ElementMultiple: 1 + AssertKRingShiftTailWrapOnly: 0 + AssertSummationElementMultiple: 256 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BAddrInterleave: false + BaseName: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT38M4Hg58ewW76BBGjsIKCCfd9HK3CXsd22WqnddQrMdIo= + BufferLoad: true + BufferStore: 1 + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 256 + DirectToLds: 1 + DirectToLdsA: true + DirectToLdsB: true + DirectToLdsMXSA: true + DirectToLdsMXSB: true + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprMXSA: false + DirectToVgprMXSB: false + DirectToVgprSparseMetadata: false 
+ DtlPlusLdsBuf: 0 + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: false + ExpertSchedulingMode: 0 + ExtraLatencyForLR: 0 + ExtraMiLatencyLeft: -1 + ForceDisableShadowInit: false + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 32 + GlobalReadVectorWidthB: 32 + GlobalReadVectorWidthMXSA: 12 + GlobalReadVectorWidthMXSB: 6 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + KRingShift: false + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT384x192x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT12_6_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 256 + LSCB: 256 + LSCMXSA: 8 + LSCMXSB: 8 + LSPA: 32 + LSPB: 32 + LSPMXSA: 384 + LSPMXSB: 192 + LVCA: 8 + LVCB: 8 + LVCMXSA: 1 + LVCMXSB: 2 + LVPA: 1 + LVPB: 1 + LVPMXSA: 32 + LVPMXSB: 32 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsBlockSizePerPadMXSA: 0 + LdsBlockSizePerPadMXSB: 0 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 156672 + LdsInitCVgprs: false + LdsNumBytes: 156672 + 
LdsNumElementsAlignedA: 49152 + LdsNumElementsAlignedB: 24576 + LdsNumElementsAlignedMXSA: 3072 + LdsNumElementsAlignedMXSB: 1536 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 78336 + LdsOffsetB: 53760 + LdsOffsetB_Blk: 132096 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMXSA: 49152 + LdsOffsetMXSA_Blk: 127488 + LdsOffsetMXSB: 52224 + LdsOffsetMXSB_Blk: 130560 + LdsOffsetMetadata: 53760 + LdsOffsetMetadata_Blk: 132096 + LdsPadA: 0 + LdsPadB: 0 + LdsPadMXSA: 0 + LdsPadMXSB: 0 + LdsPadMetadata: 0 + LocalReadVectorWidth: -1 + LocalReadVectorWidthA: 32 + LocalReadVectorWidthB: 32 + LocalReadVectorWidthMXSA: 1 + LocalReadVectorWidthMXSB: 1 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LocalWriteUseSgprMXSA: true + LocalWriteUseSgprMXSB: true + LoopIters: 2 + LoopUnroll: 256 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMXSA: 1 + MIInputPerThreadMXSB: 1 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [12, 6] + MIWaveTileA: 12 + MIWaveTileB: 6 + MIWaveTileMXSA: 12 + MIWaveTileMXSB: 6 + MIWaveTileMetadata: 0 + MacroTile0: 384 + MacroTile1: 192 + MacroTileA: 384 + MacroTileB: 192 + MacroTileMXSA: 384 + MacroTileMXSB: 192 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + MinGRIncPerMfma: 3 + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonDTLTailLoopMXSA: true + NonDTLTailLoopMXSB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 4 + NonTemporalC: 0 + NonTemporalD: 4 + 
NonTemporalE: 0 + NonTemporalMXSA: 0 + NonTemporalMXSB: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 288 + NumGlobalWriteVectorsPerThread: 288 + NumLdsBlk: 2 + NumLoadsA: 12 + NumLoadsB: 6 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsCoalescedMXSA: 1 + NumLoadsCoalescedMXSB: 1 + NumLoadsMXSA: 1 + NumLoadsMXSB: 1 + NumLoadsPerpendicularA: 12 + NumLoadsPerpendicularB: 6 + NumLoadsPerpendicularMXSA: 1 + NumLoadsPerpendicularMXSB: 1 + NumThreads: 256 + NumTotalPackedLoadsA: -1 + NumTotalPackedLoadsB: -1 + NumTotalPackedLoadsMXSA: -1 + NumTotalPackedLoadsMXSB: -1 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGROverBarrier: 0 + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 33 + SolutionNameMin: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT384x192x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT12_6_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM16_WGMXCC2_WGMXCCGn1 + SourceSwap: false + SpaceFillingAlgo: [] + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: true + StoreSyncOpt: 0 + StoreVectorWidth: 4 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + 
SubGroupB: 32 + SubGroupMXSA: 8 + SubGroupMXSB: 32 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: 0 + TailloopInNll: false + ThreadTile: [1, 1] + ThreadTile0: 48 + ThreadTile1: 6 + ThreadTileA: 48 + ThreadTileB: 6 + ThreadTileMXSA: 48 + ThreadTileMXSB: 6 + TransposeLDS: 1 + TransposeLDSMetadata: -1 + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMXSA: true + UnrollMajorLDSMXSB: true + Use64bShadowLimit: false + Use64bShadowLimitMX: false + UseCustomMainLoopSchedule: 0 + UseDirect32XEmulation: false + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: false + UseGeneralizedNLCOneA: false + UseGeneralizedNLCOneB: false + UseGeneralizedNLCOneMXSA: false + UseGeneralizedNLCOneMXSB: false + UseGeneralizedNLCOneMetadata: false + UseInstOffsetForGRO: 0 + UseMFMAF32XEmulation: false + UsePLRPack: 0 + UseSgprForGRO: -1 + UseSubtileImpl: true + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 1 + VectorWidthMXSA: 1 + VectorWidthMXSB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMXSA: 0 + WaveSeparateGlobalReadMXSB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 16 + WorkGroupMappingXCC: 2 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 256 + _DepthUA: 256 + _DepthUB: 256 + _DepthUMXSA: 8 + _DepthUMXSB: 8 + _DepthUMetadata: 256 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + enableLDSTrMXSA: false + enableLDSTrMXSB: false + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + tailLoopOptMXSA: false + tailLoopOptMXSB: false + - 1LDSBuffer: 
0 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AdaptiveGemm: 0 + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1DivByMT1LowbitGT1: 0 + AssertFree1ElementMultiple: 1 + AssertKRingShiftTailWrapOnly: 0 + AssertSummationElementMultiple: 256 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BAddrInterleave: false + BaseName: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT12pSBqfd5dzj78eX1t5EQJzYZ5aHFVtQ9uJHS361sw3qY= + BufferLoad: true + BufferStore: 1 + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 256 + DirectToLds: 1 + DirectToLdsA: true + DirectToLdsB: true + DirectToLdsMXSA: true + DirectToLdsMXSB: true + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprMXSA: false + DirectToVgprMXSB: false + DirectToVgprSparseMetadata: false + DtlPlusLdsBuf: 0 + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: false + ExpertSchedulingMode: 0 + ExtraLatencyForLR: 0 + ExtraMiLatencyLeft: -1 + ForceDisableShadowInit: false + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 32 + GlobalReadVectorWidthB: 32 + GlobalReadVectorWidthMXSA: 4 + GlobalReadVectorWidthMXSB: 1 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + KRingShift: false + Kernel: true + KernelLanguage: Assembly + KernelNameMin: 
Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT128x32x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT2_2_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1 + LDSTrInst: false + LSCA: 256 + LSCB: 256 + LSCMXSA: 8 + LSCMXSB: 8 + LSPA: 32 + LSPB: 32 + LSPMXSA: 128 + LSPMXSB: 32 + LVCA: 8 + LVCB: 8 + LVCMXSA: 2 + LVCMXSB: 8 + LVPA: 1 + LVPB: 1 + LVPMXSA: 32 + LVPMXSB: 32 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsBlockSizePerPadMXSA: 0 + LdsBlockSizePerPadMXSB: 0 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 54528 + LdsInitCVgprs: false + LdsNumBytes: 54528 + LdsNumElementsAlignedA: 16384 + LdsNumElementsAlignedB: 4096 + LdsNumElementsAlignedMXSA: 1024 + LdsNumElementsAlignedMXSB: 256 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 32768 + LdsOffsetB: 17664 + LdsOffsetB_Blk: 50432 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMXSA: 16384 + LdsOffsetMXSA_Blk: 49152 + LdsOffsetMXSB: 17408 + LdsOffsetMXSB_Blk: 50176 + LdsOffsetMetadata: 17664 + LdsOffsetMetadata_Blk: 50432 + LdsPadA: 0 + LdsPadB: 0 + LdsPadMXSA: 0 + LdsPadMXSB: 0 + LdsPadMetadata: 0 + LocalReadVectorWidth: -1 + LocalReadVectorWidthA: 32 + LocalReadVectorWidthB: 32 + LocalReadVectorWidthMXSA: 1 + LocalReadVectorWidthMXSB: 1 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LocalWriteUseSgprMXSA: true + LocalWriteUseSgprMXSB: true + LoopIters: 2 + LoopUnroll: 256 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + 
MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMXSA: 1 + MIInputPerThreadMXSB: 1 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [4, 1] + MIWaveTile: [2, 2] + MIWaveTileA: 2 + MIWaveTileB: 2 + MIWaveTileMXSA: 2 + MIWaveTileMXSB: 2 + MIWaveTileMetadata: 0 + MacroTile0: 128 + MacroTile1: 32 + MacroTileA: 128 + MacroTileB: 32 + MacroTileMXSA: 128 + MacroTileMXSB: 32 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + MinGRIncPerMfma: 3 + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonDTLTailLoopMXSA: true + NonDTLTailLoopMXSB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 4 + NonTemporalC: 0 + NonTemporalD: 4 + NonTemporalE: 0 + NonTemporalMXSA: 0 + NonTemporalMXSB: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 16 + NumLdsBlk: 2 + NumLoadsA: 4 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsCoalescedMXSA: 1 + NumLoadsCoalescedMXSB: 1 + NumLoadsMXSA: 1 + NumLoadsMXSB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularMXSA: 1 + NumLoadsPerpendicularMXSB: 1 + NumThreads: 256 + NumTotalPackedLoadsA: -1 + NumTotalPackedLoadsB: -1 + NumTotalPackedLoadsMXSA: -1 + NumTotalPackedLoadsMXSB: -1 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGROverBarrier: 0 + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 34 + SolutionNameMin: 
Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT128x32x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT2_2_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM16_WGMXCC2_WGMXCCGn1 + SourceSwap: false + SpaceFillingAlgo: [] + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 4 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SubGroupMXSA: 16 + SubGroupMXSB: 16 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: 0 + TailloopInNll: false + ThreadTile: [1, 1] + ThreadTile0: 8 + ThreadTile1: 2 + ThreadTileA: 8 + ThreadTileB: 2 + ThreadTileMXSA: 8 + ThreadTileMXSB: 2 + TransposeLDS: 1 + TransposeLDSMetadata: -1 + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMXSA: true + UnrollMajorLDSMXSB: true + Use64bShadowLimit: false + Use64bShadowLimitMX: false + UseCustomMainLoopSchedule: 0 + UseDirect32XEmulation: false + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: false + UseGeneralizedNLCOneA: false + UseGeneralizedNLCOneB: false + UseGeneralizedNLCOneMXSA: false + UseGeneralizedNLCOneMXSB: false + UseGeneralizedNLCOneMetadata: false + UseInstOffsetForGRO: 0 + UseMFMAF32XEmulation: false + UsePLRPack: 0 + UseSgprForGRO: -1 + UseSubtileImpl: true + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 1 + 
VectorWidthMXSA: 1 + VectorWidthMXSB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMXSA: 0 + WaveSeparateGlobalReadMXSB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [64, 4, 1] + WorkGroupMapping: 16 + WorkGroupMappingXCC: 2 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 256 + _DepthUA: 256 + _DepthUB: 256 + _DepthUMXSA: 8 + _DepthUMXSB: 8 + _DepthUMetadata: 256 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + enableLDSTrMXSA: false + enableLDSTrMXSB: false + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + tailLoopOptMXSA: false + tailLoopOptMXSB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AdaptiveGemm: 0 + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1DivByMT1LowbitGT1: 0 + AssertFree1ElementMultiple: 1 + AssertKRingShiftTailWrapOnly: 0 + AssertSummationElementMultiple: 256 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BAddrInterleave: false + BaseName: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT12lYJO4P9-MC57Obi2kvcZAOwe_1ysdtWgX7-VIBNgQpw= + BufferLoad: true + BufferStore: 1 + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 256 + DirectToLds: 1 + DirectToLdsA: true + DirectToLdsB: true + DirectToLdsMXSA: true + DirectToLdsMXSB: true + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprMXSA: false + DirectToVgprMXSB: false + DirectToVgprSparseMetadata: false + DtlPlusLdsBuf: 0 
+ EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: false + ExpertSchedulingMode: 0 + ExtraLatencyForLR: 0 + ExtraMiLatencyLeft: -1 + ForceDisableShadowInit: false + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 32 + GlobalReadVectorWidthB: 32 + GlobalReadVectorWidthMXSA: 4 + GlobalReadVectorWidthMXSB: 3 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + KRingShift: false + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT128x96x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT2_6_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1 + LDSTrInst: false + LSCA: 256 + LSCB: 256 + LSCMXSA: 8 + LSCMXSB: 8 + LSPA: 32 + LSPB: 32 + LSPMXSA: 128 + LSPMXSB: 96 + LVCA: 8 + LVCB: 8 + LVCMXSA: 2 + LVCMXSB: 3 + LVPA: 1 + LVPB: 1 + LVPMXSA: 32 + LVPMXSB: 32 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsBlockSizePerPadMXSA: 0 + LdsBlockSizePerPadMXSB: 0 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 63232 + LdsInitCVgprs: false + LdsNumBytes: 63232 + LdsNumElementsAlignedA: 16384 + 
LdsNumElementsAlignedB: 12288 + LdsNumElementsAlignedMXSA: 1024 + LdsNumElementsAlignedMXSB: 768 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 32768 + LdsOffsetB: 18176 + LdsOffsetB_Blk: 50944 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMXSA: 16384 + LdsOffsetMXSA_Blk: 49152 + LdsOffsetMXSB: 17408 + LdsOffsetMXSB_Blk: 50176 + LdsOffsetMetadata: 18176 + LdsOffsetMetadata_Blk: 50944 + LdsPadA: 0 + LdsPadB: 0 + LdsPadMXSA: 0 + LdsPadMXSB: 0 + LdsPadMetadata: 0 + LocalReadVectorWidth: -1 + LocalReadVectorWidthA: 32 + LocalReadVectorWidthB: 32 + LocalReadVectorWidthMXSA: 1 + LocalReadVectorWidthMXSB: 1 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LocalWriteUseSgprMXSA: true + LocalWriteUseSgprMXSB: true + LoopIters: 2 + LoopUnroll: 256 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMXSA: 1 + MIInputPerThreadMXSB: 1 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [4, 1] + MIWaveTile: [2, 6] + MIWaveTileA: 2 + MIWaveTileB: 6 + MIWaveTileMXSA: 2 + MIWaveTileMXSB: 6 + MIWaveTileMetadata: 0 + MacroTile0: 128 + MacroTile1: 96 + MacroTileA: 128 + MacroTileB: 96 + MacroTileMXSA: 128 + MacroTileMXSB: 96 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + MinGRIncPerMfma: 3 + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonDTLTailLoopMXSA: true + NonDTLTailLoopMXSB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 4 + NonTemporalC: 0 + NonTemporalD: 4 + NonTemporalE: 0 + NonTemporalMXSA: 0 + 
NonTemporalMXSB: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 48 + NumGlobalWriteVectorsPerThread: 48 + NumLdsBlk: 2 + NumLoadsA: 4 + NumLoadsB: 3 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsCoalescedMXSA: 1 + NumLoadsCoalescedMXSB: 1 + NumLoadsMXSA: 1 + NumLoadsMXSB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 3 + NumLoadsPerpendicularMXSA: 1 + NumLoadsPerpendicularMXSB: 1 + NumThreads: 256 + NumTotalPackedLoadsA: -1 + NumTotalPackedLoadsB: -1 + NumTotalPackedLoadsMXSA: -1 + NumTotalPackedLoadsMXSB: -1 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGROverBarrier: 0 + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 35 + SolutionNameMin: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT128x96x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT2_6_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM16_WGMXCC2_WGMXCCGn1 + SourceSwap: false + SpaceFillingAlgo: [] + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 4 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SubGroupMXSA: 16 + SubGroupMXSB: 16 
+ SuppressNoLoadLoop: false + SwapGlobalReadOrder: 0 + TailloopInNll: false + ThreadTile: [1, 1] + ThreadTile0: 8 + ThreadTile1: 6 + ThreadTileA: 8 + ThreadTileB: 6 + ThreadTileMXSA: 8 + ThreadTileMXSB: 6 + TransposeLDS: 1 + TransposeLDSMetadata: -1 + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMXSA: true + UnrollMajorLDSMXSB: true + Use64bShadowLimit: false + Use64bShadowLimitMX: false + UseCustomMainLoopSchedule: 0 + UseDirect32XEmulation: false + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: false + UseGeneralizedNLCOneA: false + UseGeneralizedNLCOneB: false + UseGeneralizedNLCOneMXSA: false + UseGeneralizedNLCOneMXSB: false + UseGeneralizedNLCOneMetadata: false + UseInstOffsetForGRO: 0 + UseMFMAF32XEmulation: false + UsePLRPack: 0 + UseSgprForGRO: -1 + UseSubtileImpl: true + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 1 + VectorWidthMXSA: 1 + VectorWidthMXSB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMXSA: 0 + WaveSeparateGlobalReadMXSB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [64, 4, 1] + WorkGroupMapping: 16 + WorkGroupMappingXCC: 2 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 256 + _DepthUA: 256 + _DepthUB: 256 + _DepthUMXSA: 8 + _DepthUMXSB: 8 + _DepthUMetadata: 256 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + enableLDSTrMXSA: false + enableLDSTrMXSB: false + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + tailLoopOptMXSA: false + tailLoopOptMXSB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: false + 
ActivationFused: true + AdaptiveGemm: 0 + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1DivByMT1LowbitGT1: 0 + AssertFree1ElementMultiple: 1 + AssertKRingShiftTailWrapOnly: 0 + AssertSummationElementMultiple: 256 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BAddrInterleave: false + BaseName: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT25ccxCqQ2dvGclrJYn0QqExpGJSq56GiHUJPL0jzB0qVw= + BufferLoad: true + BufferStore: 1 + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 256 + DirectToLds: 1 + DirectToLdsA: true + DirectToLdsB: true + DirectToLdsMXSA: true + DirectToLdsMXSB: true + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprMXSA: false + DirectToVgprMXSB: false + DirectToVgprSparseMetadata: false + DtlPlusLdsBuf: 0 + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: false + ExpertSchedulingMode: 0 + ExtraLatencyForLR: 0 + ExtraMiLatencyLeft: -1 + ForceDisableShadowInit: false + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 32 + GlobalReadVectorWidthB: 32 + GlobalReadVectorWidthMXSA: 4 + GlobalReadVectorWidthMXSB: 1 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + KRingShift: false + Kernel: true + KernelLanguage: Assembly + KernelNameMin: 
Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT256x32x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT4_2_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1 + LDSTrInst: false + LSCA: 256 + LSCB: 256 + LSCMXSA: 8 + LSCMXSB: 8 + LSPA: 32 + LSPB: 32 + LSPMXSA: 128 + LSPMXSB: 32 + LVCA: 8 + LVCB: 8 + LVCMXSA: 2 + LVCMXSB: 8 + LVPA: 1 + LVPB: 1 + LVPMXSA: 32 + LVPMXSB: 32 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsBlockSizePerPadMXSA: 0 + LdsBlockSizePerPadMXSB: 0 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 104704 + LdsInitCVgprs: false + LdsNumBytes: 104704 + LdsNumElementsAlignedA: 32768 + LdsNumElementsAlignedB: 4096 + LdsNumElementsAlignedMXSA: 2048 + LdsNumElementsAlignedMXSB: 256 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 35072 + LdsOffsetB_Blk: 100608 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMXSA: 32768 + LdsOffsetMXSA_Blk: 98304 + LdsOffsetMXSB: 34816 + LdsOffsetMXSB_Blk: 100352 + LdsOffsetMetadata: 35072 + LdsOffsetMetadata_Blk: 100608 + LdsPadA: 0 + LdsPadB: 0 + LdsPadMXSA: 0 + LdsPadMXSB: 0 + LdsPadMetadata: 0 + LocalReadVectorWidth: -1 + LocalReadVectorWidthA: 32 + LocalReadVectorWidthB: 32 + LocalReadVectorWidthMXSA: 1 + LocalReadVectorWidthMXSB: 1 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LocalWriteUseSgprMXSA: true + LocalWriteUseSgprMXSB: true + LoopIters: 2 + LoopUnroll: 256 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + 
MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMXSA: 1 + MIInputPerThreadMXSB: 1 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [4, 1] + MIWaveTile: [4, 2] + MIWaveTileA: 4 + MIWaveTileB: 2 + MIWaveTileMXSA: 4 + MIWaveTileMXSB: 2 + MIWaveTileMetadata: 0 + MacroTile0: 256 + MacroTile1: 32 + MacroTileA: 256 + MacroTileB: 32 + MacroTileMXSA: 256 + MacroTileMXSB: 32 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + MinGRIncPerMfma: 3 + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonDTLTailLoopMXSA: true + NonDTLTailLoopMXSB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 4 + NonTemporalC: 0 + NonTemporalD: 4 + NonTemporalE: 0 + NonTemporalMXSA: 0 + NonTemporalMXSB: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 32 + NumLdsBlk: 2 + NumLoadsA: 8 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsCoalescedMXSA: 1 + NumLoadsCoalescedMXSB: 1 + NumLoadsMXSA: 2 + NumLoadsMXSB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularMXSA: 2 + NumLoadsPerpendicularMXSB: 1 + NumThreads: 256 + NumTotalPackedLoadsA: -1 + NumTotalPackedLoadsB: -1 + NumTotalPackedLoadsMXSA: -1 + NumTotalPackedLoadsMXSB: -1 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGROverBarrier: 0 + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 36 + SolutionNameMin: 
Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT256x32x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT4_2_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM16_WGMXCC2_WGMXCCGn1 + SourceSwap: false + SpaceFillingAlgo: [] + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 4 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SubGroupMXSA: 16 + SubGroupMXSB: 16 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: 0 + TailloopInNll: false + ThreadTile: [1, 1] + ThreadTile0: 16 + ThreadTile1: 2 + ThreadTileA: 16 + ThreadTileB: 2 + ThreadTileMXSA: 16 + ThreadTileMXSB: 2 + TransposeLDS: 1 + TransposeLDSMetadata: -1 + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMXSA: true + UnrollMajorLDSMXSB: true + Use64bShadowLimit: false + Use64bShadowLimitMX: false + UseCustomMainLoopSchedule: 0 + UseDirect32XEmulation: false + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: false + UseGeneralizedNLCOneA: false + UseGeneralizedNLCOneB: false + UseGeneralizedNLCOneMXSA: false + UseGeneralizedNLCOneMXSB: false + UseGeneralizedNLCOneMetadata: false + UseInstOffsetForGRO: 0 + UseMFMAF32XEmulation: false + UsePLRPack: 0 + UseSgprForGRO: -1 + UseSubtileImpl: true + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 1 
+ VectorWidthMXSA: 1 + VectorWidthMXSB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMXSA: 0 + WaveSeparateGlobalReadMXSB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [64, 4, 1] + WorkGroupMapping: 16 + WorkGroupMappingXCC: 2 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 256 + _DepthUA: 256 + _DepthUB: 256 + _DepthUMXSA: 8 + _DepthUMXSB: 8 + _DepthUMetadata: 256 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + enableLDSTrMXSA: false + enableLDSTrMXSB: false + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + tailLoopOptMXSA: false + tailLoopOptMXSB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AdaptiveGemm: 0 + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1DivByMT1LowbitGT1: 0 + AssertFree1ElementMultiple: 1 + AssertKRingShiftTailWrapOnly: 0 + AssertSummationElementMultiple: 256 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BAddrInterleave: false + BaseName: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT25LD47Rd1FCRkyTzzAeZkhMOPqncN77I3VtqjftUOo3W0= + BufferLoad: true + BufferStore: 1 + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 256 + DirectToLds: 1 + DirectToLdsA: true + DirectToLdsB: true + DirectToLdsMXSA: true + DirectToLdsMXSB: true + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprMXSA: false + DirectToVgprMXSB: false + DirectToVgprSparseMetadata: false + DtlPlusLdsBuf: 
0 + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: false + ExpertSchedulingMode: 0 + ExtraLatencyForLR: 0 + ExtraMiLatencyLeft: -1 + ForceDisableShadowInit: false + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 32 + GlobalReadVectorWidthB: 32 + GlobalReadVectorWidthMXSA: 4 + GlobalReadVectorWidthMXSB: 3 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + KRingShift: false + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT256x96x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT4_6_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1 + LDSTrInst: false + LSCA: 256 + LSCB: 256 + LSCMXSA: 8 + LSCMXSB: 8 + LSPA: 32 + LSPB: 32 + LSPMXSA: 128 + LSPMXSB: 96 + LVCA: 8 + LVCB: 8 + LVCMXSA: 2 + LVCMXSB: 3 + LVPA: 1 + LVPB: 1 + LVPMXSA: 32 + LVPMXSB: 32 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsBlockSizePerPadMXSA: 0 + LdsBlockSizePerPadMXSB: 0 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 113408 + LdsInitCVgprs: false + LdsNumBytes: 113408 + LdsNumElementsAlignedA: 32768 + 
LdsNumElementsAlignedB: 12288 + LdsNumElementsAlignedMXSA: 2048 + LdsNumElementsAlignedMXSB: 768 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 35584 + LdsOffsetB_Blk: 101120 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMXSA: 32768 + LdsOffsetMXSA_Blk: 98304 + LdsOffsetMXSB: 34816 + LdsOffsetMXSB_Blk: 100352 + LdsOffsetMetadata: 35584 + LdsOffsetMetadata_Blk: 101120 + LdsPadA: 0 + LdsPadB: 0 + LdsPadMXSA: 0 + LdsPadMXSB: 0 + LdsPadMetadata: 0 + LocalReadVectorWidth: -1 + LocalReadVectorWidthA: 32 + LocalReadVectorWidthB: 32 + LocalReadVectorWidthMXSA: 1 + LocalReadVectorWidthMXSB: 1 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LocalWriteUseSgprMXSA: true + LocalWriteUseSgprMXSB: true + LoopIters: 2 + LoopUnroll: 256 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMXSA: 1 + MIInputPerThreadMXSB: 1 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [4, 1] + MIWaveTile: [4, 6] + MIWaveTileA: 4 + MIWaveTileB: 6 + MIWaveTileMXSA: 4 + MIWaveTileMXSB: 6 + MIWaveTileMetadata: 0 + MacroTile0: 256 + MacroTile1: 96 + MacroTileA: 256 + MacroTileB: 96 + MacroTileMXSA: 256 + MacroTileMXSB: 96 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + MinGRIncPerMfma: 3 + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonDTLTailLoopMXSA: true + NonDTLTailLoopMXSB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 4 + NonTemporalC: 0 + NonTemporalD: 4 + NonTemporalE: 0 + NonTemporalMXSA: 0 + 
NonTemporalMXSB: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 96 + NumGlobalWriteVectorsPerThread: 96 + NumLdsBlk: 2 + NumLoadsA: 8 + NumLoadsB: 3 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsCoalescedMXSA: 1 + NumLoadsCoalescedMXSB: 1 + NumLoadsMXSA: 2 + NumLoadsMXSB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 3 + NumLoadsPerpendicularMXSA: 2 + NumLoadsPerpendicularMXSB: 1 + NumThreads: 256 + NumTotalPackedLoadsA: -1 + NumTotalPackedLoadsB: -1 + NumTotalPackedLoadsMXSA: -1 + NumTotalPackedLoadsMXSB: -1 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGROverBarrier: 0 + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 37 + SolutionNameMin: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT256x96x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT4_6_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM16_WGMXCC2_WGMXCCGn1 + SourceSwap: false + SpaceFillingAlgo: [] + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 4 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SubGroupMXSA: 16 + SubGroupMXSB: 16 
+ SuppressNoLoadLoop: false + SwapGlobalReadOrder: 0 + TailloopInNll: false + ThreadTile: [1, 1] + ThreadTile0: 16 + ThreadTile1: 6 + ThreadTileA: 16 + ThreadTileB: 6 + ThreadTileMXSA: 16 + ThreadTileMXSB: 6 + TransposeLDS: 1 + TransposeLDSMetadata: -1 + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMXSA: true + UnrollMajorLDSMXSB: true + Use64bShadowLimit: false + Use64bShadowLimitMX: false + UseCustomMainLoopSchedule: 0 + UseDirect32XEmulation: false + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: false + UseGeneralizedNLCOneA: false + UseGeneralizedNLCOneB: false + UseGeneralizedNLCOneMXSA: false + UseGeneralizedNLCOneMXSB: false + UseGeneralizedNLCOneMetadata: false + UseInstOffsetForGRO: 0 + UseMFMAF32XEmulation: false + UsePLRPack: 0 + UseSgprForGRO: -1 + UseSubtileImpl: true + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 1 + VectorWidthMXSA: 1 + VectorWidthMXSB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMXSA: 0 + WaveSeparateGlobalReadMXSB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [64, 4, 1] + WorkGroupMapping: 16 + WorkGroupMappingXCC: 2 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 256 + _DepthUA: 256 + _DepthUB: 256 + _DepthUMXSA: 8 + _DepthUMXSB: 8 + _DepthUMetadata: 256 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + enableLDSTrMXSA: false + enableLDSTrMXSB: false + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + tailLoopOptMXSA: false + tailLoopOptMXSB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: 
false + ActivationFused: true + AdaptiveGemm: 0 + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1DivByMT1LowbitGT1: 0 + AssertFree1ElementMultiple: 1 + AssertKRingShiftTailWrapOnly: 0 + AssertSummationElementMultiple: 256 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BAddrInterleave: false + BaseName: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT252kruXsNK48kysY3byfZ8nkfbVQgYyySEfltsnGOOr7I= + BufferLoad: true + BufferStore: 1 + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 256 + DirectToLds: 1 + DirectToLdsA: true + DirectToLdsB: true + DirectToLdsMXSA: true + DirectToLdsMXSB: true + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprMXSA: false + DirectToVgprMXSB: false + DirectToVgprSparseMetadata: false + DtlPlusLdsBuf: 0 + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: false + ExpertSchedulingMode: 0 + ExtraLatencyForLR: 0 + ExtraMiLatencyLeft: -1 + ForceDisableShadowInit: false + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 32 + GlobalReadVectorWidthB: 32 + GlobalReadVectorWidthMXSA: 4 + GlobalReadVectorWidthMXSB: 5 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + KRingShift: false + Kernel: true + KernelLanguage: Assembly + KernelNameMin: 
Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT256x160x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT4_10_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1 + LDSTrInst: false + LSCA: 256 + LSCB: 256 + LSCMXSA: 8 + LSCMXSB: 8 + LSPA: 32 + LSPB: 32 + LSPMXSA: 128 + LSPMXSB: 160 + LVCA: 8 + LVCB: 8 + LVCMXSA: 2 + LVCMXSB: 2 + LVPA: 1 + LVPB: 1 + LVPMXSA: 32 + LVPMXSB: 32 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsBlockSizePerPadMXSA: 0 + LdsBlockSizePerPadMXSB: 0 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 122112 + LdsInitCVgprs: false + LdsNumBytes: 122112 + LdsNumElementsAlignedA: 32768 + LdsNumElementsAlignedB: 20480 + LdsNumElementsAlignedMXSA: 2048 + LdsNumElementsAlignedMXSB: 1280 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 36096 + LdsOffsetB_Blk: 101632 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMXSA: 32768 + LdsOffsetMXSA_Blk: 98304 + LdsOffsetMXSB: 34816 + LdsOffsetMXSB_Blk: 100352 + LdsOffsetMetadata: 36096 + LdsOffsetMetadata_Blk: 101632 + LdsPadA: 0 + LdsPadB: 0 + LdsPadMXSA: 0 + LdsPadMXSB: 0 + LdsPadMetadata: 0 + LocalReadVectorWidth: -1 + LocalReadVectorWidthA: 32 + LocalReadVectorWidthB: 32 + LocalReadVectorWidthMXSA: 1 + LocalReadVectorWidthMXSB: 1 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LocalWriteUseSgprMXSA: true + LocalWriteUseSgprMXSB: true + LoopIters: 2 + LoopUnroll: 256 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + 
MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMXSA: 1 + MIInputPerThreadMXSB: 1 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [4, 1] + MIWaveTile: [4, 10] + MIWaveTileA: 4 + MIWaveTileB: 10 + MIWaveTileMXSA: 4 + MIWaveTileMXSB: 10 + MIWaveTileMetadata: 0 + MacroTile0: 256 + MacroTile1: 160 + MacroTileA: 256 + MacroTileB: 160 + MacroTileMXSA: 256 + MacroTileMXSB: 160 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + MinGRIncPerMfma: 3 + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonDTLTailLoopMXSA: true + NonDTLTailLoopMXSB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 4 + NonTemporalC: 0 + NonTemporalD: 4 + NonTemporalE: 0 + NonTemporalMXSA: 0 + NonTemporalMXSB: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 160 + NumGlobalWriteVectorsPerThread: 160 + NumLdsBlk: 2 + NumLoadsA: 8 + NumLoadsB: 5 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsCoalescedMXSA: 1 + NumLoadsCoalescedMXSB: 1 + NumLoadsMXSA: 2 + NumLoadsMXSB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 5 + NumLoadsPerpendicularMXSA: 2 + NumLoadsPerpendicularMXSB: 1 + NumThreads: 256 + NumTotalPackedLoadsA: -1 + NumTotalPackedLoadsB: -1 + NumTotalPackedLoadsMXSA: -1 + NumTotalPackedLoadsMXSB: -1 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGROverBarrier: 0 + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 38 + 
SolutionNameMin: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT256x160x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT4_10_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM16_WGMXCC2_WGMXCCGn1 + SourceSwap: false + SpaceFillingAlgo: [] + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 4 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SubGroupMXSA: 16 + SubGroupMXSB: 16 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: 0 + TailloopInNll: false + ThreadTile: [1, 1] + ThreadTile0: 16 + ThreadTile1: 10 + ThreadTileA: 16 + ThreadTileB: 10 + ThreadTileMXSA: 16 + ThreadTileMXSB: 10 + TransposeLDS: 1 + TransposeLDSMetadata: -1 + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMXSA: true + UnrollMajorLDSMXSB: true + Use64bShadowLimit: false + Use64bShadowLimitMX: false + UseCustomMainLoopSchedule: 0 + UseDirect32XEmulation: false + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: false + UseGeneralizedNLCOneA: false + UseGeneralizedNLCOneB: false + UseGeneralizedNLCOneMXSA: false + UseGeneralizedNLCOneMXSB: false + UseGeneralizedNLCOneMetadata: false + UseInstOffsetForGRO: 0 + UseMFMAF32XEmulation: false + UsePLRPack: 0 + UseSgprForGRO: -1 + UseSubtileImpl: true + Valid: true + VectorStore: -1 + 
VectorWidthA: 1 + VectorWidthB: 1 + VectorWidthMXSA: 1 + VectorWidthMXSB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMXSA: 0 + WaveSeparateGlobalReadMXSB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [64, 4, 1] + WorkGroupMapping: 16 + WorkGroupMappingXCC: 2 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 256 + _DepthUA: 256 + _DepthUB: 256 + _DepthUMXSA: 8 + _DepthUMXSB: 8 + _DepthUMetadata: 256 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + enableLDSTrMXSA: false + enableLDSTrMXSB: false + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + tailLoopOptMXSA: false + tailLoopOptMXSB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AdaptiveGemm: 0 + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1DivByMT1LowbitGT1: 0 + AssertFree1ElementMultiple: 1 + AssertKRingShiftTailWrapOnly: 0 + AssertSummationElementMultiple: 256 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BAddrInterleave: false + BaseName: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT25K1mXD-Z64cKGl1n64cEhRIvEdarIIYY6qzM6iXFNutU= + BufferLoad: true + BufferStore: 1 + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 256 + DirectToLds: 1 + DirectToLdsA: true + DirectToLdsB: true + DirectToLdsMXSA: true + DirectToLdsMXSB: true + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprMXSA: false + DirectToVgprMXSB: false + 
DirectToVgprSparseMetadata: false + DtlPlusLdsBuf: 0 + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: false + ExpertSchedulingMode: 0 + ExtraLatencyForLR: 0 + ExtraMiLatencyLeft: -1 + ForceDisableShadowInit: false + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 32 + GlobalReadVectorWidthB: 32 + GlobalReadVectorWidthMXSA: 4 + GlobalReadVectorWidthMXSB: 9 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + KRingShift: false + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT256x288x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT4_18_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1 + LDSTrInst: false + LSCA: 256 + LSCB: 256 + LSCMXSA: 8 + LSCMXSB: 8 + LSPA: 32 + LSPB: 32 + LSPMXSA: 128 + LSPMXSB: 288 + LVCA: 8 + LVCB: 8 + LVCMXSA: 2 + LVCMXSB: 1 + LVPA: 1 + LVPB: 1 + LVPMXSA: 32 + LVPMXSB: 32 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsBlockSizePerPadMXSA: 0 + LdsBlockSizePerPadMXSB: 0 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 147968 + LdsInitCVgprs: false + 
LdsNumBytes: 147968 + LdsNumElementsAlignedA: 32768 + LdsNumElementsAlignedB: 36864 + LdsNumElementsAlignedMXSA: 2048 + LdsNumElementsAlignedMXSB: 2304 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 73984 + LdsOffsetB: 37120 + LdsOffsetB_Blk: 111104 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMXSA: 32768 + LdsOffsetMXSA_Blk: 106752 + LdsOffsetMXSB: 34816 + LdsOffsetMXSB_Blk: 108800 + LdsOffsetMetadata: 37120 + LdsOffsetMetadata_Blk: 111104 + LdsPadA: 0 + LdsPadB: 0 + LdsPadMXSA: 0 + LdsPadMXSB: 0 + LdsPadMetadata: 0 + LocalReadVectorWidth: -1 + LocalReadVectorWidthA: 32 + LocalReadVectorWidthB: 32 + LocalReadVectorWidthMXSA: 1 + LocalReadVectorWidthMXSB: 1 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LocalWriteUseSgprMXSA: true + LocalWriteUseSgprMXSB: true + LoopIters: 2 + LoopUnroll: 256 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMXSA: 1 + MIInputPerThreadMXSB: 1 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [4, 1] + MIWaveTile: [4, 18] + MIWaveTileA: 4 + MIWaveTileB: 18 + MIWaveTileMXSA: 4 + MIWaveTileMXSB: 18 + MIWaveTileMetadata: 0 + MacroTile0: 256 + MacroTile1: 288 + MacroTileA: 256 + MacroTileB: 288 + MacroTileMXSA: 256 + MacroTileMXSB: 288 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + MinGRIncPerMfma: 3 + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonDTLTailLoopMXSA: true + NonDTLTailLoopMXSB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 4 + NonTemporalC: 0 + 
NonTemporalD: 4 + NonTemporalE: 0 + NonTemporalMXSA: 0 + NonTemporalMXSB: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 288 + NumGlobalWriteVectorsPerThread: 288 + NumLdsBlk: 2 + NumLoadsA: 8 + NumLoadsB: 9 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsCoalescedMXSA: 1 + NumLoadsCoalescedMXSB: 1 + NumLoadsMXSA: 2 + NumLoadsMXSB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 9 + NumLoadsPerpendicularMXSA: 2 + NumLoadsPerpendicularMXSB: 1 + NumThreads: 256 + NumTotalPackedLoadsA: -1 + NumTotalPackedLoadsB: -1 + NumTotalPackedLoadsMXSA: -1 + NumTotalPackedLoadsMXSB: -1 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGROverBarrier: 0 + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 39 + SolutionNameMin: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT256x288x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT4_18_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM16_WGMXCC2_WGMXCCGn1 + SourceSwap: false + SpaceFillingAlgo: [] + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: true + StoreSyncOpt: 0 + StoreVectorWidth: 4 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 16 + SubGroup1: 16 + 
SubGroupA: 16 + SubGroupB: 16 + SubGroupMXSA: 16 + SubGroupMXSB: 16 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: 0 + TailloopInNll: false + ThreadTile: [1, 1] + ThreadTile0: 16 + ThreadTile1: 18 + ThreadTileA: 16 + ThreadTileB: 18 + ThreadTileMXSA: 16 + ThreadTileMXSB: 18 + TransposeLDS: 1 + TransposeLDSMetadata: -1 + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMXSA: true + UnrollMajorLDSMXSB: true + Use64bShadowLimit: false + Use64bShadowLimitMX: false + UseCustomMainLoopSchedule: 0 + UseDirect32XEmulation: false + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: false + UseGeneralizedNLCOneA: false + UseGeneralizedNLCOneB: false + UseGeneralizedNLCOneMXSA: false + UseGeneralizedNLCOneMXSB: false + UseGeneralizedNLCOneMetadata: false + UseInstOffsetForGRO: 0 + UseMFMAF32XEmulation: false + UsePLRPack: 0 + UseSgprForGRO: -1 + UseSubtileImpl: true + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 1 + VectorWidthMXSA: 1 + VectorWidthMXSB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMXSA: 0 + WaveSeparateGlobalReadMXSB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [64, 4, 1] + WorkGroupMapping: 16 + WorkGroupMappingXCC: 2 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 256 + _DepthUA: 256 + _DepthUB: 256 + _DepthUMXSA: 8 + _DepthUMXSB: 8 + _DepthUMetadata: 256 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + enableLDSTrMXSA: false + enableLDSTrMXSB: false + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + tailLoopOptMXSA: false + tailLoopOptMXSB: 
false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AdaptiveGemm: 0 + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1DivByMT1LowbitGT1: 0 + AssertFree1ElementMultiple: 1 + AssertKRingShiftTailWrapOnly: 0 + AssertSummationElementMultiple: 256 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BAddrInterleave: false + BaseName: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT382wXw8HnHr0540PBKyCMkiG1GnHC8Qq6QTA2UY90u0Us= + BufferLoad: true + BufferStore: 1 + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 256 + DirectToLds: 1 + DirectToLdsA: true + DirectToLdsB: true + DirectToLdsMXSA: true + DirectToLdsMXSB: true + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprMXSA: false + DirectToVgprMXSB: false + DirectToVgprSparseMetadata: false + DtlPlusLdsBuf: 0 + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: false + ExpertSchedulingMode: 0 + ExtraLatencyForLR: 0 + ExtraMiLatencyLeft: -1 + ForceDisableShadowInit: false + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 32 + GlobalReadVectorWidthB: 32 + GlobalReadVectorWidthMXSA: 12 + GlobalReadVectorWidthMXSB: 1 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + KRingShift: false + Kernel: true + KernelLanguage: Assembly + KernelNameMin: 
Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT384x32x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT6_2_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1 + LDSTrInst: false + LSCA: 256 + LSCB: 256 + LSCMXSA: 8 + LSCMXSB: 8 + LSPA: 32 + LSPB: 32 + LSPMXSA: 384 + LSPMXSB: 32 + LVCA: 8 + LVCB: 8 + LVCMXSA: 1 + LVCMXSB: 8 + LVPA: 1 + LVPB: 1 + LVPMXSA: 32 + LVPMXSB: 32 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsBlockSizePerPadMXSA: 0 + LdsBlockSizePerPadMXSB: 0 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 122112 + LdsInitCVgprs: false + LdsNumBytes: 122112 + LdsNumElementsAlignedA: 49152 + LdsNumElementsAlignedB: 4096 + LdsNumElementsAlignedMXSA: 3072 + LdsNumElementsAlignedMXSB: 256 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 52480 + LdsOffsetB_Blk: 118016 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMXSA: 49152 + LdsOffsetMXSA_Blk: 114688 + LdsOffsetMXSB: 52224 + LdsOffsetMXSB_Blk: 117760 + LdsOffsetMetadata: 52480 + LdsOffsetMetadata_Blk: 118016 + LdsPadA: 0 + LdsPadB: 0 + LdsPadMXSA: 0 + LdsPadMXSB: 0 + LdsPadMetadata: 0 + LocalReadVectorWidth: -1 + LocalReadVectorWidthA: 32 + LocalReadVectorWidthB: 32 + LocalReadVectorWidthMXSA: 1 + LocalReadVectorWidthMXSB: 1 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LocalWriteUseSgprMXSA: true + LocalWriteUseSgprMXSB: true + LoopIters: 2 + LoopUnroll: 256 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + 
MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMXSA: 1 + MIInputPerThreadMXSB: 1 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [4, 1] + MIWaveTile: [6, 2] + MIWaveTileA: 6 + MIWaveTileB: 2 + MIWaveTileMXSA: 6 + MIWaveTileMXSB: 2 + MIWaveTileMetadata: 0 + MacroTile0: 384 + MacroTile1: 32 + MacroTileA: 384 + MacroTileB: 32 + MacroTileMXSA: 384 + MacroTileMXSB: 32 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + MinGRIncPerMfma: 3 + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonDTLTailLoopMXSA: true + NonDTLTailLoopMXSB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 4 + NonTemporalC: 0 + NonTemporalD: 4 + NonTemporalE: 0 + NonTemporalMXSA: 0 + NonTemporalMXSB: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 48 + NumGlobalWriteVectorsPerThread: 48 + NumLdsBlk: 2 + NumLoadsA: 12 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsCoalescedMXSA: 1 + NumLoadsCoalescedMXSB: 1 + NumLoadsMXSA: 1 + NumLoadsMXSB: 1 + NumLoadsPerpendicularA: 12 + NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularMXSA: 1 + NumLoadsPerpendicularMXSB: 1 + NumThreads: 256 + NumTotalPackedLoadsA: -1 + NumTotalPackedLoadsB: -1 + NumTotalPackedLoadsMXSA: -1 + NumTotalPackedLoadsMXSB: -1 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGROverBarrier: 0 + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 40 + SolutionNameMin: 
Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT384x32x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT6_2_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM16_WGMXCC2_WGMXCCGn1 + SourceSwap: false + SpaceFillingAlgo: [] + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 4 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SubGroupMXSA: 16 + SubGroupMXSB: 16 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: 0 + TailloopInNll: false + ThreadTile: [1, 1] + ThreadTile0: 24 + ThreadTile1: 2 + ThreadTileA: 24 + ThreadTileB: 2 + ThreadTileMXSA: 24 + ThreadTileMXSB: 2 + TransposeLDS: 1 + TransposeLDSMetadata: -1 + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMXSA: true + UnrollMajorLDSMXSB: true + Use64bShadowLimit: false + Use64bShadowLimitMX: false + UseCustomMainLoopSchedule: 0 + UseDirect32XEmulation: false + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: false + UseGeneralizedNLCOneA: false + UseGeneralizedNLCOneB: false + UseGeneralizedNLCOneMXSA: false + UseGeneralizedNLCOneMXSB: false + UseGeneralizedNLCOneMetadata: false + UseInstOffsetForGRO: 0 + UseMFMAF32XEmulation: false + UsePLRPack: 0 + UseSgprForGRO: -1 + UseSubtileImpl: true + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 1 
+ VectorWidthMXSA: 1 + VectorWidthMXSB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMXSA: 0 + WaveSeparateGlobalReadMXSB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [64, 4, 1] + WorkGroupMapping: 16 + WorkGroupMappingXCC: 2 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 256 + _DepthUA: 256 + _DepthUB: 256 + _DepthUMXSA: 8 + _DepthUMXSB: 8 + _DepthUMetadata: 256 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + enableLDSTrMXSA: false + enableLDSTrMXSB: false + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + tailLoopOptMXSA: false + tailLoopOptMXSB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AdaptiveGemm: 0 + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1DivByMT1LowbitGT1: 0 + AssertFree1ElementMultiple: 1 + AssertKRingShiftTailWrapOnly: 0 + AssertSummationElementMultiple: 256 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BAddrInterleave: false + BaseName: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT38iWtaR8v4gfASTnshP-7Zt37fJ-3bIKngSBlzEMd0qgE= + BufferLoad: true + BufferStore: 1 + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 256 + DirectToLds: 1 + DirectToLdsA: true + DirectToLdsB: true + DirectToLdsMXSA: true + DirectToLdsMXSB: true + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprMXSA: false + DirectToVgprMXSB: false + DirectToVgprSparseMetadata: false + DtlPlusLdsBuf: 
0 + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: false + ExpertSchedulingMode: 0 + ExtraLatencyForLR: 0 + ExtraMiLatencyLeft: -1 + ForceDisableShadowInit: false + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 32 + GlobalReadVectorWidthB: 32 + GlobalReadVectorWidthMXSA: 12 + GlobalReadVectorWidthMXSB: 3 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + KRingShift: false + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT384x96x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT6_6_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1 + LDSTrInst: false + LSCA: 256 + LSCB: 256 + LSCMXSA: 8 + LSCMXSB: 8 + LSPA: 32 + LSPB: 32 + LSPMXSA: 384 + LSPMXSB: 96 + LVCA: 8 + LVCB: 8 + LVCMXSA: 1 + LVCMXSB: 3 + LVPA: 1 + LVPB: 1 + LVPMXSA: 32 + LVPMXSB: 32 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsBlockSizePerPadMXSA: 0 + LdsBlockSizePerPadMXSB: 0 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 130816 + LdsInitCVgprs: false + LdsNumBytes: 130816 + LdsNumElementsAlignedA: 49152 
+ LdsNumElementsAlignedB: 12288 + LdsNumElementsAlignedMXSA: 3072 + LdsNumElementsAlignedMXSB: 768 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 52992 + LdsOffsetB_Blk: 118528 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMXSA: 49152 + LdsOffsetMXSA_Blk: 114688 + LdsOffsetMXSB: 52224 + LdsOffsetMXSB_Blk: 117760 + LdsOffsetMetadata: 52992 + LdsOffsetMetadata_Blk: 118528 + LdsPadA: 0 + LdsPadB: 0 + LdsPadMXSA: 0 + LdsPadMXSB: 0 + LdsPadMetadata: 0 + LocalReadVectorWidth: -1 + LocalReadVectorWidthA: 32 + LocalReadVectorWidthB: 32 + LocalReadVectorWidthMXSA: 1 + LocalReadVectorWidthMXSB: 1 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LocalWriteUseSgprMXSA: true + LocalWriteUseSgprMXSB: true + LoopIters: 2 + LoopUnroll: 256 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMXSA: 1 + MIInputPerThreadMXSB: 1 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [4, 1] + MIWaveTile: [6, 6] + MIWaveTileA: 6 + MIWaveTileB: 6 + MIWaveTileMXSA: 6 + MIWaveTileMXSB: 6 + MIWaveTileMetadata: 0 + MacroTile0: 384 + MacroTile1: 96 + MacroTileA: 384 + MacroTileB: 96 + MacroTileMXSA: 384 + MacroTileMXSB: 96 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + MinGRIncPerMfma: 3 + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonDTLTailLoopMXSA: true + NonDTLTailLoopMXSB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 4 + NonTemporalC: 0 + NonTemporalD: 4 + NonTemporalE: 0 + NonTemporalMXSA: 0 + 
NonTemporalMXSB: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 144 + NumGlobalWriteVectorsPerThread: 144 + NumLdsBlk: 2 + NumLoadsA: 12 + NumLoadsB: 3 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsCoalescedMXSA: 1 + NumLoadsCoalescedMXSB: 1 + NumLoadsMXSA: 1 + NumLoadsMXSB: 1 + NumLoadsPerpendicularA: 12 + NumLoadsPerpendicularB: 3 + NumLoadsPerpendicularMXSA: 1 + NumLoadsPerpendicularMXSB: 1 + NumThreads: 256 + NumTotalPackedLoadsA: -1 + NumTotalPackedLoadsB: -1 + NumTotalPackedLoadsMXSA: -1 + NumTotalPackedLoadsMXSB: -1 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGROverBarrier: 0 + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 41 + SolutionNameMin: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT384x96x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT6_6_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM16_WGMXCC2_WGMXCCGn1 + SourceSwap: false + SpaceFillingAlgo: [] + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 4 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SubGroupMXSA: 16 + 
SubGroupMXSB: 16 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: 0 + TailloopInNll: false + ThreadTile: [1, 1] + ThreadTile0: 24 + ThreadTile1: 6 + ThreadTileA: 24 + ThreadTileB: 6 + ThreadTileMXSA: 24 + ThreadTileMXSB: 6 + TransposeLDS: 1 + TransposeLDSMetadata: -1 + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMXSA: true + UnrollMajorLDSMXSB: true + Use64bShadowLimit: false + Use64bShadowLimitMX: false + UseCustomMainLoopSchedule: 0 + UseDirect32XEmulation: false + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: false + UseGeneralizedNLCOneA: false + UseGeneralizedNLCOneB: false + UseGeneralizedNLCOneMXSA: false + UseGeneralizedNLCOneMXSB: false + UseGeneralizedNLCOneMetadata: false + UseInstOffsetForGRO: 0 + UseMFMAF32XEmulation: false + UsePLRPack: 0 + UseSgprForGRO: -1 + UseSubtileImpl: true + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 1 + VectorWidthMXSA: 1 + VectorWidthMXSB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMXSA: 0 + WaveSeparateGlobalReadMXSB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [64, 4, 1] + WorkGroupMapping: 16 + WorkGroupMappingXCC: 2 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 256 + _DepthUA: 256 + _DepthUB: 256 + _DepthUMXSA: 8 + _DepthUMXSB: 8 + _DepthUMetadata: 256 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + enableLDSTrMXSA: false + enableLDSTrMXSB: false + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + tailLoopOptMXSA: false + tailLoopOptMXSB: false + - 1LDSBuffer: 0 + ActivationAlt: false + 
ActivationFuncCall: false + ActivationFused: true + AdaptiveGemm: 0 + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1DivByMT1LowbitGT1: 0 + AssertFree1ElementMultiple: 1 + AssertKRingShiftTailWrapOnly: 0 + AssertSummationElementMultiple: 256 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BAddrInterleave: false + BaseName: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT51My4B9dUWEKuVinUMAqyTn-9nVKbOgw4LZNuajpvbgno= + BufferLoad: true + BufferStore: 1 + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 256 + DirectToLds: 1 + DirectToLdsA: true + DirectToLdsB: true + DirectToLdsMXSA: true + DirectToLdsMXSB: true + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprMXSA: false + DirectToVgprMXSB: false + DirectToVgprSparseMetadata: false + DtlPlusLdsBuf: 0 + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: false + ExpertSchedulingMode: 0 + ExtraLatencyForLR: 0 + ExtraMiLatencyLeft: -1 + ForceDisableShadowInit: false + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 32 + GlobalReadVectorWidthB: 32 + GlobalReadVectorWidthMXSA: 16 + GlobalReadVectorWidthMXSB: 1 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + KRingShift: false + Kernel: true + KernelLanguage: Assembly + KernelNameMin: 
Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT512x32x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT8_2_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1 + LDSTrInst: false + LSCA: 256 + LSCB: 256 + LSCMXSA: 8 + LSCMXSB: 8 + LSPA: 32 + LSPB: 32 + LSPMXSA: 512 + LSPMXSB: 32 + LVCA: 8 + LVCB: 8 + LVCMXSA: 1 + LVCMXSB: 8 + LVPA: 1 + LVPB: 1 + LVPMXSA: 32 + LVPMXSB: 32 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsBlockSizePerPadMXSA: 0 + LdsBlockSizePerPadMXSB: 0 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 147968 + LdsInitCVgprs: false + LdsNumBytes: 147968 + LdsNumElementsAlignedA: 65536 + LdsNumElementsAlignedB: 4096 + LdsNumElementsAlignedMXSA: 4096 + LdsNumElementsAlignedMXSB: 256 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 73984 + LdsOffsetB: 69888 + LdsOffsetB_Blk: 143872 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMXSA: 65536 + LdsOffsetMXSA_Blk: 139520 + LdsOffsetMXSB: 69632 + LdsOffsetMXSB_Blk: 143616 + LdsOffsetMetadata: 69888 + LdsOffsetMetadata_Blk: 143872 + LdsPadA: 0 + LdsPadB: 0 + LdsPadMXSA: 0 + LdsPadMXSB: 0 + LdsPadMetadata: 0 + LocalReadVectorWidth: -1 + LocalReadVectorWidthA: 32 + LocalReadVectorWidthB: 32 + LocalReadVectorWidthMXSA: 1 + LocalReadVectorWidthMXSB: 1 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LocalWriteUseSgprMXSA: true + LocalWriteUseSgprMXSB: true + LoopIters: 2 + LoopUnroll: 256 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + 
MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMXSA: 1 + MIInputPerThreadMXSB: 1 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [4, 1] + MIWaveTile: [8, 2] + MIWaveTileA: 8 + MIWaveTileB: 2 + MIWaveTileMXSA: 8 + MIWaveTileMXSB: 2 + MIWaveTileMetadata: 0 + MacroTile0: 512 + MacroTile1: 32 + MacroTileA: 512 + MacroTileB: 32 + MacroTileMXSA: 512 + MacroTileMXSB: 32 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + MinGRIncPerMfma: 3 + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonDTLTailLoopMXSA: true + NonDTLTailLoopMXSB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 4 + NonTemporalC: 0 + NonTemporalD: 4 + NonTemporalE: 0 + NonTemporalMXSA: 0 + NonTemporalMXSB: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 64 + NumLdsBlk: 2 + NumLoadsA: 16 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsCoalescedMXSA: 1 + NumLoadsCoalescedMXSB: 1 + NumLoadsMXSA: 1 + NumLoadsMXSB: 1 + NumLoadsPerpendicularA: 16 + NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularMXSA: 1 + NumLoadsPerpendicularMXSB: 1 + NumThreads: 256 + NumTotalPackedLoadsA: -1 + NumTotalPackedLoadsB: -1 + NumTotalPackedLoadsMXSA: -1 + NumTotalPackedLoadsMXSB: -1 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGROverBarrier: 0 + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 42 + SolutionNameMin: 
Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT512x32x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT8_2_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM16_WGMXCC2_WGMXCCGn1 + SourceSwap: false + SpaceFillingAlgo: [] + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: true + StoreSyncOpt: 0 + StoreVectorWidth: 4 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SubGroupMXSA: 16 + SubGroupMXSB: 16 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: 0 + TailloopInNll: false + ThreadTile: [1, 1] + ThreadTile0: 32 + ThreadTile1: 2 + ThreadTileA: 32 + ThreadTileB: 2 + ThreadTileMXSA: 32 + ThreadTileMXSB: 2 + TransposeLDS: 1 + TransposeLDSMetadata: -1 + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMXSA: true + UnrollMajorLDSMXSB: true + Use64bShadowLimit: false + Use64bShadowLimitMX: false + UseCustomMainLoopSchedule: 0 + UseDirect32XEmulation: false + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: false + UseGeneralizedNLCOneA: false + UseGeneralizedNLCOneB: false + UseGeneralizedNLCOneMXSA: false + UseGeneralizedNLCOneMXSB: false + UseGeneralizedNLCOneMetadata: false + UseInstOffsetForGRO: 0 + UseMFMAF32XEmulation: false + UsePLRPack: 0 + UseSgprForGRO: -1 + UseSubtileImpl: true + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 1 
+ VectorWidthMXSA: 1 + VectorWidthMXSB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMXSA: 0 + WaveSeparateGlobalReadMXSB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [64, 4, 1] + WorkGroupMapping: 16 + WorkGroupMappingXCC: 2 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 256 + _DepthUA: 256 + _DepthUB: 256 + _DepthUMXSA: 8 + _DepthUMXSB: 8 + _DepthUMetadata: 256 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + enableLDSTrMXSA: false + enableLDSTrMXSB: false + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + tailLoopOptMXSA: false + tailLoopOptMXSB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AdaptiveGemm: 0 + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1DivByMT1LowbitGT1: 0 + AssertFree1ElementMultiple: 1 + AssertKRingShiftTailWrapOnly: 0 + AssertSummationElementMultiple: 256 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BAddrInterleave: false + BaseName: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT32dv3nH_ug3zWfzXFJEJHJUvkpjXQjAQVml90QMZvwYvs= + BufferLoad: true + BufferStore: 1 + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 256 + DirectToLds: 1 + DirectToLdsA: true + DirectToLdsB: true + DirectToLdsMXSA: true + DirectToLdsMXSB: true + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprMXSA: false + DirectToVgprMXSB: false + DirectToVgprSparseMetadata: false + DtlPlusLdsBuf: 
0 + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: false + ExpertSchedulingMode: 0 + ExtraLatencyForLR: 0 + ExtraMiLatencyLeft: -1 + ForceDisableShadowInit: false + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 32 + GlobalReadVectorWidthB: 32 + GlobalReadVectorWidthMXSA: 1 + GlobalReadVectorWidthMXSB: 4 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + KRingShift: false + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT32x128x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT2_2_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_16_1 + LDSTrInst: false + LSCA: 256 + LSCB: 256 + LSCMXSA: 8 + LSCMXSB: 8 + LSPA: 32 + LSPB: 32 + LSPMXSA: 32 + LSPMXSB: 128 + LVCA: 8 + LVCB: 8 + LVCMXSA: 8 + LVCMXSB: 2 + LVPA: 1 + LVPB: 1 + LVPMXSA: 32 + LVPMXSB: 32 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsBlockSizePerPadMXSA: 0 + LdsBlockSizePerPadMXSB: 0 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 54528 + LdsInitCVgprs: false + LdsNumBytes: 54528 + LdsNumElementsAlignedA: 4096 + 
LdsNumElementsAlignedB: 16384 + LdsNumElementsAlignedMXSA: 256 + LdsNumElementsAlignedMXSB: 1024 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 32768 + LdsOffsetB: 5376 + LdsOffsetB_Blk: 38144 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMXSA: 4096 + LdsOffsetMXSA_Blk: 36864 + LdsOffsetMXSB: 4352 + LdsOffsetMXSB_Blk: 37120 + LdsOffsetMetadata: 5376 + LdsOffsetMetadata_Blk: 38144 + LdsPadA: 0 + LdsPadB: 0 + LdsPadMXSA: 0 + LdsPadMXSB: 0 + LdsPadMetadata: 0 + LocalReadVectorWidth: -1 + LocalReadVectorWidthA: 32 + LocalReadVectorWidthB: 32 + LocalReadVectorWidthMXSA: 1 + LocalReadVectorWidthMXSB: 1 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LocalWriteUseSgprMXSA: true + LocalWriteUseSgprMXSB: true + LoopIters: 2 + LoopUnroll: 256 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMXSA: 1 + MIInputPerThreadMXSB: 1 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [1, 4] + MIWaveTile: [2, 2] + MIWaveTileA: 2 + MIWaveTileB: 2 + MIWaveTileMXSA: 2 + MIWaveTileMXSB: 2 + MIWaveTileMetadata: 0 + MacroTile0: 32 + MacroTile1: 128 + MacroTileA: 32 + MacroTileB: 128 + MacroTileMXSA: 32 + MacroTileMXSB: 128 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + MinGRIncPerMfma: 3 + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonDTLTailLoopMXSA: true + NonDTLTailLoopMXSB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 4 + NonTemporalC: 0 + NonTemporalD: 4 + NonTemporalE: 0 + NonTemporalMXSA: 0 + 
NonTemporalMXSB: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 16 + NumLdsBlk: 2 + NumLoadsA: 1 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsCoalescedMXSA: 1 + NumLoadsCoalescedMXSB: 1 + NumLoadsMXSA: 1 + NumLoadsMXSB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 4 + NumLoadsPerpendicularMXSA: 1 + NumLoadsPerpendicularMXSB: 1 + NumThreads: 256 + NumTotalPackedLoadsA: -1 + NumTotalPackedLoadsB: -1 + NumTotalPackedLoadsMXSA: -1 + NumTotalPackedLoadsMXSB: -1 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGROverBarrier: 0 + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 43 + SolutionNameMin: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT32x128x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT2_2_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM16_WGMXCC2_WGMXCCGn1 + SourceSwap: false + SpaceFillingAlgo: [] + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 4 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 4 + SubGroup1: 64 + SubGroupA: 4 + SubGroupB: 64 + SubGroupMXSA: 4 + SubGroupMXSB: 64 + 
SuppressNoLoadLoop: false + SwapGlobalReadOrder: 0 + TailloopInNll: false + ThreadTile: [1, 1] + ThreadTile0: 8 + ThreadTile1: 2 + ThreadTileA: 8 + ThreadTileB: 2 + ThreadTileMXSA: 8 + ThreadTileMXSB: 2 + TransposeLDS: 1 + TransposeLDSMetadata: -1 + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMXSA: true + UnrollMajorLDSMXSB: true + Use64bShadowLimit: false + Use64bShadowLimitMX: false + UseCustomMainLoopSchedule: 0 + UseDirect32XEmulation: false + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: false + UseGeneralizedNLCOneA: false + UseGeneralizedNLCOneB: false + UseGeneralizedNLCOneMXSA: false + UseGeneralizedNLCOneMXSB: false + UseGeneralizedNLCOneMetadata: false + UseInstOffsetForGRO: 0 + UseMFMAF32XEmulation: false + UsePLRPack: 0 + UseSgprForGRO: -1 + UseSubtileImpl: true + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 1 + VectorWidthMXSA: 1 + VectorWidthMXSB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMXSA: 0 + WaveSeparateGlobalReadMXSB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 16 + WorkGroupMappingXCC: 2 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 256 + _DepthUA: 256 + _DepthUB: 256 + _DepthUMXSA: 8 + _DepthUMXSB: 8 + _DepthUMetadata: 256 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + enableLDSTrMXSA: false + enableLDSTrMXSB: false + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + tailLoopOptMXSA: false + tailLoopOptMXSB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: false + 
ActivationFused: true + AdaptiveGemm: 0 + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1DivByMT1LowbitGT1: 0 + AssertFree1ElementMultiple: 1 + AssertKRingShiftTailWrapOnly: 0 + AssertSummationElementMultiple: 256 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BAddrInterleave: false + BaseName: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT96VdJ139l5ExTu5vrbSyr9YIDnqGsFY6rZMfGBgYGwyok= + BufferLoad: true + BufferStore: 1 + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 256 + DirectToLds: 1 + DirectToLdsA: true + DirectToLdsB: true + DirectToLdsMXSA: true + DirectToLdsMXSB: true + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprMXSA: false + DirectToVgprMXSB: false + DirectToVgprSparseMetadata: false + DtlPlusLdsBuf: 0 + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: false + ExpertSchedulingMode: 0 + ExtraLatencyForLR: 0 + ExtraMiLatencyLeft: -1 + ForceDisableShadowInit: false + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 32 + GlobalReadVectorWidthB: 32 + GlobalReadVectorWidthMXSA: 3 + GlobalReadVectorWidthMXSB: 4 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + KRingShift: false + Kernel: true + KernelLanguage: Assembly + KernelNameMin: 
Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT96x128x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT6_2_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_16_1 + LDSTrInst: false + LSCA: 256 + LSCB: 256 + LSCMXSA: 8 + LSCMXSB: 8 + LSPA: 32 + LSPB: 32 + LSPMXSA: 96 + LSPMXSB: 128 + LVCA: 8 + LVCB: 8 + LVCMXSA: 3 + LVCMXSB: 2 + LVPA: 1 + LVPB: 1 + LVPMXSA: 32 + LVPMXSB: 32 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsBlockSizePerPadMXSA: 0 + LdsBlockSizePerPadMXSB: 0 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 63232 + LdsInitCVgprs: false + LdsNumBytes: 63232 + LdsNumElementsAlignedA: 12288 + LdsNumElementsAlignedB: 16384 + LdsNumElementsAlignedMXSA: 768 + LdsNumElementsAlignedMXSB: 1024 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 32768 + LdsOffsetB: 14080 + LdsOffsetB_Blk: 46848 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMXSA: 12288 + LdsOffsetMXSA_Blk: 45056 + LdsOffsetMXSB: 13056 + LdsOffsetMXSB_Blk: 45824 + LdsOffsetMetadata: 14080 + LdsOffsetMetadata_Blk: 46848 + LdsPadA: 0 + LdsPadB: 0 + LdsPadMXSA: 0 + LdsPadMXSB: 0 + LdsPadMetadata: 0 + LocalReadVectorWidth: -1 + LocalReadVectorWidthA: 32 + LocalReadVectorWidthB: 32 + LocalReadVectorWidthMXSA: 1 + LocalReadVectorWidthMXSB: 1 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LocalWriteUseSgprMXSA: true + LocalWriteUseSgprMXSB: true + LoopIters: 2 + LoopUnroll: 256 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + 
MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMXSA: 1 + MIInputPerThreadMXSB: 1 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [1, 4] + MIWaveTile: [6, 2] + MIWaveTileA: 6 + MIWaveTileB: 2 + MIWaveTileMXSA: 6 + MIWaveTileMXSB: 2 + MIWaveTileMetadata: 0 + MacroTile0: 96 + MacroTile1: 128 + MacroTileA: 96 + MacroTileB: 128 + MacroTileMXSA: 96 + MacroTileMXSB: 128 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + MinGRIncPerMfma: 3 + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonDTLTailLoopMXSA: true + NonDTLTailLoopMXSB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 4 + NonTemporalC: 0 + NonTemporalD: 4 + NonTemporalE: 0 + NonTemporalMXSA: 0 + NonTemporalMXSB: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 48 + NumGlobalWriteVectorsPerThread: 48 + NumLdsBlk: 2 + NumLoadsA: 3 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsCoalescedMXSA: 1 + NumLoadsCoalescedMXSB: 1 + NumLoadsMXSA: 1 + NumLoadsMXSB: 1 + NumLoadsPerpendicularA: 3 + NumLoadsPerpendicularB: 4 + NumLoadsPerpendicularMXSA: 1 + NumLoadsPerpendicularMXSB: 1 + NumThreads: 256 + NumTotalPackedLoadsA: -1 + NumTotalPackedLoadsB: -1 + NumTotalPackedLoadsMXSA: -1 + NumTotalPackedLoadsMXSB: -1 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGROverBarrier: 0 + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 44 + SolutionNameMin: 
Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT96x128x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT6_2_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM16_WGMXCC2_WGMXCCGn1 + SourceSwap: false + SpaceFillingAlgo: [] + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 4 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 4 + SubGroup1: 64 + SubGroupA: 4 + SubGroupB: 64 + SubGroupMXSA: 4 + SubGroupMXSB: 64 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: 0 + TailloopInNll: false + ThreadTile: [1, 1] + ThreadTile0: 24 + ThreadTile1: 2 + ThreadTileA: 24 + ThreadTileB: 2 + ThreadTileMXSA: 24 + ThreadTileMXSB: 2 + TransposeLDS: 1 + TransposeLDSMetadata: -1 + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMXSA: true + UnrollMajorLDSMXSB: true + Use64bShadowLimit: false + Use64bShadowLimitMX: false + UseCustomMainLoopSchedule: 0 + UseDirect32XEmulation: false + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: false + UseGeneralizedNLCOneA: false + UseGeneralizedNLCOneB: false + UseGeneralizedNLCOneMXSA: false + UseGeneralizedNLCOneMXSB: false + UseGeneralizedNLCOneMetadata: false + UseInstOffsetForGRO: 0 + UseMFMAF32XEmulation: false + UsePLRPack: 0 + UseSgprForGRO: -1 + UseSubtileImpl: true + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 1 + 
VectorWidthMXSA: 1 + VectorWidthMXSB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMXSA: 0 + WaveSeparateGlobalReadMXSB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 16 + WorkGroupMappingXCC: 2 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 256 + _DepthUA: 256 + _DepthUB: 256 + _DepthUMXSA: 8 + _DepthUMXSB: 8 + _DepthUMetadata: 256 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + enableLDSTrMXSA: false + enableLDSTrMXSB: false + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + tailLoopOptMXSA: false + tailLoopOptMXSB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AdaptiveGemm: 0 + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1DivByMT1LowbitGT1: 0 + AssertFree1ElementMultiple: 1 + AssertKRingShiftTailWrapOnly: 0 + AssertSummationElementMultiple: 256 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BAddrInterleave: false + BaseName: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT32pDTSO9Ei8iprYUZVwZ8MVK8XTu8S8ugrlkQJj7C1FJQ= + BufferLoad: true + BufferStore: 1 + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 256 + DirectToLds: 1 + DirectToLdsA: true + DirectToLdsB: true + DirectToLdsMXSA: true + DirectToLdsMXSB: true + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprMXSA: false + DirectToVgprMXSB: false + DirectToVgprSparseMetadata: false + DtlPlusLdsBuf: 0 
+ EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: false + ExpertSchedulingMode: 0 + ExtraLatencyForLR: 0 + ExtraMiLatencyLeft: -1 + ForceDisableShadowInit: false + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 32 + GlobalReadVectorWidthB: 32 + GlobalReadVectorWidthMXSA: 1 + GlobalReadVectorWidthMXSB: 4 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + KRingShift: false + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT32x256x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT2_4_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_16_1 + LDSTrInst: false + LSCA: 256 + LSCB: 256 + LSCMXSA: 8 + LSCMXSB: 8 + LSPA: 32 + LSPB: 32 + LSPMXSA: 32 + LSPMXSB: 128 + LVCA: 8 + LVCB: 8 + LVCMXSA: 8 + LVCMXSB: 2 + LVPA: 1 + LVPB: 1 + LVPMXSA: 32 + LVPMXSB: 32 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsBlockSizePerPadMXSA: 0 + LdsBlockSizePerPadMXSB: 0 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 104704 + LdsInitCVgprs: false + LdsNumBytes: 104704 + LdsNumElementsAlignedA: 4096 + 
LdsNumElementsAlignedB: 32768 + LdsNumElementsAlignedMXSA: 256 + LdsNumElementsAlignedMXSB: 2048 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 6400 + LdsOffsetB_Blk: 71936 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMXSA: 4096 + LdsOffsetMXSA_Blk: 69632 + LdsOffsetMXSB: 4352 + LdsOffsetMXSB_Blk: 69888 + LdsOffsetMetadata: 6400 + LdsOffsetMetadata_Blk: 71936 + LdsPadA: 0 + LdsPadB: 0 + LdsPadMXSA: 0 + LdsPadMXSB: 0 + LdsPadMetadata: 0 + LocalReadVectorWidth: -1 + LocalReadVectorWidthA: 32 + LocalReadVectorWidthB: 32 + LocalReadVectorWidthMXSA: 1 + LocalReadVectorWidthMXSB: 1 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LocalWriteUseSgprMXSA: true + LocalWriteUseSgprMXSB: true + LoopIters: 2 + LoopUnroll: 256 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMXSA: 1 + MIInputPerThreadMXSB: 1 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [1, 4] + MIWaveTile: [2, 4] + MIWaveTileA: 2 + MIWaveTileB: 4 + MIWaveTileMXSA: 2 + MIWaveTileMXSB: 4 + MIWaveTileMetadata: 0 + MacroTile0: 32 + MacroTile1: 256 + MacroTileA: 32 + MacroTileB: 256 + MacroTileMXSA: 32 + MacroTileMXSB: 256 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + MinGRIncPerMfma: 3 + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonDTLTailLoopMXSA: true + NonDTLTailLoopMXSB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 4 + NonTemporalC: 0 + NonTemporalD: 4 + NonTemporalE: 0 + NonTemporalMXSA: 0 + 
NonTemporalMXSB: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 32 + NumLdsBlk: 2 + NumLoadsA: 1 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsCoalescedMXSA: 1 + NumLoadsCoalescedMXSB: 1 + NumLoadsMXSA: 1 + NumLoadsMXSB: 2 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 8 + NumLoadsPerpendicularMXSA: 1 + NumLoadsPerpendicularMXSB: 2 + NumThreads: 256 + NumTotalPackedLoadsA: -1 + NumTotalPackedLoadsB: -1 + NumTotalPackedLoadsMXSA: -1 + NumTotalPackedLoadsMXSB: -1 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGROverBarrier: 0 + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 45 + SolutionNameMin: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT32x256x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT2_4_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM16_WGMXCC2_WGMXCCGn1 + SourceSwap: false + SpaceFillingAlgo: [] + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 4 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 4 + SubGroup1: 64 + SubGroupA: 4 + SubGroupB: 64 + SubGroupMXSA: 4 + SubGroupMXSB: 64 + 
SuppressNoLoadLoop: false + SwapGlobalReadOrder: 0 + TailloopInNll: false + ThreadTile: [1, 1] + ThreadTile0: 8 + ThreadTile1: 4 + ThreadTileA: 8 + ThreadTileB: 4 + ThreadTileMXSA: 8 + ThreadTileMXSB: 4 + TransposeLDS: 1 + TransposeLDSMetadata: -1 + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMXSA: true + UnrollMajorLDSMXSB: true + Use64bShadowLimit: false + Use64bShadowLimitMX: false + UseCustomMainLoopSchedule: 0 + UseDirect32XEmulation: false + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: false + UseGeneralizedNLCOneA: false + UseGeneralizedNLCOneB: false + UseGeneralizedNLCOneMXSA: false + UseGeneralizedNLCOneMXSB: false + UseGeneralizedNLCOneMetadata: false + UseInstOffsetForGRO: 0 + UseMFMAF32XEmulation: false + UsePLRPack: 0 + UseSgprForGRO: -1 + UseSubtileImpl: true + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 1 + VectorWidthMXSA: 1 + VectorWidthMXSB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMXSA: 0 + WaveSeparateGlobalReadMXSB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 16 + WorkGroupMappingXCC: 2 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 256 + _DepthUA: 256 + _DepthUB: 256 + _DepthUMXSA: 8 + _DepthUMXSB: 8 + _DepthUMetadata: 256 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + enableLDSTrMXSA: false + enableLDSTrMXSB: false + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + tailLoopOptMXSA: false + tailLoopOptMXSB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: false + 
ActivationFused: true + AdaptiveGemm: 0 + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1DivByMT1LowbitGT1: 0 + AssertFree1ElementMultiple: 1 + AssertKRingShiftTailWrapOnly: 0 + AssertSummationElementMultiple: 256 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BAddrInterleave: false + BaseName: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT96Cj2Z80w1PkWDv8Q4NtOmErGoRYnlYoT1xjNw40Qn6y8= + BufferLoad: true + BufferStore: 1 + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 256 + DirectToLds: 1 + DirectToLdsA: true + DirectToLdsB: true + DirectToLdsMXSA: true + DirectToLdsMXSB: true + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprMXSA: false + DirectToVgprMXSB: false + DirectToVgprSparseMetadata: false + DtlPlusLdsBuf: 0 + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: false + ExpertSchedulingMode: 0 + ExtraLatencyForLR: 0 + ExtraMiLatencyLeft: -1 + ForceDisableShadowInit: false + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 32 + GlobalReadVectorWidthB: 32 + GlobalReadVectorWidthMXSA: 3 + GlobalReadVectorWidthMXSB: 4 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + KRingShift: false + Kernel: true + KernelLanguage: Assembly + KernelNameMin: 
Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT96x256x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT6_4_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_16_1 + LDSTrInst: false + LSCA: 256 + LSCB: 256 + LSCMXSA: 8 + LSCMXSB: 8 + LSPA: 32 + LSPB: 32 + LSPMXSA: 96 + LSPMXSB: 128 + LVCA: 8 + LVCB: 8 + LVCMXSA: 3 + LVCMXSB: 2 + LVPA: 1 + LVPB: 1 + LVPMXSA: 32 + LVPMXSB: 32 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsBlockSizePerPadMXSA: 0 + LdsBlockSizePerPadMXSB: 0 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 113408 + LdsInitCVgprs: false + LdsNumBytes: 113408 + LdsNumElementsAlignedA: 12288 + LdsNumElementsAlignedB: 32768 + LdsNumElementsAlignedMXSA: 768 + LdsNumElementsAlignedMXSB: 2048 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 15104 + LdsOffsetB_Blk: 80640 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMXSA: 12288 + LdsOffsetMXSA_Blk: 77824 + LdsOffsetMXSB: 13056 + LdsOffsetMXSB_Blk: 78592 + LdsOffsetMetadata: 15104 + LdsOffsetMetadata_Blk: 80640 + LdsPadA: 0 + LdsPadB: 0 + LdsPadMXSA: 0 + LdsPadMXSB: 0 + LdsPadMetadata: 0 + LocalReadVectorWidth: -1 + LocalReadVectorWidthA: 32 + LocalReadVectorWidthB: 32 + LocalReadVectorWidthMXSA: 1 + LocalReadVectorWidthMXSB: 1 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LocalWriteUseSgprMXSA: true + LocalWriteUseSgprMXSB: true + LoopIters: 2 + LoopUnroll: 256 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + 
MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMXSA: 1 + MIInputPerThreadMXSB: 1 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [1, 4] + MIWaveTile: [6, 4] + MIWaveTileA: 6 + MIWaveTileB: 4 + MIWaveTileMXSA: 6 + MIWaveTileMXSB: 4 + MIWaveTileMetadata: 0 + MacroTile0: 96 + MacroTile1: 256 + MacroTileA: 96 + MacroTileB: 256 + MacroTileMXSA: 96 + MacroTileMXSB: 256 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + MinGRIncPerMfma: 3 + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonDTLTailLoopMXSA: true + NonDTLTailLoopMXSB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 4 + NonTemporalC: 0 + NonTemporalD: 4 + NonTemporalE: 0 + NonTemporalMXSA: 0 + NonTemporalMXSB: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 96 + NumGlobalWriteVectorsPerThread: 96 + NumLdsBlk: 2 + NumLoadsA: 3 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsCoalescedMXSA: 1 + NumLoadsCoalescedMXSB: 1 + NumLoadsMXSA: 1 + NumLoadsMXSB: 2 + NumLoadsPerpendicularA: 3 + NumLoadsPerpendicularB: 8 + NumLoadsPerpendicularMXSA: 1 + NumLoadsPerpendicularMXSB: 2 + NumThreads: 256 + NumTotalPackedLoadsA: -1 + NumTotalPackedLoadsB: -1 + NumTotalPackedLoadsMXSA: -1 + NumTotalPackedLoadsMXSB: -1 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGROverBarrier: 0 + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 46 + SolutionNameMin: 
Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT96x256x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT6_4_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM16_WGMXCC2_WGMXCCGn1 + SourceSwap: false + SpaceFillingAlgo: [] + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 4 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 4 + SubGroup1: 64 + SubGroupA: 4 + SubGroupB: 64 + SubGroupMXSA: 4 + SubGroupMXSB: 64 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: 0 + TailloopInNll: false + ThreadTile: [1, 1] + ThreadTile0: 24 + ThreadTile1: 4 + ThreadTileA: 24 + ThreadTileB: 4 + ThreadTileMXSA: 24 + ThreadTileMXSB: 4 + TransposeLDS: 1 + TransposeLDSMetadata: -1 + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMXSA: true + UnrollMajorLDSMXSB: true + Use64bShadowLimit: false + Use64bShadowLimitMX: false + UseCustomMainLoopSchedule: 0 + UseDirect32XEmulation: false + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: false + UseGeneralizedNLCOneA: false + UseGeneralizedNLCOneB: false + UseGeneralizedNLCOneMXSA: false + UseGeneralizedNLCOneMXSB: false + UseGeneralizedNLCOneMetadata: false + UseInstOffsetForGRO: 0 + UseMFMAF32XEmulation: false + UsePLRPack: 0 + UseSgprForGRO: -1 + UseSubtileImpl: true + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 1 + 
VectorWidthMXSA: 1 + VectorWidthMXSB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMXSA: 0 + WaveSeparateGlobalReadMXSB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 16 + WorkGroupMappingXCC: 2 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 256 + _DepthUA: 256 + _DepthUB: 256 + _DepthUMXSA: 8 + _DepthUMXSB: 8 + _DepthUMetadata: 256 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + enableLDSTrMXSA: false + enableLDSTrMXSB: false + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + tailLoopOptMXSA: false + tailLoopOptMXSB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AdaptiveGemm: 0 + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1DivByMT1LowbitGT1: 0 + AssertFree1ElementMultiple: 1 + AssertKRingShiftTailWrapOnly: 0 + AssertSummationElementMultiple: 256 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BAddrInterleave: false + BaseName: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT16rkdIx0xJ9AdmxSLTSuPL_HwvXURN3d9JBsT0i2N2r4A= + BufferLoad: true + BufferStore: 1 + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 256 + DirectToLds: 1 + DirectToLdsA: true + DirectToLdsB: true + DirectToLdsMXSA: true + DirectToLdsMXSB: true + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprMXSA: false + DirectToVgprMXSB: false + DirectToVgprSparseMetadata: false + DtlPlusLdsBuf: 0 
+ EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: false + ExpertSchedulingMode: 0 + ExtraLatencyForLR: 0 + ExtraMiLatencyLeft: -1 + ForceDisableShadowInit: false + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 32 + GlobalReadVectorWidthB: 32 + GlobalReadVectorWidthMXSA: 5 + GlobalReadVectorWidthMXSB: 4 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + KRingShift: false + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT160x256x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT10_4_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_16_1 + LDSTrInst: false + LSCA: 256 + LSCB: 256 + LSCMXSA: 8 + LSCMXSB: 8 + LSPA: 32 + LSPB: 32 + LSPMXSA: 160 + LSPMXSB: 128 + LVCA: 8 + LVCB: 8 + LVCMXSA: 2 + LVCMXSB: 2 + LVPA: 1 + LVPB: 1 + LVPMXSA: 32 + LVPMXSB: 32 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsBlockSizePerPadMXSA: 0 + LdsBlockSizePerPadMXSB: 0 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 122112 + LdsInitCVgprs: false + LdsNumBytes: 122112 + LdsNumElementsAlignedA: 20480 
+ LdsNumElementsAlignedB: 32768 + LdsNumElementsAlignedMXSA: 1280 + LdsNumElementsAlignedMXSB: 2048 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 23808 + LdsOffsetB_Blk: 89344 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMXSA: 20480 + LdsOffsetMXSA_Blk: 86016 + LdsOffsetMXSB: 21760 + LdsOffsetMXSB_Blk: 87296 + LdsOffsetMetadata: 23808 + LdsOffsetMetadata_Blk: 89344 + LdsPadA: 0 + LdsPadB: 0 + LdsPadMXSA: 0 + LdsPadMXSB: 0 + LdsPadMetadata: 0 + LocalReadVectorWidth: -1 + LocalReadVectorWidthA: 32 + LocalReadVectorWidthB: 32 + LocalReadVectorWidthMXSA: 1 + LocalReadVectorWidthMXSB: 1 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LocalWriteUseSgprMXSA: true + LocalWriteUseSgprMXSB: true + LoopIters: 2 + LoopUnroll: 256 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMXSA: 1 + MIInputPerThreadMXSB: 1 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [1, 4] + MIWaveTile: [10, 4] + MIWaveTileA: 10 + MIWaveTileB: 4 + MIWaveTileMXSA: 10 + MIWaveTileMXSB: 4 + MIWaveTileMetadata: 0 + MacroTile0: 160 + MacroTile1: 256 + MacroTileA: 160 + MacroTileB: 256 + MacroTileMXSA: 160 + MacroTileMXSB: 256 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + MinGRIncPerMfma: 3 + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonDTLTailLoopMXSA: true + NonDTLTailLoopMXSB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 4 + NonTemporalC: 0 + NonTemporalD: 4 + NonTemporalE: 0 + NonTemporalMXSA: 0 + 
NonTemporalMXSB: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 160 + NumGlobalWriteVectorsPerThread: 160 + NumLdsBlk: 2 + NumLoadsA: 5 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsCoalescedMXSA: 1 + NumLoadsCoalescedMXSB: 1 + NumLoadsMXSA: 1 + NumLoadsMXSB: 2 + NumLoadsPerpendicularA: 5 + NumLoadsPerpendicularB: 8 + NumLoadsPerpendicularMXSA: 1 + NumLoadsPerpendicularMXSB: 2 + NumThreads: 256 + NumTotalPackedLoadsA: -1 + NumTotalPackedLoadsB: -1 + NumTotalPackedLoadsMXSA: -1 + NumTotalPackedLoadsMXSB: -1 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGROverBarrier: 0 + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 47 + SolutionNameMin: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT160x256x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT10_4_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM16_WGMXCC2_WGMXCCGn1 + SourceSwap: false + SpaceFillingAlgo: [] + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 4 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 4 + SubGroup1: 64 + SubGroupA: 4 + SubGroupB: 64 + SubGroupMXSA: 4 + SubGroupMXSB: 
64 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: 0 + TailloopInNll: false + ThreadTile: [1, 1] + ThreadTile0: 40 + ThreadTile1: 4 + ThreadTileA: 40 + ThreadTileB: 4 + ThreadTileMXSA: 40 + ThreadTileMXSB: 4 + TransposeLDS: 1 + TransposeLDSMetadata: -1 + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMXSA: true + UnrollMajorLDSMXSB: true + Use64bShadowLimit: false + Use64bShadowLimitMX: false + UseCustomMainLoopSchedule: 0 + UseDirect32XEmulation: false + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: false + UseGeneralizedNLCOneA: false + UseGeneralizedNLCOneB: false + UseGeneralizedNLCOneMXSA: false + UseGeneralizedNLCOneMXSB: false + UseGeneralizedNLCOneMetadata: false + UseInstOffsetForGRO: 0 + UseMFMAF32XEmulation: false + UsePLRPack: 0 + UseSgprForGRO: -1 + UseSubtileImpl: true + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 1 + VectorWidthMXSA: 1 + VectorWidthMXSB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMXSA: 0 + WaveSeparateGlobalReadMXSB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 16 + WorkGroupMappingXCC: 2 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 256 + _DepthUA: 256 + _DepthUB: 256 + _DepthUMXSA: 8 + _DepthUMXSB: 8 + _DepthUMetadata: 256 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + enableLDSTrMXSA: false + enableLDSTrMXSB: false + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + tailLoopOptMXSA: false + tailLoopOptMXSB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: 
false + ActivationFused: true + AdaptiveGemm: 0 + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1DivByMT1LowbitGT1: 0 + AssertFree1ElementMultiple: 1 + AssertKRingShiftTailWrapOnly: 0 + AssertSummationElementMultiple: 256 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BAddrInterleave: false + BaseName: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT28HX9AX04H5gx2Hnt9CFkwE4Io5BNr8Him7sxPkQ6FxSI= + BufferLoad: true + BufferStore: 1 + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 256 + DirectToLds: 1 + DirectToLdsA: true + DirectToLdsB: true + DirectToLdsMXSA: true + DirectToLdsMXSB: true + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprMXSA: false + DirectToVgprMXSB: false + DirectToVgprSparseMetadata: false + DtlPlusLdsBuf: 0 + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: false + ExpertSchedulingMode: 0 + ExtraLatencyForLR: 0 + ExtraMiLatencyLeft: -1 + ForceDisableShadowInit: false + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 32 + GlobalReadVectorWidthB: 32 + GlobalReadVectorWidthMXSA: 9 + GlobalReadVectorWidthMXSB: 4 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + KRingShift: false + Kernel: true + KernelLanguage: Assembly + KernelNameMin: 
Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT288x256x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT18_4_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_16_1 + LDSTrInst: false + LSCA: 256 + LSCB: 256 + LSCMXSA: 8 + LSCMXSB: 8 + LSPA: 32 + LSPB: 32 + LSPMXSA: 288 + LSPMXSB: 128 + LVCA: 8 + LVCB: 8 + LVCMXSA: 1 + LVCMXSB: 2 + LVPA: 1 + LVPB: 1 + LVPMXSA: 32 + LVPMXSB: 32 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsBlockSizePerPadMXSA: 0 + LdsBlockSizePerPadMXSB: 0 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 147968 + LdsInitCVgprs: false + LdsNumBytes: 147968 + LdsNumElementsAlignedA: 36864 + LdsNumElementsAlignedB: 32768 + LdsNumElementsAlignedMXSA: 2304 + LdsNumElementsAlignedMXSB: 2048 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 73984 + LdsOffsetB: 41216 + LdsOffsetB_Blk: 115200 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMXSA: 36864 + LdsOffsetMXSA_Blk: 110848 + LdsOffsetMXSB: 39168 + LdsOffsetMXSB_Blk: 113152 + LdsOffsetMetadata: 41216 + LdsOffsetMetadata_Blk: 115200 + LdsPadA: 0 + LdsPadB: 0 + LdsPadMXSA: 0 + LdsPadMXSB: 0 + LdsPadMetadata: 0 + LocalReadVectorWidth: -1 + LocalReadVectorWidthA: 32 + LocalReadVectorWidthB: 32 + LocalReadVectorWidthMXSA: 1 + LocalReadVectorWidthMXSB: 1 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LocalWriteUseSgprMXSA: true + LocalWriteUseSgprMXSB: true + LoopIters: 2 + LoopUnroll: 256 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + 
MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMXSA: 1 + MIInputPerThreadMXSB: 1 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [1, 4] + MIWaveTile: [18, 4] + MIWaveTileA: 18 + MIWaveTileB: 4 + MIWaveTileMXSA: 18 + MIWaveTileMXSB: 4 + MIWaveTileMetadata: 0 + MacroTile0: 288 + MacroTile1: 256 + MacroTileA: 288 + MacroTileB: 256 + MacroTileMXSA: 288 + MacroTileMXSB: 256 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + MinGRIncPerMfma: 3 + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonDTLTailLoopMXSA: true + NonDTLTailLoopMXSB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 4 + NonTemporalC: 0 + NonTemporalD: 4 + NonTemporalE: 0 + NonTemporalMXSA: 0 + NonTemporalMXSB: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 288 + NumGlobalWriteVectorsPerThread: 288 + NumLdsBlk: 2 + NumLoadsA: 9 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsCoalescedMXSA: 1 + NumLoadsCoalescedMXSB: 1 + NumLoadsMXSA: 1 + NumLoadsMXSB: 2 + NumLoadsPerpendicularA: 9 + NumLoadsPerpendicularB: 8 + NumLoadsPerpendicularMXSA: 1 + NumLoadsPerpendicularMXSB: 2 + NumThreads: 256 + NumTotalPackedLoadsA: -1 + NumTotalPackedLoadsB: -1 + NumTotalPackedLoadsMXSA: -1 + NumTotalPackedLoadsMXSB: -1 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGROverBarrier: 0 + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 48 + 
SolutionNameMin: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT288x256x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT18_4_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM16_WGMXCC2_WGMXCCGn1 + SourceSwap: false + SpaceFillingAlgo: [] + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: true + StoreSyncOpt: 0 + StoreVectorWidth: 4 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 4 + SubGroup1: 64 + SubGroupA: 4 + SubGroupB: 64 + SubGroupMXSA: 4 + SubGroupMXSB: 64 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: 0 + TailloopInNll: false + ThreadTile: [1, 1] + ThreadTile0: 72 + ThreadTile1: 4 + ThreadTileA: 72 + ThreadTileB: 4 + ThreadTileMXSA: 72 + ThreadTileMXSB: 4 + TransposeLDS: 1 + TransposeLDSMetadata: -1 + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMXSA: true + UnrollMajorLDSMXSB: true + Use64bShadowLimit: false + Use64bShadowLimitMX: false + UseCustomMainLoopSchedule: 0 + UseDirect32XEmulation: false + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: false + UseGeneralizedNLCOneA: false + UseGeneralizedNLCOneB: false + UseGeneralizedNLCOneMXSA: false + UseGeneralizedNLCOneMXSB: false + UseGeneralizedNLCOneMetadata: false + UseInstOffsetForGRO: 0 + UseMFMAF32XEmulation: false + UsePLRPack: 0 + UseSgprForGRO: -1 + UseSubtileImpl: true + Valid: true + VectorStore: -1 + VectorWidthA: 1 + 
VectorWidthB: 1 + VectorWidthMXSA: 1 + VectorWidthMXSB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMXSA: 0 + WaveSeparateGlobalReadMXSB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 16 + WorkGroupMappingXCC: 2 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 256 + _DepthUA: 256 + _DepthUB: 256 + _DepthUMXSA: 8 + _DepthUMXSB: 8 + _DepthUMetadata: 256 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + enableLDSTrMXSA: false + enableLDSTrMXSB: false + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + tailLoopOptMXSA: false + tailLoopOptMXSB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AdaptiveGemm: 0 + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1DivByMT1LowbitGT1: 0 + AssertFree1ElementMultiple: 1 + AssertKRingShiftTailWrapOnly: 0 + AssertSummationElementMultiple: 256 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BAddrInterleave: false + BaseName: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT323cjBEjqTj-Mu9ZF1RsqJ73WqkJOeA3kpk9e2rgOyJEQ= + BufferLoad: true + BufferStore: 1 + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 256 + DirectToLds: 1 + DirectToLdsA: true + DirectToLdsB: true + DirectToLdsMXSA: true + DirectToLdsMXSB: true + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprMXSA: false + DirectToVgprMXSB: false + DirectToVgprSparseMetadata: false 
+ DtlPlusLdsBuf: 0 + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: false + ExpertSchedulingMode: 0 + ExtraLatencyForLR: 0 + ExtraMiLatencyLeft: -1 + ForceDisableShadowInit: false + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 32 + GlobalReadVectorWidthB: 32 + GlobalReadVectorWidthMXSA: 1 + GlobalReadVectorWidthMXSB: 12 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + KRingShift: false + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT32x384x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT2_6_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_16_1 + LDSTrInst: false + LSCA: 256 + LSCB: 256 + LSCMXSA: 8 + LSCMXSB: 8 + LSPA: 32 + LSPB: 32 + LSPMXSA: 32 + LSPMXSB: 384 + LVCA: 8 + LVCB: 8 + LVCMXSA: 8 + LVCMXSB: 1 + LVPA: 1 + LVPB: 1 + LVPMXSA: 32 + LVPMXSB: 32 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsBlockSizePerPadMXSA: 0 + LdsBlockSizePerPadMXSB: 0 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 122112 + LdsInitCVgprs: false + LdsNumBytes: 122112 + 
LdsNumElementsAlignedA: 4096 + LdsNumElementsAlignedB: 49152 + LdsNumElementsAlignedMXSA: 256 + LdsNumElementsAlignedMXSB: 3072 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 7424 + LdsOffsetB_Blk: 72960 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMXSA: 4096 + LdsOffsetMXSA_Blk: 69632 + LdsOffsetMXSB: 4352 + LdsOffsetMXSB_Blk: 69888 + LdsOffsetMetadata: 7424 + LdsOffsetMetadata_Blk: 72960 + LdsPadA: 0 + LdsPadB: 0 + LdsPadMXSA: 0 + LdsPadMXSB: 0 + LdsPadMetadata: 0 + LocalReadVectorWidth: -1 + LocalReadVectorWidthA: 32 + LocalReadVectorWidthB: 32 + LocalReadVectorWidthMXSA: 1 + LocalReadVectorWidthMXSB: 1 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LocalWriteUseSgprMXSA: true + LocalWriteUseSgprMXSB: true + LoopIters: 2 + LoopUnroll: 256 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMXSA: 1 + MIInputPerThreadMXSB: 1 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [1, 4] + MIWaveTile: [2, 6] + MIWaveTileA: 2 + MIWaveTileB: 6 + MIWaveTileMXSA: 2 + MIWaveTileMXSB: 6 + MIWaveTileMetadata: 0 + MacroTile0: 32 + MacroTile1: 384 + MacroTileA: 32 + MacroTileB: 384 + MacroTileMXSA: 32 + MacroTileMXSB: 384 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + MinGRIncPerMfma: 3 + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonDTLTailLoopMXSA: true + NonDTLTailLoopMXSB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 4 + NonTemporalC: 0 + NonTemporalD: 4 + NonTemporalE: 0 + 
NonTemporalMXSA: 0 + NonTemporalMXSB: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 48 + NumGlobalWriteVectorsPerThread: 48 + NumLdsBlk: 2 + NumLoadsA: 1 + NumLoadsB: 12 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsCoalescedMXSA: 1 + NumLoadsCoalescedMXSB: 1 + NumLoadsMXSA: 1 + NumLoadsMXSB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 12 + NumLoadsPerpendicularMXSA: 1 + NumLoadsPerpendicularMXSB: 1 + NumThreads: 256 + NumTotalPackedLoadsA: -1 + NumTotalPackedLoadsB: -1 + NumTotalPackedLoadsMXSA: -1 + NumTotalPackedLoadsMXSB: -1 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGROverBarrier: 0 + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 49 + SolutionNameMin: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT32x384x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT2_6_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM16_WGMXCC2_WGMXCCGn1 + SourceSwap: false + SpaceFillingAlgo: [] + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 4 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 4 + SubGroup1: 64 + SubGroupA: 4 + SubGroupB: 64 + SubGroupMXSA: 
4 + SubGroupMXSB: 64 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: 0 + TailloopInNll: false + ThreadTile: [1, 1] + ThreadTile0: 8 + ThreadTile1: 6 + ThreadTileA: 8 + ThreadTileB: 6 + ThreadTileMXSA: 8 + ThreadTileMXSB: 6 + TransposeLDS: 1 + TransposeLDSMetadata: -1 + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMXSA: true + UnrollMajorLDSMXSB: true + Use64bShadowLimit: false + Use64bShadowLimitMX: false + UseCustomMainLoopSchedule: 0 + UseDirect32XEmulation: false + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: false + UseGeneralizedNLCOneA: false + UseGeneralizedNLCOneB: false + UseGeneralizedNLCOneMXSA: false + UseGeneralizedNLCOneMXSB: false + UseGeneralizedNLCOneMetadata: false + UseInstOffsetForGRO: 0 + UseMFMAF32XEmulation: false + UsePLRPack: 0 + UseSgprForGRO: -1 + UseSubtileImpl: true + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 1 + VectorWidthMXSA: 1 + VectorWidthMXSB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMXSA: 0 + WaveSeparateGlobalReadMXSB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 16 + WorkGroupMappingXCC: 2 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 256 + _DepthUA: 256 + _DepthUB: 256 + _DepthUMXSA: 8 + _DepthUMXSB: 8 + _DepthUMetadata: 256 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + enableLDSTrMXSA: false + enableLDSTrMXSB: false + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + tailLoopOptMXSA: false + tailLoopOptMXSB: false + - 1LDSBuffer: 0 + ActivationAlt: false + 
ActivationFuncCall: false + ActivationFused: true + AdaptiveGemm: 0 + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1DivByMT1LowbitGT1: 0 + AssertFree1ElementMultiple: 1 + AssertKRingShiftTailWrapOnly: 0 + AssertSummationElementMultiple: 256 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BAddrInterleave: false + BaseName: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT32k1L8EqrYflL4kvotLP49gssYPXLs5aDdnaskyCeVRw4= + BufferLoad: true + BufferStore: 1 + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 256 + DirectToLds: 1 + DirectToLdsA: true + DirectToLdsB: true + DirectToLdsMXSA: true + DirectToLdsMXSB: true + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprMXSA: false + DirectToVgprMXSB: false + DirectToVgprSparseMetadata: false + DtlPlusLdsBuf: 0 + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: false + ExpertSchedulingMode: 0 + ExtraLatencyForLR: 0 + ExtraMiLatencyLeft: -1 + ForceDisableShadowInit: false + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 32 + GlobalReadVectorWidthB: 32 + GlobalReadVectorWidthMXSA: 1 + GlobalReadVectorWidthMXSB: 16 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + KRingShift: false + Kernel: true + KernelLanguage: Assembly + KernelNameMin: 
Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT32x512x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT2_8_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_16_1 + LDSTrInst: false + LSCA: 256 + LSCB: 256 + LSCMXSA: 8 + LSCMXSB: 8 + LSPA: 32 + LSPB: 32 + LSPMXSA: 32 + LSPMXSB: 512 + LVCA: 8 + LVCB: 8 + LVCMXSA: 8 + LVCMXSB: 1 + LVPA: 1 + LVPB: 1 + LVPMXSA: 32 + LVPMXSB: 32 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsBlockSizePerPadMXSA: 0 + LdsBlockSizePerPadMXSB: 0 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 147968 + LdsInitCVgprs: false + LdsNumBytes: 147968 + LdsNumElementsAlignedA: 4096 + LdsNumElementsAlignedB: 65536 + LdsNumElementsAlignedMXSA: 256 + LdsNumElementsAlignedMXSB: 4096 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 73984 + LdsOffsetB: 8448 + LdsOffsetB_Blk: 82432 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMXSA: 4096 + LdsOffsetMXSA_Blk: 78080 + LdsOffsetMXSB: 4352 + LdsOffsetMXSB_Blk: 78336 + LdsOffsetMetadata: 8448 + LdsOffsetMetadata_Blk: 82432 + LdsPadA: 0 + LdsPadB: 0 + LdsPadMXSA: 0 + LdsPadMXSB: 0 + LdsPadMetadata: 0 + LocalReadVectorWidth: -1 + LocalReadVectorWidthA: 32 + LocalReadVectorWidthB: 32 + LocalReadVectorWidthMXSA: 1 + LocalReadVectorWidthMXSB: 1 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LocalWriteUseSgprMXSA: true + LocalWriteUseSgprMXSB: true + LoopIters: 2 + LoopUnroll: 256 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + 
MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMXSA: 1 + MIInputPerThreadMXSB: 1 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [1, 4] + MIWaveTile: [2, 8] + MIWaveTileA: 2 + MIWaveTileB: 8 + MIWaveTileMXSA: 2 + MIWaveTileMXSB: 8 + MIWaveTileMetadata: 0 + MacroTile0: 32 + MacroTile1: 512 + MacroTileA: 32 + MacroTileB: 512 + MacroTileMXSA: 32 + MacroTileMXSB: 512 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + MinGRIncPerMfma: 3 + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonDTLTailLoopMXSA: true + NonDTLTailLoopMXSB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 4 + NonTemporalC: 0 + NonTemporalD: 4 + NonTemporalE: 0 + NonTemporalMXSA: 0 + NonTemporalMXSB: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 64 + NumLdsBlk: 2 + NumLoadsA: 1 + NumLoadsB: 16 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsCoalescedMXSA: 1 + NumLoadsCoalescedMXSB: 1 + NumLoadsMXSA: 1 + NumLoadsMXSB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 16 + NumLoadsPerpendicularMXSA: 1 + NumLoadsPerpendicularMXSB: 1 + NumThreads: 256 + NumTotalPackedLoadsA: -1 + NumTotalPackedLoadsB: -1 + NumTotalPackedLoadsMXSA: -1 + NumTotalPackedLoadsMXSB: -1 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGROverBarrier: 0 + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 50 + SolutionNameMin: 
Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT32x512x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT2_8_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM16_WGMXCC2_WGMXCCGn1 + SourceSwap: false + SpaceFillingAlgo: [] + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: true + StoreSyncOpt: 0 + StoreVectorWidth: 4 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 4 + SubGroup1: 64 + SubGroupA: 4 + SubGroupB: 64 + SubGroupMXSA: 4 + SubGroupMXSB: 64 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: 0 + TailloopInNll: false + ThreadTile: [1, 1] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + ThreadTileMXSA: 8 + ThreadTileMXSB: 8 + TransposeLDS: 1 + TransposeLDSMetadata: -1 + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMXSA: true + UnrollMajorLDSMXSB: true + Use64bShadowLimit: false + Use64bShadowLimitMX: false + UseCustomMainLoopSchedule: 0 + UseDirect32XEmulation: false + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: false + UseGeneralizedNLCOneA: false + UseGeneralizedNLCOneB: false + UseGeneralizedNLCOneMXSA: false + UseGeneralizedNLCOneMXSB: false + UseGeneralizedNLCOneMetadata: false + UseInstOffsetForGRO: 0 + UseMFMAF32XEmulation: false + UsePLRPack: 0 + UseSgprForGRO: -1 + UseSubtileImpl: true + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 1 + 
VectorWidthMXSA: 1 + VectorWidthMXSB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMXSA: 0 + WaveSeparateGlobalReadMXSB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 16 + WorkGroupMappingXCC: 2 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 256 + _DepthUA: 256 + _DepthUB: 256 + _DepthUMXSA: 8 + _DepthUMXSB: 8 + _DepthUMetadata: 256 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + enableLDSTrMXSA: false + enableLDSTrMXSB: false + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + tailLoopOptMXSA: false + tailLoopOptMXSB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AdaptiveGemm: 0 + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1DivByMT1LowbitGT1: 0 + AssertFree1ElementMultiple: 1 + AssertKRingShiftTailWrapOnly: 0 + AssertSummationElementMultiple: 256 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BAddrInterleave: false + BaseName: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT9680-vsketBKOwATFxdutsK5i_NcnvLOonACpwOS25BUY= + BufferLoad: true + BufferStore: 1 + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 256 + DirectToLds: 1 + DirectToLdsA: true + DirectToLdsB: true + DirectToLdsMXSA: true + DirectToLdsMXSB: true + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprMXSA: false + DirectToVgprMXSB: false + DirectToVgprSparseMetadata: false + DtlPlusLdsBuf: 0 
+ EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: false + ExpertSchedulingMode: 0 + ExtraLatencyForLR: 0 + ExtraMiLatencyLeft: -1 + ForceDisableShadowInit: false + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 32 + GlobalReadVectorWidthB: 32 + GlobalReadVectorWidthMXSA: 3 + GlobalReadVectorWidthMXSB: 12 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + KRingShift: false + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT96x384x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT6_6_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_16_1 + LDSTrInst: false + LSCA: 256 + LSCB: 256 + LSCMXSA: 8 + LSCMXSB: 8 + LSPA: 32 + LSPB: 32 + LSPMXSA: 96 + LSPMXSB: 384 + LVCA: 8 + LVCB: 8 + LVCMXSA: 3 + LVCMXSB: 1 + LVPA: 1 + LVPB: 1 + LVPMXSA: 32 + LVPMXSB: 32 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsBlockSizePerPadMXSA: 0 + LdsBlockSizePerPadMXSB: 0 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 130816 + LdsInitCVgprs: false + LdsNumBytes: 130816 + LdsNumElementsAlignedA: 12288 + 
LdsNumElementsAlignedB: 49152 + LdsNumElementsAlignedMXSA: 768 + LdsNumElementsAlignedMXSB: 3072 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 16128 + LdsOffsetB_Blk: 81664 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMXSA: 12288 + LdsOffsetMXSA_Blk: 77824 + LdsOffsetMXSB: 13056 + LdsOffsetMXSB_Blk: 78592 + LdsOffsetMetadata: 16128 + LdsOffsetMetadata_Blk: 81664 + LdsPadA: 0 + LdsPadB: 0 + LdsPadMXSA: 0 + LdsPadMXSB: 0 + LdsPadMetadata: 0 + LocalReadVectorWidth: -1 + LocalReadVectorWidthA: 32 + LocalReadVectorWidthB: 32 + LocalReadVectorWidthMXSA: 1 + LocalReadVectorWidthMXSB: 1 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LocalWriteUseSgprMXSA: true + LocalWriteUseSgprMXSB: true + LoopIters: 2 + LoopUnroll: 256 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMXSA: 1 + MIInputPerThreadMXSB: 1 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [1, 4] + MIWaveTile: [6, 6] + MIWaveTileA: 6 + MIWaveTileB: 6 + MIWaveTileMXSA: 6 + MIWaveTileMXSB: 6 + MIWaveTileMetadata: 0 + MacroTile0: 96 + MacroTile1: 384 + MacroTileA: 96 + MacroTileB: 384 + MacroTileMXSA: 96 + MacroTileMXSB: 384 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + MinGRIncPerMfma: 3 + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonDTLTailLoopMXSA: true + NonDTLTailLoopMXSB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 4 + NonTemporalC: 0 + NonTemporalD: 4 + NonTemporalE: 0 + NonTemporalMXSA: 0 + 
NonTemporalMXSB: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 144 + NumGlobalWriteVectorsPerThread: 144 + NumLdsBlk: 2 + NumLoadsA: 3 + NumLoadsB: 12 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsCoalescedMXSA: 1 + NumLoadsCoalescedMXSB: 1 + NumLoadsMXSA: 1 + NumLoadsMXSB: 1 + NumLoadsPerpendicularA: 3 + NumLoadsPerpendicularB: 12 + NumLoadsPerpendicularMXSA: 1 + NumLoadsPerpendicularMXSB: 1 + NumThreads: 256 + NumTotalPackedLoadsA: -1 + NumTotalPackedLoadsB: -1 + NumTotalPackedLoadsMXSA: -1 + NumTotalPackedLoadsMXSB: -1 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGROverBarrier: 0 + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 51 + SolutionNameMin: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT96x384x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT6_6_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM16_WGMXCC2_WGMXCCGn1 + SourceSwap: false + SpaceFillingAlgo: [] + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 4 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 4 + SubGroup1: 64 + SubGroupA: 4 + SubGroupB: 64 + SubGroupMXSA: 4 + SubGroupMXSB: 
64 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: 0 + TailloopInNll: false + ThreadTile: [1, 1] + ThreadTile0: 24 + ThreadTile1: 6 + ThreadTileA: 24 + ThreadTileB: 6 + ThreadTileMXSA: 24 + ThreadTileMXSB: 6 + TransposeLDS: 1 + TransposeLDSMetadata: -1 + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMXSA: true + UnrollMajorLDSMXSB: true + Use64bShadowLimit: false + Use64bShadowLimitMX: false + UseCustomMainLoopSchedule: 0 + UseDirect32XEmulation: false + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: false + UseGeneralizedNLCOneA: false + UseGeneralizedNLCOneB: false + UseGeneralizedNLCOneMXSA: false + UseGeneralizedNLCOneMXSB: false + UseGeneralizedNLCOneMetadata: false + UseInstOffsetForGRO: 0 + UseMFMAF32XEmulation: false + UsePLRPack: 0 + UseSgprForGRO: -1 + UseSubtileImpl: true + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 1 + VectorWidthMXSA: 1 + VectorWidthMXSB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMXSA: 0 + WaveSeparateGlobalReadMXSB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 16 + WorkGroupMappingXCC: 2 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 256 + _DepthUA: 256 + _DepthUB: 256 + _DepthUMXSA: 8 + _DepthUMXSB: 8 + _DepthUMetadata: 256 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + enableLDSTrMXSA: false + enableLDSTrMXSB: false + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + tailLoopOptMXSA: false + tailLoopOptMXSB: false +- null +- null +- null +- null +- DeviceEfficiency +- 
Prediction diff --git a/projects/hipblaslt/library/src/amd_detail/rocblaslt/src/Tensile/Logic/asm_full/gfx950/gfx950/Origami/gfx950_Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs.yaml b/projects/hipblaslt/library/src/amd_detail/rocblaslt/src/Tensile/Logic/asm_full/gfx950/gfx950/Origami/gfx950_Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs.yaml new file mode 100644 index 00000000000..b80bc8ae2ac --- /dev/null +++ b/projects/hipblaslt/library/src/amd_detail/rocblaslt/src/Tensile/Logic/asm_full/gfx950/gfx950/Origami/gfx950_Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs.yaml @@ -0,0 +1,16950 @@ +- {MinimumRequiredVersion: 5.0.0} +- gfx950 +- gfx950 +- [Device 75a0] +- Activation: true + ActivationComputeDataType: 0 + ActivationNoGuard: false + ActivationType: hipblaslt_all + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + BetaOnlyUseBias: false + BiasDataTypeList: [0, 7] + BiasSrc: D + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 21 + DataTypeA: 21 + DataTypeAmaxD: 0 + DataTypeB: 21 + DataTypeE: 7 + DataTypeMXSA: 22 + DataTypeMXSB: 22 + DestDataType: 7 + F32XdlMathOp: 0 + Gradient: false + GroupedGemm: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index01MXSA: 0 + Index01MXSB: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexAssignmentsMXSA: [3, 0, 2] + IndexAssignmentsMXSB: [3, 1, 2] + IndexAssignmentsMetadata: [3, 0, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndexUnrollM: 0 + IndexUnrollMXSA: 0 + IndexUnrollMXSB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MXBlockA: 32 + MXBlockB: 32 + MacDataTypeA: 21 + MacDataTypeB: 21 + MirrorDimsA: [] + MirrorDimsB: [] + MirrorDimsMXSA: [] + MirrorDimsMXSB: [] + MirrorDimsMetadata: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + 
NumIndicesSummation: 1 + OperationType: GEMM + OutputAmaxD: false + SetConstStrideA: [] + SetConstStrideB: [] + SetConstStrideBias: [] + SilentHighPrecisionAccumulate: false + Sparse: 0 + StochasticRounding: false + StridedBatched: true + SupportUserArgs: true + SwizzleTensorA: false + SwizzleTensorB: false + TLUA: false + TLUB: false + TLUMXSA: false + TLUMXSB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: 1 + TransposeB: 0 + UseBeta: true + UseBias: 1 + UseE: false + UseInitialStridesAB: false + UseInitialStridesCD: false + UseScaleAB: Scalar + UseScaleAlphaVec: 1 + UseScaleCD: false +- - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AdaptiveGemm: 0 + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1DivByMT1LowbitGT1: 0 + AssertFree1ElementMultiple: 1 + AssertKRingShiftTailWrapOnly: 0 + AssertSummationElementMultiple: 256 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BAddrInterleave: false + BaseName: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT64hIsnhFT2L5uw_T8ryWf3mri6PtKpwJ1E97MC3rNtxpw= + BufferLoad: true + BufferStore: 1 + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 256 + DirectToLds: 1 + DirectToLdsA: true + DirectToLdsB: true + DirectToLdsMXSA: true + DirectToLdsMXSB: true + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprMXSA: false + DirectToVgprMXSB: false + DirectToVgprSparseMetadata: false + DtlPlusLdsBuf: 0 + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: false + ExpertSchedulingMode: 0 + ExtraLatencyForLR: 0 + ExtraMiLatencyLeft: -1 + ForceDisableShadowInit: false + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 32 + 
GlobalReadVectorWidthB: 32 + GlobalReadVectorWidthMXSA: 2 + GlobalReadVectorWidthMXSB: 2 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + KRingShift: false + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT64x64x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT2_2_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 256 + LSCB: 256 + LSCMXSA: 8 + LSCMXSB: 8 + LSPA: 32 + LSPB: 32 + LSPMXSA: 64 + LSPMXSB: 64 + LVCA: 8 + LVCB: 8 + LVCMXSA: 4 + LVCMXSB: 4 + LVPA: 1 + LVPB: 1 + LVPMXSA: 32 + LVPMXSB: 32 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsBlockSizePerPadMXSA: 0 + LdsBlockSizePerPadMXSB: 0 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 50176 + LdsInitCVgprs: false + LdsNumBytes: 50176 + LdsNumElementsAlignedA: 8192 + LdsNumElementsAlignedB: 8192 + LdsNumElementsAlignedMXSA: 512 + LdsNumElementsAlignedMXSB: 512 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 32768 + LdsOffsetB: 9216 + LdsOffsetB_Blk: 41984 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMXSA: 8192 + 
LdsOffsetMXSA_Blk: 40960 + LdsOffsetMXSB: 8704 + LdsOffsetMXSB_Blk: 41472 + LdsOffsetMetadata: 9216 + LdsOffsetMetadata_Blk: 41984 + LdsPadA: 0 + LdsPadB: 0 + LdsPadMXSA: 0 + LdsPadMXSB: 0 + LdsPadMetadata: 0 + LocalReadVectorWidth: -1 + LocalReadVectorWidthA: 32 + LocalReadVectorWidthB: 32 + LocalReadVectorWidthMXSA: 1 + LocalReadVectorWidthMXSB: 1 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LocalWriteUseSgprMXSA: true + LocalWriteUseSgprMXSB: true + LoopIters: 2 + LoopUnroll: 256 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMXSA: 1 + MIInputPerThreadMXSB: 1 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [2, 2] + MIWaveTileA: 2 + MIWaveTileB: 2 + MIWaveTileMXSA: 2 + MIWaveTileMXSB: 2 + MIWaveTileMetadata: 0 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileMXSA: 64 + MacroTileMXSB: 64 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + MinGRIncPerMfma: 3 + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonDTLTailLoopMXSA: true + NonDTLTailLoopMXSB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 4 + NonTemporalE: 0 + NonTemporalMXSA: 0 + NonTemporalMXSB: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 16 + NumLdsBlk: 2 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsCoalescedMXSA: 1 + NumLoadsCoalescedMXSB: 1 + NumLoadsMXSA: 1 + 
NumLoadsMXSB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularMXSA: 1 + NumLoadsPerpendicularMXSB: 1 + NumThreads: 256 + NumTotalPackedLoadsA: -1 + NumTotalPackedLoadsB: -1 + NumTotalPackedLoadsMXSA: -1 + NumTotalPackedLoadsMXSB: -1 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGROverBarrier: 0 + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 0 + SolutionNameMin: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT64x64x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT2_2_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM16_WGMXCC2_WGMXCCGn1 + SourceSwap: false + SpaceFillingAlgo: [] + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 4 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SubGroupMXSA: 8 + SubGroupMXSB: 32 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: 0 + TailloopInNll: false + ThreadTile: [1, 1] + ThreadTile0: 8 + ThreadTile1: 2 + ThreadTileA: 8 + ThreadTileB: 2 + ThreadTileMXSA: 8 + ThreadTileMXSB: 2 + TransposeLDS: 1 + TransposeLDSMetadata: -1 + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: 
true + UnrollMajorLDSB: true + UnrollMajorLDSMXSA: true + UnrollMajorLDSMXSB: true + Use64bShadowLimit: false + Use64bShadowLimitMX: false + UseCustomMainLoopSchedule: 0 + UseDirect32XEmulation: false + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: false + UseGeneralizedNLCOneA: false + UseGeneralizedNLCOneB: false + UseGeneralizedNLCOneMXSA: false + UseGeneralizedNLCOneMXSB: false + UseGeneralizedNLCOneMetadata: false + UseInstOffsetForGRO: 0 + UseMFMAF32XEmulation: false + UsePLRPack: 0 + UseSgprForGRO: -1 + UseSubtileImpl: true + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 1 + VectorWidthMXSA: 1 + VectorWidthMXSB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMXSA: 0 + WaveSeparateGlobalReadMXSB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 16 + WorkGroupMappingXCC: 2 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 256 + _DepthUA: 256 + _DepthUB: 256 + _DepthUMXSA: 8 + _DepthUMXSB: 8 + _DepthUMetadata: 256 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + enableLDSTrMXSA: false + enableLDSTrMXSB: false + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + tailLoopOptMXSA: false + tailLoopOptMXSB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AdaptiveGemm: 0 + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1DivByMT1LowbitGT1: 0 + AssertFree1ElementMultiple: 1 + AssertKRingShiftTailWrapOnly: 0 + AssertSummationElementMultiple: 256 + AssignedDerivedParameters: true + 
AssignedProblemIndependentDerivedParameters: true + BAddrInterleave: false + BaseName: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT128AMjoHHGnVSJ1fjvuIJeKPVtejro7lpRL6TixtWnDac= + BufferLoad: true + BufferStore: 1 + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 256 + DirectToLds: 1 + DirectToLdsA: true + DirectToLdsB: true + DirectToLdsMXSA: true + DirectToLdsMXSB: true + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprMXSA: false + DirectToVgprMXSB: false + DirectToVgprSparseMetadata: false + DtlPlusLdsBuf: 0 + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: false + ExpertSchedulingMode: 0 + ExtraLatencyForLR: 0 + ExtraMiLatencyLeft: -1 + ForceDisableShadowInit: false + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 32 + GlobalReadVectorWidthB: 32 + GlobalReadVectorWidthMXSA: 4 + GlobalReadVectorWidthMXSB: 4 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + KRingShift: false + Kernel: true + KernelLanguage: Assembly + KernelNameMin: 
Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT128x128x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT4_4_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 256 + LSCB: 256 + LSCMXSA: 8 + LSCMXSB: 8 + LSPA: 32 + LSPB: 32 + LSPMXSA: 128 + LSPMXSB: 128 + LVCA: 8 + LVCB: 8 + LVCMXSA: 2 + LVCMXSB: 2 + LVPA: 1 + LVPB: 1 + LVPMXSA: 32 + LVPMXSB: 32 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsBlockSizePerPadMXSA: 0 + LdsBlockSizePerPadMXSB: 0 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 100352 + LdsInitCVgprs: false + LdsNumBytes: 100352 + LdsNumElementsAlignedA: 16384 + LdsNumElementsAlignedB: 16384 + LdsNumElementsAlignedMXSA: 1024 + LdsNumElementsAlignedMXSB: 1024 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 18432 + LdsOffsetB_Blk: 83968 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMXSA: 16384 + LdsOffsetMXSA_Blk: 81920 + LdsOffsetMXSB: 17408 + LdsOffsetMXSB_Blk: 82944 + LdsOffsetMetadata: 18432 + LdsOffsetMetadata_Blk: 83968 + LdsPadA: 0 + LdsPadB: 0 + LdsPadMXSA: 0 + LdsPadMXSB: 0 + LdsPadMetadata: 0 + LocalReadVectorWidth: -1 + LocalReadVectorWidthA: 32 + LocalReadVectorWidthB: 32 + LocalReadVectorWidthMXSA: 1 + LocalReadVectorWidthMXSB: 1 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LocalWriteUseSgprMXSA: true + LocalWriteUseSgprMXSB: true + LoopIters: 2 + LoopUnroll: 256 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + 
MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMXSA: 1 + MIInputPerThreadMXSB: 1 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MIWaveTileMXSA: 4 + MIWaveTileMXSB: 4 + MIWaveTileMetadata: 0 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileMXSA: 128 + MacroTileMXSB: 128 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + MinGRIncPerMfma: 3 + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonDTLTailLoopMXSA: true + NonDTLTailLoopMXSB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 4 + NonTemporalE: 0 + NonTemporalMXSA: 0 + NonTemporalMXSB: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 64 + NumLdsBlk: 2 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsCoalescedMXSA: 1 + NumLoadsCoalescedMXSB: 1 + NumLoadsMXSA: 1 + NumLoadsMXSB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumLoadsPerpendicularMXSA: 1 + NumLoadsPerpendicularMXSB: 1 + NumThreads: 256 + NumTotalPackedLoadsA: -1 + NumTotalPackedLoadsB: -1 + NumTotalPackedLoadsMXSA: -1 + NumTotalPackedLoadsMXSB: -1 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGROverBarrier: 0 + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 1 + SolutionNameMin: 
Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT128x128x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT4_4_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM16_WGMXCC2_WGMXCCGn1 + SourceSwap: false + SpaceFillingAlgo: [] + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 4 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SubGroupMXSA: 8 + SubGroupMXSB: 32 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: 0 + TailloopInNll: false + ThreadTile: [1, 1] + ThreadTile0: 16 + ThreadTile1: 4 + ThreadTileA: 16 + ThreadTileB: 4 + ThreadTileMXSA: 16 + ThreadTileMXSB: 4 + TransposeLDS: 1 + TransposeLDSMetadata: -1 + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMXSA: true + UnrollMajorLDSMXSB: true + Use64bShadowLimit: false + Use64bShadowLimitMX: false + UseCustomMainLoopSchedule: 0 + UseDirect32XEmulation: false + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: false + UseGeneralizedNLCOneA: false + UseGeneralizedNLCOneB: false + UseGeneralizedNLCOneMXSA: false + UseGeneralizedNLCOneMXSB: false + UseGeneralizedNLCOneMetadata: false + UseInstOffsetForGRO: 0 + UseMFMAF32XEmulation: false + UsePLRPack: 0 + UseSgprForGRO: -1 + UseSubtileImpl: true + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 1 + 
VectorWidthMXSA: 1 + VectorWidthMXSB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMXSA: 0 + WaveSeparateGlobalReadMXSB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 16 + WorkGroupMappingXCC: 2 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 256 + _DepthUA: 256 + _DepthUB: 256 + _DepthUMXSA: 8 + _DepthUMXSB: 8 + _DepthUMetadata: 256 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + enableLDSTrMXSA: false + enableLDSTrMXSB: false + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + tailLoopOptMXSA: false + tailLoopOptMXSB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AdaptiveGemm: 0 + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1DivByMT1LowbitGT1: 0 + AssertFree1ElementMultiple: 1 + AssertKRingShiftTailWrapOnly: 0 + AssertSummationElementMultiple: 256 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BAddrInterleave: false + BaseName: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT19tVFE7Mochg2Q71HzQS4tIGctt0Td3p2SmmxHjjWSYTc= + BufferLoad: true + BufferStore: 1 + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 256 + DirectToLds: 1 + DirectToLdsA: true + DirectToLdsB: true + DirectToLdsMXSA: true + DirectToLdsMXSB: true + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprMXSA: false + DirectToVgprMXSB: false + DirectToVgprSparseMetadata: false + DtlPlusLdsBuf: 0 
+ EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: false + ExpertSchedulingMode: 0 + ExtraLatencyForLR: 0 + ExtraMiLatencyLeft: -1 + ForceDisableShadowInit: false + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 32 + GlobalReadVectorWidthB: 32 + GlobalReadVectorWidthMXSA: 6 + GlobalReadVectorWidthMXSB: 6 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + KRingShift: false + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT192x192x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT6_6_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 256 + LSCB: 256 + LSCMXSA: 8 + LSCMXSB: 8 + LSPA: 32 + LSPB: 32 + LSPMXSA: 192 + LSPMXSB: 192 + LVCA: 8 + LVCB: 8 + LVCMXSA: 2 + LVCMXSB: 2 + LVPA: 1 + LVPB: 1 + LVPMXSA: 32 + LVPMXSB: 32 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsBlockSizePerPadMXSA: 0 + LdsBlockSizePerPadMXSB: 0 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 117760 + LdsInitCVgprs: false + LdsNumBytes: 117760 + LdsNumElementsAlignedA: 24576 + 
LdsNumElementsAlignedB: 24576 + LdsNumElementsAlignedMXSA: 1536 + LdsNumElementsAlignedMXSB: 1536 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 27648 + LdsOffsetB_Blk: 93184 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMXSA: 24576 + LdsOffsetMXSA_Blk: 90112 + LdsOffsetMXSB: 26112 + LdsOffsetMXSB_Blk: 91648 + LdsOffsetMetadata: 27648 + LdsOffsetMetadata_Blk: 93184 + LdsPadA: 0 + LdsPadB: 0 + LdsPadMXSA: 0 + LdsPadMXSB: 0 + LdsPadMetadata: 0 + LocalReadVectorWidth: -1 + LocalReadVectorWidthA: 32 + LocalReadVectorWidthB: 32 + LocalReadVectorWidthMXSA: 1 + LocalReadVectorWidthMXSB: 1 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LocalWriteUseSgprMXSA: true + LocalWriteUseSgprMXSB: true + LoopIters: 2 + LoopUnroll: 256 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMXSA: 1 + MIInputPerThreadMXSB: 1 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [6, 6] + MIWaveTileA: 6 + MIWaveTileB: 6 + MIWaveTileMXSA: 6 + MIWaveTileMXSB: 6 + MIWaveTileMetadata: 0 + MacroTile0: 192 + MacroTile1: 192 + MacroTileA: 192 + MacroTileB: 192 + MacroTileMXSA: 192 + MacroTileMXSB: 192 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + MinGRIncPerMfma: 3 + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonDTLTailLoopMXSA: true + NonDTLTailLoopMXSB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 4 + NonTemporalE: 0 + NonTemporalMXSA: 0 + 
NonTemporalMXSB: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 144 + NumGlobalWriteVectorsPerThread: 144 + NumLdsBlk: 2 + NumLoadsA: 6 + NumLoadsB: 6 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsCoalescedMXSA: 1 + NumLoadsCoalescedMXSB: 1 + NumLoadsMXSA: 1 + NumLoadsMXSB: 1 + NumLoadsPerpendicularA: 6 + NumLoadsPerpendicularB: 6 + NumLoadsPerpendicularMXSA: 1 + NumLoadsPerpendicularMXSB: 1 + NumThreads: 256 + NumTotalPackedLoadsA: -1 + NumTotalPackedLoadsB: -1 + NumTotalPackedLoadsMXSA: -1 + NumTotalPackedLoadsMXSB: -1 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGROverBarrier: 0 + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 2 + SolutionNameMin: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT192x192x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT6_6_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM16_WGMXCC2_WGMXCCGn1 + SourceSwap: false + SpaceFillingAlgo: [] + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 4 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SubGroupMXSA: 8 + SubGroupMXSB: 32 
+ SuppressNoLoadLoop: false + SwapGlobalReadOrder: 0 + TailloopInNll: false + ThreadTile: [1, 1] + ThreadTile0: 24 + ThreadTile1: 6 + ThreadTileA: 24 + ThreadTileB: 6 + ThreadTileMXSA: 24 + ThreadTileMXSB: 6 + TransposeLDS: 1 + TransposeLDSMetadata: -1 + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMXSA: true + UnrollMajorLDSMXSB: true + Use64bShadowLimit: false + Use64bShadowLimitMX: false + UseCustomMainLoopSchedule: 0 + UseDirect32XEmulation: false + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: false + UseGeneralizedNLCOneA: false + UseGeneralizedNLCOneB: false + UseGeneralizedNLCOneMXSA: false + UseGeneralizedNLCOneMXSB: false + UseGeneralizedNLCOneMetadata: false + UseInstOffsetForGRO: 0 + UseMFMAF32XEmulation: false + UsePLRPack: 0 + UseSgprForGRO: -1 + UseSubtileImpl: true + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 1 + VectorWidthMXSA: 1 + VectorWidthMXSB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMXSA: 0 + WaveSeparateGlobalReadMXSB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 16 + WorkGroupMappingXCC: 2 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 256 + _DepthUA: 256 + _DepthUB: 256 + _DepthUMXSA: 8 + _DepthUMXSB: 8 + _DepthUMetadata: 256 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + enableLDSTrMXSA: false + enableLDSTrMXSB: false + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + tailLoopOptMXSA: false + tailLoopOptMXSB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: 
false + ActivationFused: true + AdaptiveGemm: 0 + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1DivByMT1LowbitGT1: 0 + AssertFree1ElementMultiple: 1 + AssertKRingShiftTailWrapOnly: 0 + AssertSummationElementMultiple: 256 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BAddrInterleave: false + BaseName: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT25YgPR4XPaMfFjkFVdEJNxrCtJ2m9IQRE3ggDN5hyNKfg= + BufferLoad: true + BufferStore: 1 + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 256 + DirectToLds: 1 + DirectToLdsA: true + DirectToLdsB: true + DirectToLdsMXSA: true + DirectToLdsMXSB: true + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprMXSA: false + DirectToVgprMXSB: false + DirectToVgprSparseMetadata: false + DtlPlusLdsBuf: 0 + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: false + ExpertSchedulingMode: 0 + ExtraLatencyForLR: 0 + ExtraMiLatencyLeft: -1 + ForceDisableShadowInit: false + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 32 + GlobalReadVectorWidthB: 32 + GlobalReadVectorWidthMXSA: 4 + GlobalReadVectorWidthMXSB: 4 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + KRingShift: false + Kernel: true + KernelLanguage: Assembly + KernelNameMin: 
Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT256x256x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT8_8_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 256 + LSCB: 256 + LSCMXSA: 8 + LSCMXSB: 8 + LSPA: 32 + LSPB: 32 + LSPMXSA: 128 + LSPMXSB: 128 + LVCA: 8 + LVCB: 8 + LVCMXSA: 2 + LVCMXSB: 2 + LVPA: 1 + LVPB: 1 + LVPMXSA: 32 + LVPMXSB: 32 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsBlockSizePerPadMXSA: 0 + LdsBlockSizePerPadMXSB: 0 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 139264 + LdsInitCVgprs: false + LdsNumBytes: 139264 + LdsNumElementsAlignedA: 32768 + LdsNumElementsAlignedB: 32768 + LdsNumElementsAlignedMXSA: 2048 + LdsNumElementsAlignedMXSB: 2048 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 69632 + LdsOffsetB: 36864 + LdsOffsetB_Blk: 106496 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMXSA: 32768 + LdsOffsetMXSA_Blk: 102400 + LdsOffsetMXSB: 34816 + LdsOffsetMXSB_Blk: 104448 + LdsOffsetMetadata: 36864 + LdsOffsetMetadata_Blk: 106496 + LdsPadA: 0 + LdsPadB: 0 + LdsPadMXSA: 0 + LdsPadMXSB: 0 + LdsPadMetadata: 0 + LocalReadVectorWidth: -1 + LocalReadVectorWidthA: 32 + LocalReadVectorWidthB: 32 + LocalReadVectorWidthMXSA: 1 + LocalReadVectorWidthMXSB: 1 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LocalWriteUseSgprMXSA: true + LocalWriteUseSgprMXSB: true + LoopIters: 2 + LoopUnroll: 256 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + 
MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMXSA: 1 + MIInputPerThreadMXSB: 1 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [8, 8] + MIWaveTileA: 8 + MIWaveTileB: 8 + MIWaveTileMXSA: 8 + MIWaveTileMXSB: 8 + MIWaveTileMetadata: 0 + MacroTile0: 256 + MacroTile1: 256 + MacroTileA: 256 + MacroTileB: 256 + MacroTileMXSA: 256 + MacroTileMXSB: 256 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + MinGRIncPerMfma: 3 + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonDTLTailLoopMXSA: true + NonDTLTailLoopMXSB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 4 + NonTemporalE: 0 + NonTemporalMXSA: 0 + NonTemporalMXSB: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 256 + NumGlobalWriteVectorsPerThread: 256 + NumLdsBlk: 2 + NumLoadsA: 8 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsCoalescedMXSA: 1 + NumLoadsCoalescedMXSB: 1 + NumLoadsMXSA: 2 + NumLoadsMXSB: 2 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 8 + NumLoadsPerpendicularMXSA: 2 + NumLoadsPerpendicularMXSB: 2 + NumThreads: 256 + NumTotalPackedLoadsA: -1 + NumTotalPackedLoadsB: -1 + NumTotalPackedLoadsMXSA: -1 + NumTotalPackedLoadsMXSB: -1 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGROverBarrier: 0 + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 3 + 
SolutionNameMin: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT256x256x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT8_8_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM16_WGMXCC2_WGMXCCGn1 + SourceSwap: false + SpaceFillingAlgo: [] + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: true + StoreSyncOpt: 0 + StoreVectorWidth: 4 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SubGroupMXSA: 8 + SubGroupMXSB: 32 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: 0 + TailloopInNll: false + ThreadTile: [1, 1] + ThreadTile0: 32 + ThreadTile1: 8 + ThreadTileA: 32 + ThreadTileB: 8 + ThreadTileMXSA: 32 + ThreadTileMXSB: 8 + TransposeLDS: 1 + TransposeLDSMetadata: -1 + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMXSA: true + UnrollMajorLDSMXSB: true + Use64bShadowLimit: false + Use64bShadowLimitMX: false + UseCustomMainLoopSchedule: 0 + UseDirect32XEmulation: false + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: false + UseGeneralizedNLCOneA: false + UseGeneralizedNLCOneB: false + UseGeneralizedNLCOneMXSA: false + UseGeneralizedNLCOneMXSB: false + UseGeneralizedNLCOneMetadata: false + UseInstOffsetForGRO: 0 + UseMFMAF32XEmulation: false + UsePLRPack: 0 + UseSgprForGRO: -1 + UseSubtileImpl: true + Valid: true + VectorStore: -1 + VectorWidthA: 1 + 
VectorWidthB: 1 + VectorWidthMXSA: 1 + VectorWidthMXSB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMXSA: 0 + WaveSeparateGlobalReadMXSB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 16 + WorkGroupMappingXCC: 2 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 256 + _DepthUA: 256 + _DepthUB: 256 + _DepthUMXSA: 8 + _DepthUMXSB: 8 + _DepthUMetadata: 256 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + enableLDSTrMXSA: false + enableLDSTrMXSB: false + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + tailLoopOptMXSA: false + tailLoopOptMXSB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AdaptiveGemm: 0 + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1DivByMT1LowbitGT1: 0 + AssertFree1ElementMultiple: 1 + AssertKRingShiftTailWrapOnly: 0 + AssertSummationElementMultiple: 256 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BAddrInterleave: false + BaseName: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT64yR3-UwlktRfU59ACgTfmpcg_Yk6f2Lb1knA4QU-BsnY= + BufferLoad: true + BufferStore: 1 + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 256 + DirectToLds: 1 + DirectToLdsA: true + DirectToLdsB: true + DirectToLdsMXSA: true + DirectToLdsMXSB: true + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprMXSA: false + DirectToVgprMXSB: false + DirectToVgprSparseMetadata: false + 
DtlPlusLdsBuf: 0 + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: false + ExpertSchedulingMode: 0 + ExtraLatencyForLR: 0 + ExtraMiLatencyLeft: -1 + ForceDisableShadowInit: false + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 32 + GlobalReadVectorWidthB: 32 + GlobalReadVectorWidthMXSA: 2 + GlobalReadVectorWidthMXSB: 4 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + KRingShift: false + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT64x128x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT2_4_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 256 + LSCB: 256 + LSCMXSA: 8 + LSCMXSB: 8 + LSPA: 32 + LSPB: 32 + LSPMXSA: 64 + LSPMXSB: 128 + LVCA: 8 + LVCB: 8 + LVCMXSA: 4 + LVCMXSB: 2 + LVPA: 1 + LVPB: 1 + LVPMXSA: 32 + LVPMXSB: 32 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsBlockSizePerPadMXSA: 0 + LdsBlockSizePerPadMXSB: 0 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 58880 + LdsInitCVgprs: false + LdsNumBytes: 58880 + 
LdsNumElementsAlignedA: 8192 + LdsNumElementsAlignedB: 16384 + LdsNumElementsAlignedMXSA: 512 + LdsNumElementsAlignedMXSB: 1024 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 32768 + LdsOffsetB: 9728 + LdsOffsetB_Blk: 42496 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMXSA: 8192 + LdsOffsetMXSA_Blk: 40960 + LdsOffsetMXSB: 8704 + LdsOffsetMXSB_Blk: 41472 + LdsOffsetMetadata: 9728 + LdsOffsetMetadata_Blk: 42496 + LdsPadA: 0 + LdsPadB: 0 + LdsPadMXSA: 0 + LdsPadMXSB: 0 + LdsPadMetadata: 0 + LocalReadVectorWidth: -1 + LocalReadVectorWidthA: 32 + LocalReadVectorWidthB: 32 + LocalReadVectorWidthMXSA: 1 + LocalReadVectorWidthMXSB: 1 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LocalWriteUseSgprMXSA: true + LocalWriteUseSgprMXSB: true + LoopIters: 2 + LoopUnroll: 256 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMXSA: 1 + MIInputPerThreadMXSB: 1 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [2, 4] + MIWaveTileA: 2 + MIWaveTileB: 4 + MIWaveTileMXSA: 2 + MIWaveTileMXSB: 4 + MIWaveTileMetadata: 0 + MacroTile0: 64 + MacroTile1: 128 + MacroTileA: 64 + MacroTileB: 128 + MacroTileMXSA: 64 + MacroTileMXSB: 128 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + MinGRIncPerMfma: 3 + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonDTLTailLoopMXSA: true + NonDTLTailLoopMXSB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 4 + NonTemporalE: 0 + 
NonTemporalMXSA: 0 + NonTemporalMXSB: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 32 + NumLdsBlk: 2 + NumLoadsA: 2 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsCoalescedMXSA: 1 + NumLoadsCoalescedMXSB: 1 + NumLoadsMXSA: 1 + NumLoadsMXSB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 4 + NumLoadsPerpendicularMXSA: 1 + NumLoadsPerpendicularMXSB: 1 + NumThreads: 256 + NumTotalPackedLoadsA: -1 + NumTotalPackedLoadsB: -1 + NumTotalPackedLoadsMXSA: -1 + NumTotalPackedLoadsMXSB: -1 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGROverBarrier: 0 + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 4 + SolutionNameMin: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT64x128x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT2_4_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM16_WGMXCC2_WGMXCCGn1 + SourceSwap: false + SpaceFillingAlgo: [] + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 4 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SubGroupMXSA: 8 + 
SubGroupMXSB: 32 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: 0 + TailloopInNll: false + ThreadTile: [1, 1] + ThreadTile0: 8 + ThreadTile1: 4 + ThreadTileA: 8 + ThreadTileB: 4 + ThreadTileMXSA: 8 + ThreadTileMXSB: 4 + TransposeLDS: 1 + TransposeLDSMetadata: -1 + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMXSA: true + UnrollMajorLDSMXSB: true + Use64bShadowLimit: false + Use64bShadowLimitMX: false + UseCustomMainLoopSchedule: 0 + UseDirect32XEmulation: false + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: false + UseGeneralizedNLCOneA: false + UseGeneralizedNLCOneB: false + UseGeneralizedNLCOneMXSA: false + UseGeneralizedNLCOneMXSB: false + UseGeneralizedNLCOneMetadata: false + UseInstOffsetForGRO: 0 + UseMFMAF32XEmulation: false + UsePLRPack: 0 + UseSgprForGRO: -1 + UseSubtileImpl: true + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 1 + VectorWidthMXSA: 1 + VectorWidthMXSB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMXSA: 0 + WaveSeparateGlobalReadMXSB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 16 + WorkGroupMappingXCC: 2 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 256 + _DepthUA: 256 + _DepthUB: 256 + _DepthUMXSA: 8 + _DepthUMXSB: 8 + _DepthUMetadata: 256 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + enableLDSTrMXSA: false + enableLDSTrMXSB: false + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + tailLoopOptMXSA: false + tailLoopOptMXSB: false + - 1LDSBuffer: 0 + ActivationAlt: false + 
ActivationFuncCall: false + ActivationFused: true + AdaptiveGemm: 0 + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1DivByMT1LowbitGT1: 0 + AssertFree1ElementMultiple: 1 + AssertKRingShiftTailWrapOnly: 0 + AssertSummationElementMultiple: 256 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BAddrInterleave: false + BaseName: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT12ZclQEHdRRO1sBJL3TwidM33yIt-WYnDtDgY9P9JYRX4= + BufferLoad: true + BufferStore: 1 + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 256 + DirectToLds: 1 + DirectToLdsA: true + DirectToLdsB: true + DirectToLdsMXSA: true + DirectToLdsMXSB: true + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprMXSA: false + DirectToVgprMXSB: false + DirectToVgprSparseMetadata: false + DtlPlusLdsBuf: 0 + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: false + ExpertSchedulingMode: 0 + ExtraLatencyForLR: 0 + ExtraMiLatencyLeft: -1 + ForceDisableShadowInit: false + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 32 + GlobalReadVectorWidthB: 32 + GlobalReadVectorWidthMXSA: 4 + GlobalReadVectorWidthMXSB: 2 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + KRingShift: false + Kernel: true + KernelLanguage: Assembly + KernelNameMin: 
Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT128x64x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT4_2_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 256 + LSCB: 256 + LSCMXSA: 8 + LSCMXSB: 8 + LSPA: 32 + LSPB: 32 + LSPMXSA: 128 + LSPMXSB: 64 + LVCA: 8 + LVCB: 8 + LVCMXSA: 2 + LVCMXSB: 4 + LVPA: 1 + LVPB: 1 + LVPMXSA: 32 + LVPMXSB: 32 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsBlockSizePerPadMXSA: 0 + LdsBlockSizePerPadMXSB: 0 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 58880 + LdsInitCVgprs: false + LdsNumBytes: 58880 + LdsNumElementsAlignedA: 16384 + LdsNumElementsAlignedB: 8192 + LdsNumElementsAlignedMXSA: 1024 + LdsNumElementsAlignedMXSB: 512 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 32768 + LdsOffsetB: 17920 + LdsOffsetB_Blk: 50688 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMXSA: 16384 + LdsOffsetMXSA_Blk: 49152 + LdsOffsetMXSB: 17408 + LdsOffsetMXSB_Blk: 50176 + LdsOffsetMetadata: 17920 + LdsOffsetMetadata_Blk: 50688 + LdsPadA: 0 + LdsPadB: 0 + LdsPadMXSA: 0 + LdsPadMXSB: 0 + LdsPadMetadata: 0 + LocalReadVectorWidth: -1 + LocalReadVectorWidthA: 32 + LocalReadVectorWidthB: 32 + LocalReadVectorWidthMXSA: 1 + LocalReadVectorWidthMXSB: 1 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LocalWriteUseSgprMXSA: true + LocalWriteUseSgprMXSB: true + LoopIters: 2 + LoopUnroll: 256 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + 
MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMXSA: 1 + MIInputPerThreadMXSB: 1 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 2] + MIWaveTileA: 4 + MIWaveTileB: 2 + MIWaveTileMXSA: 4 + MIWaveTileMXSB: 2 + MIWaveTileMetadata: 0 + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileMXSA: 128 + MacroTileMXSB: 64 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + MinGRIncPerMfma: 3 + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonDTLTailLoopMXSA: true + NonDTLTailLoopMXSB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 4 + NonTemporalE: 0 + NonTemporalMXSA: 0 + NonTemporalMXSB: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 32 + NumLdsBlk: 2 + NumLoadsA: 4 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsCoalescedMXSA: 1 + NumLoadsCoalescedMXSB: 1 + NumLoadsMXSA: 1 + NumLoadsMXSB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularMXSA: 1 + NumLoadsPerpendicularMXSB: 1 + NumThreads: 256 + NumTotalPackedLoadsA: -1 + NumTotalPackedLoadsB: -1 + NumTotalPackedLoadsMXSA: -1 + NumTotalPackedLoadsMXSB: -1 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGROverBarrier: 0 + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 5 + SolutionNameMin: 
Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT128x64x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT4_2_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM16_WGMXCC2_WGMXCCGn1 + SourceSwap: false + SpaceFillingAlgo: [] + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 4 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SubGroupMXSA: 8 + SubGroupMXSB: 32 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: 0 + TailloopInNll: false + ThreadTile: [1, 1] + ThreadTile0: 16 + ThreadTile1: 2 + ThreadTileA: 16 + ThreadTileB: 2 + ThreadTileMXSA: 16 + ThreadTileMXSB: 2 + TransposeLDS: 1 + TransposeLDSMetadata: -1 + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMXSA: true + UnrollMajorLDSMXSB: true + Use64bShadowLimit: false + Use64bShadowLimitMX: false + UseCustomMainLoopSchedule: 0 + UseDirect32XEmulation: false + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: false + UseGeneralizedNLCOneA: false + UseGeneralizedNLCOneB: false + UseGeneralizedNLCOneMXSA: false + UseGeneralizedNLCOneMXSB: false + UseGeneralizedNLCOneMetadata: false + UseInstOffsetForGRO: 0 + UseMFMAF32XEmulation: false + UsePLRPack: 0 + UseSgprForGRO: -1 + UseSubtileImpl: true + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 1 + 
VectorWidthMXSA: 1 + VectorWidthMXSB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMXSA: 0 + WaveSeparateGlobalReadMXSB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 16 + WorkGroupMappingXCC: 2 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 256 + _DepthUA: 256 + _DepthUB: 256 + _DepthUMXSA: 8 + _DepthUMXSB: 8 + _DepthUMetadata: 256 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + enableLDSTrMXSA: false + enableLDSTrMXSB: false + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + tailLoopOptMXSA: false + tailLoopOptMXSB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AdaptiveGemm: 0 + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1DivByMT1LowbitGT1: 0 + AssertFree1ElementMultiple: 1 + AssertKRingShiftTailWrapOnly: 0 + AssertSummationElementMultiple: 256 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BAddrInterleave: false + BaseName: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT64hJRTC6QzyytWkHaKB_tyuSMxvHfrHLHmDLGhNVVcILE= + BufferLoad: true + BufferStore: 1 + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 256 + DirectToLds: 1 + DirectToLdsA: true + DirectToLdsB: true + DirectToLdsMXSA: true + DirectToLdsMXSB: true + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprMXSA: false + DirectToVgprMXSB: false + DirectToVgprSparseMetadata: false + DtlPlusLdsBuf: 0 
+ EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: false + ExpertSchedulingMode: 0 + ExtraLatencyForLR: 0 + ExtraMiLatencyLeft: -1 + ForceDisableShadowInit: false + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 32 + GlobalReadVectorWidthB: 32 + GlobalReadVectorWidthMXSA: 2 + GlobalReadVectorWidthMXSB: 6 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + KRingShift: false + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT64x192x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT2_6_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 256 + LSCB: 256 + LSCMXSA: 8 + LSCMXSB: 8 + LSPA: 32 + LSPB: 32 + LSPMXSA: 64 + LSPMXSB: 192 + LVCA: 8 + LVCB: 8 + LVCMXSA: 4 + LVCMXSB: 2 + LVPA: 1 + LVPB: 1 + LVPMXSA: 32 + LVPMXSB: 32 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsBlockSizePerPadMXSA: 0 + LdsBlockSizePerPadMXSB: 0 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 100352 + LdsInitCVgprs: false + LdsNumBytes: 100352 + LdsNumElementsAlignedA: 8192 + 
LdsNumElementsAlignedB: 24576 + LdsNumElementsAlignedMXSA: 512 + LdsNumElementsAlignedMXSB: 1536 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 10240 + LdsOffsetB_Blk: 75776 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMXSA: 8192 + LdsOffsetMXSA_Blk: 73728 + LdsOffsetMXSB: 8704 + LdsOffsetMXSB_Blk: 74240 + LdsOffsetMetadata: 10240 + LdsOffsetMetadata_Blk: 75776 + LdsPadA: 0 + LdsPadB: 0 + LdsPadMXSA: 0 + LdsPadMXSB: 0 + LdsPadMetadata: 0 + LocalReadVectorWidth: -1 + LocalReadVectorWidthA: 32 + LocalReadVectorWidthB: 32 + LocalReadVectorWidthMXSA: 1 + LocalReadVectorWidthMXSB: 1 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LocalWriteUseSgprMXSA: true + LocalWriteUseSgprMXSB: true + LoopIters: 2 + LoopUnroll: 256 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMXSA: 1 + MIInputPerThreadMXSB: 1 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [2, 6] + MIWaveTileA: 2 + MIWaveTileB: 6 + MIWaveTileMXSA: 2 + MIWaveTileMXSB: 6 + MIWaveTileMetadata: 0 + MacroTile0: 64 + MacroTile1: 192 + MacroTileA: 64 + MacroTileB: 192 + MacroTileMXSA: 64 + MacroTileMXSB: 192 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + MinGRIncPerMfma: 3 + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonDTLTailLoopMXSA: true + NonDTLTailLoopMXSB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 4 + NonTemporalE: 0 + NonTemporalMXSA: 0 + 
NonTemporalMXSB: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 48 + NumGlobalWriteVectorsPerThread: 48 + NumLdsBlk: 2 + NumLoadsA: 2 + NumLoadsB: 6 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsCoalescedMXSA: 1 + NumLoadsCoalescedMXSB: 1 + NumLoadsMXSA: 1 + NumLoadsMXSB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 6 + NumLoadsPerpendicularMXSA: 1 + NumLoadsPerpendicularMXSB: 1 + NumThreads: 256 + NumTotalPackedLoadsA: -1 + NumTotalPackedLoadsB: -1 + NumTotalPackedLoadsMXSA: -1 + NumTotalPackedLoadsMXSB: -1 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGROverBarrier: 0 + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 6 + SolutionNameMin: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT64x192x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT2_6_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM16_WGMXCC2_WGMXCCGn1 + SourceSwap: false + SpaceFillingAlgo: [] + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 4 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SubGroupMXSA: 8 + SubGroupMXSB: 32 + 
SuppressNoLoadLoop: false + SwapGlobalReadOrder: 0 + TailloopInNll: false + ThreadTile: [1, 1] + ThreadTile0: 8 + ThreadTile1: 6 + ThreadTileA: 8 + ThreadTileB: 6 + ThreadTileMXSA: 8 + ThreadTileMXSB: 6 + TransposeLDS: 1 + TransposeLDSMetadata: -1 + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMXSA: true + UnrollMajorLDSMXSB: true + Use64bShadowLimit: false + Use64bShadowLimitMX: false + UseCustomMainLoopSchedule: 0 + UseDirect32XEmulation: false + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: false + UseGeneralizedNLCOneA: false + UseGeneralizedNLCOneB: false + UseGeneralizedNLCOneMXSA: false + UseGeneralizedNLCOneMXSB: false + UseGeneralizedNLCOneMetadata: false + UseInstOffsetForGRO: 0 + UseMFMAF32XEmulation: false + UsePLRPack: 0 + UseSgprForGRO: -1 + UseSubtileImpl: true + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 1 + VectorWidthMXSA: 1 + VectorWidthMXSB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMXSA: 0 + WaveSeparateGlobalReadMXSB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 16 + WorkGroupMappingXCC: 2 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 256 + _DepthUA: 256 + _DepthUB: 256 + _DepthUMXSA: 8 + _DepthUMXSB: 8 + _DepthUMetadata: 256 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + enableLDSTrMXSA: false + enableLDSTrMXSB: false + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + tailLoopOptMXSA: false + tailLoopOptMXSB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: false + 
ActivationFused: true + AdaptiveGemm: 0 + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1DivByMT1LowbitGT1: 0 + AssertFree1ElementMultiple: 1 + AssertKRingShiftTailWrapOnly: 0 + AssertSummationElementMultiple: 256 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BAddrInterleave: false + BaseName: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT19UCUD2vak2YLHvH_Ztg5zXPDJUlX3NzBJWRuGhvKVzu8= + BufferLoad: true + BufferStore: 1 + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 256 + DirectToLds: 1 + DirectToLdsA: true + DirectToLdsB: true + DirectToLdsMXSA: true + DirectToLdsMXSB: true + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprMXSA: false + DirectToVgprMXSB: false + DirectToVgprSparseMetadata: false + DtlPlusLdsBuf: 0 + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: false + ExpertSchedulingMode: 0 + ExtraLatencyForLR: 0 + ExtraMiLatencyLeft: -1 + ForceDisableShadowInit: false + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 32 + GlobalReadVectorWidthB: 32 + GlobalReadVectorWidthMXSA: 6 + GlobalReadVectorWidthMXSB: 2 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + KRingShift: false + Kernel: true + KernelLanguage: Assembly + KernelNameMin: 
Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT192x64x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT6_2_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 256 + LSCB: 256 + LSCMXSA: 8 + LSCMXSB: 8 + LSPA: 32 + LSPB: 32 + LSPMXSA: 192 + LSPMXSB: 64 + LVCA: 8 + LVCB: 8 + LVCMXSA: 2 + LVCMXSB: 4 + LVPA: 1 + LVPB: 1 + LVPMXSA: 32 + LVPMXSB: 32 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsBlockSizePerPadMXSA: 0 + LdsBlockSizePerPadMXSB: 0 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 100352 + LdsInitCVgprs: false + LdsNumBytes: 100352 + LdsNumElementsAlignedA: 24576 + LdsNumElementsAlignedB: 8192 + LdsNumElementsAlignedMXSA: 1536 + LdsNumElementsAlignedMXSB: 512 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 26624 + LdsOffsetB_Blk: 92160 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMXSA: 24576 + LdsOffsetMXSA_Blk: 90112 + LdsOffsetMXSB: 26112 + LdsOffsetMXSB_Blk: 91648 + LdsOffsetMetadata: 26624 + LdsOffsetMetadata_Blk: 92160 + LdsPadA: 0 + LdsPadB: 0 + LdsPadMXSA: 0 + LdsPadMXSB: 0 + LdsPadMetadata: 0 + LocalReadVectorWidth: -1 + LocalReadVectorWidthA: 32 + LocalReadVectorWidthB: 32 + LocalReadVectorWidthMXSA: 1 + LocalReadVectorWidthMXSB: 1 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LocalWriteUseSgprMXSA: true + LocalWriteUseSgprMXSB: true + LoopIters: 2 + LoopUnroll: 256 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + 
MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMXSA: 1 + MIInputPerThreadMXSB: 1 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [6, 2] + MIWaveTileA: 6 + MIWaveTileB: 2 + MIWaveTileMXSA: 6 + MIWaveTileMXSB: 2 + MIWaveTileMetadata: 0 + MacroTile0: 192 + MacroTile1: 64 + MacroTileA: 192 + MacroTileB: 64 + MacroTileMXSA: 192 + MacroTileMXSB: 64 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + MinGRIncPerMfma: 3 + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonDTLTailLoopMXSA: true + NonDTLTailLoopMXSB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 4 + NonTemporalE: 0 + NonTemporalMXSA: 0 + NonTemporalMXSB: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 48 + NumGlobalWriteVectorsPerThread: 48 + NumLdsBlk: 2 + NumLoadsA: 6 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsCoalescedMXSA: 1 + NumLoadsCoalescedMXSB: 1 + NumLoadsMXSA: 1 + NumLoadsMXSB: 1 + NumLoadsPerpendicularA: 6 + NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularMXSA: 1 + NumLoadsPerpendicularMXSB: 1 + NumThreads: 256 + NumTotalPackedLoadsA: -1 + NumTotalPackedLoadsB: -1 + NumTotalPackedLoadsMXSA: -1 + NumTotalPackedLoadsMXSB: -1 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGROverBarrier: 0 + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 7 + SolutionNameMin: 
Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT192x64x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT6_2_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM16_WGMXCC2_WGMXCCGn1 + SourceSwap: false + SpaceFillingAlgo: [] + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 4 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SubGroupMXSA: 8 + SubGroupMXSB: 32 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: 0 + TailloopInNll: false + ThreadTile: [1, 1] + ThreadTile0: 24 + ThreadTile1: 2 + ThreadTileA: 24 + ThreadTileB: 2 + ThreadTileMXSA: 24 + ThreadTileMXSB: 2 + TransposeLDS: 1 + TransposeLDSMetadata: -1 + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMXSA: true + UnrollMajorLDSMXSB: true + Use64bShadowLimit: false + Use64bShadowLimitMX: false + UseCustomMainLoopSchedule: 0 + UseDirect32XEmulation: false + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: false + UseGeneralizedNLCOneA: false + UseGeneralizedNLCOneB: false + UseGeneralizedNLCOneMXSA: false + UseGeneralizedNLCOneMXSB: false + UseGeneralizedNLCOneMetadata: false + UseInstOffsetForGRO: 0 + UseMFMAF32XEmulation: false + UsePLRPack: 0 + UseSgprForGRO: -1 + UseSubtileImpl: true + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 1 + 
VectorWidthMXSA: 1 + VectorWidthMXSB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMXSA: 0 + WaveSeparateGlobalReadMXSB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 16 + WorkGroupMappingXCC: 2 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 256 + _DepthUA: 256 + _DepthUB: 256 + _DepthUMXSA: 8 + _DepthUMXSB: 8 + _DepthUMetadata: 256 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + enableLDSTrMXSA: false + enableLDSTrMXSB: false + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + tailLoopOptMXSA: false + tailLoopOptMXSB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AdaptiveGemm: 0 + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1DivByMT1LowbitGT1: 0 + AssertFree1ElementMultiple: 1 + AssertKRingShiftTailWrapOnly: 0 + AssertSummationElementMultiple: 256 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BAddrInterleave: false + BaseName: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT64sCK37FdA-O3yvoTtVtg3QFjEpjbAv0RNmVuI_Ry968A= + BufferLoad: true + BufferStore: 1 + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 256 + DirectToLds: 1 + DirectToLdsA: true + DirectToLdsB: true + DirectToLdsMXSA: true + DirectToLdsMXSB: true + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprMXSA: false + DirectToVgprMXSB: false + DirectToVgprSparseMetadata: false + DtlPlusLdsBuf: 0 
+ EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: false + ExpertSchedulingMode: 0 + ExtraLatencyForLR: 0 + ExtraMiLatencyLeft: -1 + ForceDisableShadowInit: false + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 32 + GlobalReadVectorWidthB: 32 + GlobalReadVectorWidthMXSA: 2 + GlobalReadVectorWidthMXSB: 4 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + KRingShift: false + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT64x256x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT2_8_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 256 + LSCB: 256 + LSCMXSA: 8 + LSCMXSB: 8 + LSPA: 32 + LSPB: 32 + LSPMXSA: 64 + LSPMXSB: 128 + LVCA: 8 + LVCB: 8 + LVCMXSA: 4 + LVCMXSB: 2 + LVPA: 1 + LVPB: 1 + LVPMXSA: 32 + LVPMXSB: 32 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsBlockSizePerPadMXSA: 0 + LdsBlockSizePerPadMXSB: 0 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 109056 + LdsInitCVgprs: false + LdsNumBytes: 109056 + LdsNumElementsAlignedA: 8192 + 
LdsNumElementsAlignedB: 32768 + LdsNumElementsAlignedMXSA: 512 + LdsNumElementsAlignedMXSB: 2048 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 10752 + LdsOffsetB_Blk: 76288 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMXSA: 8192 + LdsOffsetMXSA_Blk: 73728 + LdsOffsetMXSB: 8704 + LdsOffsetMXSB_Blk: 74240 + LdsOffsetMetadata: 10752 + LdsOffsetMetadata_Blk: 76288 + LdsPadA: 0 + LdsPadB: 0 + LdsPadMXSA: 0 + LdsPadMXSB: 0 + LdsPadMetadata: 0 + LocalReadVectorWidth: -1 + LocalReadVectorWidthA: 32 + LocalReadVectorWidthB: 32 + LocalReadVectorWidthMXSA: 1 + LocalReadVectorWidthMXSB: 1 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LocalWriteUseSgprMXSA: true + LocalWriteUseSgprMXSB: true + LoopIters: 2 + LoopUnroll: 256 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMXSA: 1 + MIInputPerThreadMXSB: 1 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [2, 8] + MIWaveTileA: 2 + MIWaveTileB: 8 + MIWaveTileMXSA: 2 + MIWaveTileMXSB: 8 + MIWaveTileMetadata: 0 + MacroTile0: 64 + MacroTile1: 256 + MacroTileA: 64 + MacroTileB: 256 + MacroTileMXSA: 64 + MacroTileMXSB: 256 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + MinGRIncPerMfma: 3 + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonDTLTailLoopMXSA: true + NonDTLTailLoopMXSB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 4 + NonTemporalE: 0 + NonTemporalMXSA: 0 + 
NonTemporalMXSB: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 64 + NumLdsBlk: 2 + NumLoadsA: 2 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsCoalescedMXSA: 1 + NumLoadsCoalescedMXSB: 1 + NumLoadsMXSA: 1 + NumLoadsMXSB: 2 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 8 + NumLoadsPerpendicularMXSA: 1 + NumLoadsPerpendicularMXSB: 2 + NumThreads: 256 + NumTotalPackedLoadsA: -1 + NumTotalPackedLoadsB: -1 + NumTotalPackedLoadsMXSA: -1 + NumTotalPackedLoadsMXSB: -1 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGROverBarrier: 0 + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 8 + SolutionNameMin: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT64x256x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT2_8_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM16_WGMXCC2_WGMXCCGn1 + SourceSwap: false + SpaceFillingAlgo: [] + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 4 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SubGroupMXSA: 8 + SubGroupMXSB: 32 + 
SuppressNoLoadLoop: false + SwapGlobalReadOrder: 0 + TailloopInNll: false + ThreadTile: [1, 1] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + ThreadTileMXSA: 8 + ThreadTileMXSB: 8 + TransposeLDS: 1 + TransposeLDSMetadata: -1 + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMXSA: true + UnrollMajorLDSMXSB: true + Use64bShadowLimit: false + Use64bShadowLimitMX: false + UseCustomMainLoopSchedule: 0 + UseDirect32XEmulation: false + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: false + UseGeneralizedNLCOneA: false + UseGeneralizedNLCOneB: false + UseGeneralizedNLCOneMXSA: false + UseGeneralizedNLCOneMXSB: false + UseGeneralizedNLCOneMetadata: false + UseInstOffsetForGRO: 0 + UseMFMAF32XEmulation: false + UsePLRPack: 0 + UseSgprForGRO: -1 + UseSubtileImpl: true + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 1 + VectorWidthMXSA: 1 + VectorWidthMXSB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMXSA: 0 + WaveSeparateGlobalReadMXSB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 16 + WorkGroupMappingXCC: 2 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 256 + _DepthUA: 256 + _DepthUB: 256 + _DepthUMXSA: 8 + _DepthUMXSB: 8 + _DepthUMetadata: 256 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + enableLDSTrMXSA: false + enableLDSTrMXSB: false + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + tailLoopOptMXSA: false + tailLoopOptMXSB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: false + 
ActivationFused: true + AdaptiveGemm: 0 + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1DivByMT1LowbitGT1: 0 + AssertFree1ElementMultiple: 1 + AssertKRingShiftTailWrapOnly: 0 + AssertSummationElementMultiple: 256 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BAddrInterleave: false + BaseName: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT25Tmym63Cp1OVwmuklAuBYLzHtSt0oUezVbK-tmV8hqYs= + BufferLoad: true + BufferStore: 1 + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 256 + DirectToLds: 1 + DirectToLdsA: true + DirectToLdsB: true + DirectToLdsMXSA: true + DirectToLdsMXSB: true + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprMXSA: false + DirectToVgprMXSB: false + DirectToVgprSparseMetadata: false + DtlPlusLdsBuf: 0 + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: false + ExpertSchedulingMode: 0 + ExtraLatencyForLR: 0 + ExtraMiLatencyLeft: -1 + ForceDisableShadowInit: false + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 32 + GlobalReadVectorWidthB: 32 + GlobalReadVectorWidthMXSA: 4 + GlobalReadVectorWidthMXSB: 2 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + KRingShift: false + Kernel: true + KernelLanguage: Assembly + KernelNameMin: 
Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT256x64x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT8_2_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 256 + LSCB: 256 + LSCMXSA: 8 + LSCMXSB: 8 + LSPA: 32 + LSPB: 32 + LSPMXSA: 128 + LSPMXSB: 64 + LVCA: 8 + LVCB: 8 + LVCMXSA: 2 + LVCMXSB: 4 + LVPA: 1 + LVPB: 1 + LVPMXSA: 32 + LVPMXSB: 32 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsBlockSizePerPadMXSA: 0 + LdsBlockSizePerPadMXSB: 0 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 109056 + LdsInitCVgprs: false + LdsNumBytes: 109056 + LdsNumElementsAlignedA: 32768 + LdsNumElementsAlignedB: 8192 + LdsNumElementsAlignedMXSA: 2048 + LdsNumElementsAlignedMXSB: 512 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 35328 + LdsOffsetB_Blk: 100864 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMXSA: 32768 + LdsOffsetMXSA_Blk: 98304 + LdsOffsetMXSB: 34816 + LdsOffsetMXSB_Blk: 100352 + LdsOffsetMetadata: 35328 + LdsOffsetMetadata_Blk: 100864 + LdsPadA: 0 + LdsPadB: 0 + LdsPadMXSA: 0 + LdsPadMXSB: 0 + LdsPadMetadata: 0 + LocalReadVectorWidth: -1 + LocalReadVectorWidthA: 32 + LocalReadVectorWidthB: 32 + LocalReadVectorWidthMXSA: 1 + LocalReadVectorWidthMXSB: 1 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LocalWriteUseSgprMXSA: true + LocalWriteUseSgprMXSB: true + LoopIters: 2 + LoopUnroll: 256 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + 
MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMXSA: 1 + MIInputPerThreadMXSB: 1 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [8, 2] + MIWaveTileA: 8 + MIWaveTileB: 2 + MIWaveTileMXSA: 8 + MIWaveTileMXSB: 2 + MIWaveTileMetadata: 0 + MacroTile0: 256 + MacroTile1: 64 + MacroTileA: 256 + MacroTileB: 64 + MacroTileMXSA: 256 + MacroTileMXSB: 64 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + MinGRIncPerMfma: 3 + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonDTLTailLoopMXSA: true + NonDTLTailLoopMXSB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 4 + NonTemporalE: 0 + NonTemporalMXSA: 0 + NonTemporalMXSB: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 64 + NumLdsBlk: 2 + NumLoadsA: 8 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsCoalescedMXSA: 1 + NumLoadsCoalescedMXSB: 1 + NumLoadsMXSA: 2 + NumLoadsMXSB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularMXSA: 2 + NumLoadsPerpendicularMXSB: 1 + NumThreads: 256 + NumTotalPackedLoadsA: -1 + NumTotalPackedLoadsB: -1 + NumTotalPackedLoadsMXSA: -1 + NumTotalPackedLoadsMXSB: -1 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGROverBarrier: 0 + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 9 + SolutionNameMin: 
Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT256x64x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT8_2_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM16_WGMXCC2_WGMXCCGn1 + SourceSwap: false + SpaceFillingAlgo: [] + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 4 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SubGroupMXSA: 8 + SubGroupMXSB: 32 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: 0 + TailloopInNll: false + ThreadTile: [1, 1] + ThreadTile0: 32 + ThreadTile1: 2 + ThreadTileA: 32 + ThreadTileB: 2 + ThreadTileMXSA: 32 + ThreadTileMXSB: 2 + TransposeLDS: 1 + TransposeLDSMetadata: -1 + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMXSA: true + UnrollMajorLDSMXSB: true + Use64bShadowLimit: false + Use64bShadowLimitMX: false + UseCustomMainLoopSchedule: 0 + UseDirect32XEmulation: false + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: false + UseGeneralizedNLCOneA: false + UseGeneralizedNLCOneB: false + UseGeneralizedNLCOneMXSA: false + UseGeneralizedNLCOneMXSB: false + UseGeneralizedNLCOneMetadata: false + UseInstOffsetForGRO: 0 + UseMFMAF32XEmulation: false + UsePLRPack: 0 + UseSgprForGRO: -1 + UseSubtileImpl: true + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 1 + 
VectorWidthMXSA: 1 + VectorWidthMXSB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMXSA: 0 + WaveSeparateGlobalReadMXSB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 16 + WorkGroupMappingXCC: 2 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 256 + _DepthUA: 256 + _DepthUB: 256 + _DepthUMXSA: 8 + _DepthUMXSB: 8 + _DepthUMetadata: 256 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + enableLDSTrMXSA: false + enableLDSTrMXSB: false + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + tailLoopOptMXSA: false + tailLoopOptMXSB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AdaptiveGemm: 0 + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1DivByMT1LowbitGT1: 0 + AssertFree1ElementMultiple: 1 + AssertKRingShiftTailWrapOnly: 0 + AssertSummationElementMultiple: 256 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BAddrInterleave: false + BaseName: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT12Qs31lTSyFaUbDqXQHGpUViUgkCHLEWWfNifVlN3xPcA= + BufferLoad: true + BufferStore: 1 + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 256 + DirectToLds: 1 + DirectToLdsA: true + DirectToLdsB: true + DirectToLdsMXSA: true + DirectToLdsMXSB: true + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprMXSA: false + DirectToVgprMXSB: false + DirectToVgprSparseMetadata: false + DtlPlusLdsBuf: 0 
+ EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: false + ExpertSchedulingMode: 0 + ExtraLatencyForLR: 0 + ExtraMiLatencyLeft: -1 + ForceDisableShadowInit: false + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 32 + GlobalReadVectorWidthB: 32 + GlobalReadVectorWidthMXSA: 4 + GlobalReadVectorWidthMXSB: 6 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + KRingShift: false + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT128x192x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT4_6_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 256 + LSCB: 256 + LSCMXSA: 8 + LSCMXSB: 8 + LSPA: 32 + LSPB: 32 + LSPMXSA: 128 + LSPMXSB: 192 + LVCA: 8 + LVCB: 8 + LVCMXSA: 2 + LVCMXSB: 2 + LVPA: 1 + LVPB: 1 + LVPMXSA: 32 + LVPMXSB: 32 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsBlockSizePerPadMXSA: 0 + LdsBlockSizePerPadMXSB: 0 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 109056 + LdsInitCVgprs: false + LdsNumBytes: 109056 + LdsNumElementsAlignedA: 16384 + 
LdsNumElementsAlignedB: 24576 + LdsNumElementsAlignedMXSA: 1024 + LdsNumElementsAlignedMXSB: 1536 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 18944 + LdsOffsetB_Blk: 84480 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMXSA: 16384 + LdsOffsetMXSA_Blk: 81920 + LdsOffsetMXSB: 17408 + LdsOffsetMXSB_Blk: 82944 + LdsOffsetMetadata: 18944 + LdsOffsetMetadata_Blk: 84480 + LdsPadA: 0 + LdsPadB: 0 + LdsPadMXSA: 0 + LdsPadMXSB: 0 + LdsPadMetadata: 0 + LocalReadVectorWidth: -1 + LocalReadVectorWidthA: 32 + LocalReadVectorWidthB: 32 + LocalReadVectorWidthMXSA: 1 + LocalReadVectorWidthMXSB: 1 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LocalWriteUseSgprMXSA: true + LocalWriteUseSgprMXSB: true + LoopIters: 2 + LoopUnroll: 256 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMXSA: 1 + MIInputPerThreadMXSB: 1 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 6] + MIWaveTileA: 4 + MIWaveTileB: 6 + MIWaveTileMXSA: 4 + MIWaveTileMXSB: 6 + MIWaveTileMetadata: 0 + MacroTile0: 128 + MacroTile1: 192 + MacroTileA: 128 + MacroTileB: 192 + MacroTileMXSA: 128 + MacroTileMXSB: 192 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + MinGRIncPerMfma: 3 + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonDTLTailLoopMXSA: true + NonDTLTailLoopMXSB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 4 + NonTemporalE: 0 + NonTemporalMXSA: 0 + 
NonTemporalMXSB: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 96 + NumGlobalWriteVectorsPerThread: 96 + NumLdsBlk: 2 + NumLoadsA: 4 + NumLoadsB: 6 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsCoalescedMXSA: 1 + NumLoadsCoalescedMXSB: 1 + NumLoadsMXSA: 1 + NumLoadsMXSB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 6 + NumLoadsPerpendicularMXSA: 1 + NumLoadsPerpendicularMXSB: 1 + NumThreads: 256 + NumTotalPackedLoadsA: -1 + NumTotalPackedLoadsB: -1 + NumTotalPackedLoadsMXSA: -1 + NumTotalPackedLoadsMXSB: -1 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGROverBarrier: 0 + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 10 + SolutionNameMin: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT128x192x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT4_6_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM16_WGMXCC2_WGMXCCGn1 + SourceSwap: false + SpaceFillingAlgo: [] + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 4 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SubGroupMXSA: 8 + SubGroupMXSB: 32 + 
SuppressNoLoadLoop: false + SwapGlobalReadOrder: 0 + TailloopInNll: false + ThreadTile: [1, 1] + ThreadTile0: 16 + ThreadTile1: 6 + ThreadTileA: 16 + ThreadTileB: 6 + ThreadTileMXSA: 16 + ThreadTileMXSB: 6 + TransposeLDS: 1 + TransposeLDSMetadata: -1 + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMXSA: true + UnrollMajorLDSMXSB: true + Use64bShadowLimit: false + Use64bShadowLimitMX: false + UseCustomMainLoopSchedule: 0 + UseDirect32XEmulation: false + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: false + UseGeneralizedNLCOneA: false + UseGeneralizedNLCOneB: false + UseGeneralizedNLCOneMXSA: false + UseGeneralizedNLCOneMXSB: false + UseGeneralizedNLCOneMetadata: false + UseInstOffsetForGRO: 0 + UseMFMAF32XEmulation: false + UsePLRPack: 0 + UseSgprForGRO: -1 + UseSubtileImpl: true + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 1 + VectorWidthMXSA: 1 + VectorWidthMXSB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMXSA: 0 + WaveSeparateGlobalReadMXSB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 16 + WorkGroupMappingXCC: 2 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 256 + _DepthUA: 256 + _DepthUB: 256 + _DepthUMXSA: 8 + _DepthUMXSB: 8 + _DepthUMetadata: 256 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + enableLDSTrMXSA: false + enableLDSTrMXSB: false + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + tailLoopOptMXSA: false + tailLoopOptMXSB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: false 
+ ActivationFused: true + AdaptiveGemm: 0 + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1DivByMT1LowbitGT1: 0 + AssertFree1ElementMultiple: 1 + AssertKRingShiftTailWrapOnly: 0 + AssertSummationElementMultiple: 256 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BAddrInterleave: false + BaseName: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT191Q3CWl85SU-4agRfSbxDDRYJaTIDmYF_adG8G4wdfNo= + BufferLoad: true + BufferStore: 1 + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 256 + DirectToLds: 1 + DirectToLdsA: true + DirectToLdsB: true + DirectToLdsMXSA: true + DirectToLdsMXSB: true + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprMXSA: false + DirectToVgprMXSB: false + DirectToVgprSparseMetadata: false + DtlPlusLdsBuf: 0 + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: false + ExpertSchedulingMode: 0 + ExtraLatencyForLR: 0 + ExtraMiLatencyLeft: -1 + ForceDisableShadowInit: false + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 32 + GlobalReadVectorWidthB: 32 + GlobalReadVectorWidthMXSA: 6 + GlobalReadVectorWidthMXSB: 4 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + KRingShift: false + Kernel: true + KernelLanguage: Assembly + KernelNameMin: 
Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT192x128x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT6_4_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 256 + LSCB: 256 + LSCMXSA: 8 + LSCMXSB: 8 + LSPA: 32 + LSPB: 32 + LSPMXSA: 192 + LSPMXSB: 128 + LVCA: 8 + LVCB: 8 + LVCMXSA: 2 + LVCMXSB: 2 + LVPA: 1 + LVPB: 1 + LVPMXSA: 32 + LVPMXSB: 32 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsBlockSizePerPadMXSA: 0 + LdsBlockSizePerPadMXSB: 0 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 109056 + LdsInitCVgprs: false + LdsNumBytes: 109056 + LdsNumElementsAlignedA: 24576 + LdsNumElementsAlignedB: 16384 + LdsNumElementsAlignedMXSA: 1536 + LdsNumElementsAlignedMXSB: 1024 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 27136 + LdsOffsetB_Blk: 92672 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMXSA: 24576 + LdsOffsetMXSA_Blk: 90112 + LdsOffsetMXSB: 26112 + LdsOffsetMXSB_Blk: 91648 + LdsOffsetMetadata: 27136 + LdsOffsetMetadata_Blk: 92672 + LdsPadA: 0 + LdsPadB: 0 + LdsPadMXSA: 0 + LdsPadMXSB: 0 + LdsPadMetadata: 0 + LocalReadVectorWidth: -1 + LocalReadVectorWidthA: 32 + LocalReadVectorWidthB: 32 + LocalReadVectorWidthMXSA: 1 + LocalReadVectorWidthMXSB: 1 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LocalWriteUseSgprMXSA: true + LocalWriteUseSgprMXSB: true + LoopIters: 2 + LoopUnroll: 256 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + 
MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMXSA: 1 + MIInputPerThreadMXSB: 1 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [6, 4] + MIWaveTileA: 6 + MIWaveTileB: 4 + MIWaveTileMXSA: 6 + MIWaveTileMXSB: 4 + MIWaveTileMetadata: 0 + MacroTile0: 192 + MacroTile1: 128 + MacroTileA: 192 + MacroTileB: 128 + MacroTileMXSA: 192 + MacroTileMXSB: 128 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + MinGRIncPerMfma: 3 + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonDTLTailLoopMXSA: true + NonDTLTailLoopMXSB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 4 + NonTemporalE: 0 + NonTemporalMXSA: 0 + NonTemporalMXSB: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 96 + NumGlobalWriteVectorsPerThread: 96 + NumLdsBlk: 2 + NumLoadsA: 6 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsCoalescedMXSA: 1 + NumLoadsCoalescedMXSB: 1 + NumLoadsMXSA: 1 + NumLoadsMXSB: 1 + NumLoadsPerpendicularA: 6 + NumLoadsPerpendicularB: 4 + NumLoadsPerpendicularMXSA: 1 + NumLoadsPerpendicularMXSB: 1 + NumThreads: 256 + NumTotalPackedLoadsA: -1 + NumTotalPackedLoadsB: -1 + NumTotalPackedLoadsMXSA: -1 + NumTotalPackedLoadsMXSB: -1 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGROverBarrier: 0 + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 11 + 
SolutionNameMin: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT192x128x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT6_4_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM16_WGMXCC2_WGMXCCGn1 + SourceSwap: false + SpaceFillingAlgo: [] + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 4 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SubGroupMXSA: 8 + SubGroupMXSB: 32 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: 0 + TailloopInNll: false + ThreadTile: [1, 1] + ThreadTile0: 24 + ThreadTile1: 4 + ThreadTileA: 24 + ThreadTileB: 4 + ThreadTileMXSA: 24 + ThreadTileMXSB: 4 + TransposeLDS: 1 + TransposeLDSMetadata: -1 + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMXSA: true + UnrollMajorLDSMXSB: true + Use64bShadowLimit: false + Use64bShadowLimitMX: false + UseCustomMainLoopSchedule: 0 + UseDirect32XEmulation: false + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: false + UseGeneralizedNLCOneA: false + UseGeneralizedNLCOneB: false + UseGeneralizedNLCOneMXSA: false + UseGeneralizedNLCOneMXSB: false + UseGeneralizedNLCOneMetadata: false + UseInstOffsetForGRO: 0 + UseMFMAF32XEmulation: false + UsePLRPack: 0 + UseSgprForGRO: -1 + UseSubtileImpl: true + Valid: true + VectorStore: -1 + VectorWidthA: 1 + 
VectorWidthB: 1 + VectorWidthMXSA: 1 + VectorWidthMXSB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMXSA: 0 + WaveSeparateGlobalReadMXSB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 16 + WorkGroupMappingXCC: 2 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 256 + _DepthUA: 256 + _DepthUB: 256 + _DepthUMXSA: 8 + _DepthUMXSB: 8 + _DepthUMetadata: 256 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + enableLDSTrMXSA: false + enableLDSTrMXSB: false + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + tailLoopOptMXSA: false + tailLoopOptMXSB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AdaptiveGemm: 0 + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1DivByMT1LowbitGT1: 0 + AssertFree1ElementMultiple: 1 + AssertKRingShiftTailWrapOnly: 0 + AssertSummationElementMultiple: 256 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BAddrInterleave: false + BaseName: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT12kwdCHpnp9d41ufLMEwnWbNNXbmlKzUrCttuc0YSlgSY= + BufferLoad: true + BufferStore: 1 + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 256 + DirectToLds: 1 + DirectToLdsA: true + DirectToLdsB: true + DirectToLdsMXSA: true + DirectToLdsMXSB: true + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprMXSA: false + DirectToVgprMXSB: false + DirectToVgprSparseMetadata: false + 
DtlPlusLdsBuf: 0 + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: false + ExpertSchedulingMode: 0 + ExtraLatencyForLR: 0 + ExtraMiLatencyLeft: -1 + ForceDisableShadowInit: false + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 32 + GlobalReadVectorWidthB: 32 + GlobalReadVectorWidthMXSA: 4 + GlobalReadVectorWidthMXSB: 4 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + KRingShift: false + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT128x256x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT4_8_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 256 + LSCB: 256 + LSCMXSA: 8 + LSCMXSB: 8 + LSPA: 32 + LSPB: 32 + LSPMXSA: 128 + LSPMXSB: 128 + LVCA: 8 + LVCB: 8 + LVCMXSA: 2 + LVCMXSB: 2 + LVPA: 1 + LVPB: 1 + LVPMXSA: 32 + LVPMXSB: 32 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsBlockSizePerPadMXSA: 0 + LdsBlockSizePerPadMXSB: 0 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 117760 + LdsInitCVgprs: false + LdsNumBytes: 117760 + 
LdsNumElementsAlignedA: 16384 + LdsNumElementsAlignedB: 32768 + LdsNumElementsAlignedMXSA: 1024 + LdsNumElementsAlignedMXSB: 2048 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 19456 + LdsOffsetB_Blk: 84992 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMXSA: 16384 + LdsOffsetMXSA_Blk: 81920 + LdsOffsetMXSB: 17408 + LdsOffsetMXSB_Blk: 82944 + LdsOffsetMetadata: 19456 + LdsOffsetMetadata_Blk: 84992 + LdsPadA: 0 + LdsPadB: 0 + LdsPadMXSA: 0 + LdsPadMXSB: 0 + LdsPadMetadata: 0 + LocalReadVectorWidth: -1 + LocalReadVectorWidthA: 32 + LocalReadVectorWidthB: 32 + LocalReadVectorWidthMXSA: 1 + LocalReadVectorWidthMXSB: 1 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LocalWriteUseSgprMXSA: true + LocalWriteUseSgprMXSB: true + LoopIters: 2 + LoopUnroll: 256 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMXSA: 1 + MIInputPerThreadMXSB: 1 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 8] + MIWaveTileA: 4 + MIWaveTileB: 8 + MIWaveTileMXSA: 4 + MIWaveTileMXSB: 8 + MIWaveTileMetadata: 0 + MacroTile0: 128 + MacroTile1: 256 + MacroTileA: 128 + MacroTileB: 256 + MacroTileMXSA: 128 + MacroTileMXSB: 256 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + MinGRIncPerMfma: 3 + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonDTLTailLoopMXSA: true + NonDTLTailLoopMXSB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 4 + NonTemporalE: 
0 + NonTemporalMXSA: 0 + NonTemporalMXSB: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 128 + NumLdsBlk: 2 + NumLoadsA: 4 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsCoalescedMXSA: 1 + NumLoadsCoalescedMXSB: 1 + NumLoadsMXSA: 1 + NumLoadsMXSB: 2 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 8 + NumLoadsPerpendicularMXSA: 1 + NumLoadsPerpendicularMXSB: 2 + NumThreads: 256 + NumTotalPackedLoadsA: -1 + NumTotalPackedLoadsB: -1 + NumTotalPackedLoadsMXSA: -1 + NumTotalPackedLoadsMXSB: -1 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGROverBarrier: 0 + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 12 + SolutionNameMin: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT128x256x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT4_8_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM16_WGMXCC2_WGMXCCGn1 + SourceSwap: false + SpaceFillingAlgo: [] + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 4 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + 
SubGroupMXSA: 8 + SubGroupMXSB: 32 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: 0 + TailloopInNll: false + ThreadTile: [1, 1] + ThreadTile0: 16 + ThreadTile1: 8 + ThreadTileA: 16 + ThreadTileB: 8 + ThreadTileMXSA: 16 + ThreadTileMXSB: 8 + TransposeLDS: 1 + TransposeLDSMetadata: -1 + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMXSA: true + UnrollMajorLDSMXSB: true + Use64bShadowLimit: false + Use64bShadowLimitMX: false + UseCustomMainLoopSchedule: 0 + UseDirect32XEmulation: false + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: false + UseGeneralizedNLCOneA: false + UseGeneralizedNLCOneB: false + UseGeneralizedNLCOneMXSA: false + UseGeneralizedNLCOneMXSB: false + UseGeneralizedNLCOneMetadata: false + UseInstOffsetForGRO: 0 + UseMFMAF32XEmulation: false + UsePLRPack: 0 + UseSgprForGRO: -1 + UseSubtileImpl: true + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 1 + VectorWidthMXSA: 1 + VectorWidthMXSB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMXSA: 0 + WaveSeparateGlobalReadMXSB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 16 + WorkGroupMappingXCC: 2 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 256 + _DepthUA: 256 + _DepthUB: 256 + _DepthUMXSA: 8 + _DepthUMXSB: 8 + _DepthUMetadata: 256 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + enableLDSTrMXSA: false + enableLDSTrMXSB: false + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + tailLoopOptMXSA: false + tailLoopOptMXSB: false + - 1LDSBuffer: 0 + 
ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AdaptiveGemm: 0 + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1DivByMT1LowbitGT1: 0 + AssertFree1ElementMultiple: 1 + AssertKRingShiftTailWrapOnly: 0 + AssertSummationElementMultiple: 256 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BAddrInterleave: false + BaseName: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT25s2SqPqjqQTrutnl4Fjsck3utFxYLnrt2GxRGoMabbq0= + BufferLoad: true + BufferStore: 1 + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 256 + DirectToLds: 1 + DirectToLdsA: true + DirectToLdsB: true + DirectToLdsMXSA: true + DirectToLdsMXSB: true + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprMXSA: false + DirectToVgprMXSB: false + DirectToVgprSparseMetadata: false + DtlPlusLdsBuf: 0 + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: false + ExpertSchedulingMode: 0 + ExtraLatencyForLR: 0 + ExtraMiLatencyLeft: -1 + ForceDisableShadowInit: false + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 32 + GlobalReadVectorWidthB: 32 + GlobalReadVectorWidthMXSA: 4 + GlobalReadVectorWidthMXSB: 4 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + KRingShift: false + Kernel: true + KernelLanguage: Assembly + KernelNameMin: 
Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT256x128x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT8_4_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 256 + LSCB: 256 + LSCMXSA: 8 + LSCMXSB: 8 + LSPA: 32 + LSPB: 32 + LSPMXSA: 128 + LSPMXSB: 128 + LVCA: 8 + LVCB: 8 + LVCMXSA: 2 + LVCMXSB: 2 + LVPA: 1 + LVPB: 1 + LVPMXSA: 32 + LVPMXSB: 32 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsBlockSizePerPadMXSA: 0 + LdsBlockSizePerPadMXSB: 0 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 117760 + LdsInitCVgprs: false + LdsNumBytes: 117760 + LdsNumElementsAlignedA: 32768 + LdsNumElementsAlignedB: 16384 + LdsNumElementsAlignedMXSA: 2048 + LdsNumElementsAlignedMXSB: 1024 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 35840 + LdsOffsetB_Blk: 101376 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMXSA: 32768 + LdsOffsetMXSA_Blk: 98304 + LdsOffsetMXSB: 34816 + LdsOffsetMXSB_Blk: 100352 + LdsOffsetMetadata: 35840 + LdsOffsetMetadata_Blk: 101376 + LdsPadA: 0 + LdsPadB: 0 + LdsPadMXSA: 0 + LdsPadMXSB: 0 + LdsPadMetadata: 0 + LocalReadVectorWidth: -1 + LocalReadVectorWidthA: 32 + LocalReadVectorWidthB: 32 + LocalReadVectorWidthMXSA: 1 + LocalReadVectorWidthMXSB: 1 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LocalWriteUseSgprMXSA: true + LocalWriteUseSgprMXSB: true + LoopIters: 2 + LoopUnroll: 256 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + 
MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMXSA: 1 + MIInputPerThreadMXSB: 1 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [8, 4] + MIWaveTileA: 8 + MIWaveTileB: 4 + MIWaveTileMXSA: 8 + MIWaveTileMXSB: 4 + MIWaveTileMetadata: 0 + MacroTile0: 256 + MacroTile1: 128 + MacroTileA: 256 + MacroTileB: 128 + MacroTileMXSA: 256 + MacroTileMXSB: 128 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + MinGRIncPerMfma: 3 + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonDTLTailLoopMXSA: true + NonDTLTailLoopMXSB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 4 + NonTemporalE: 0 + NonTemporalMXSA: 0 + NonTemporalMXSB: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 128 + NumLdsBlk: 2 + NumLoadsA: 8 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsCoalescedMXSA: 1 + NumLoadsCoalescedMXSB: 1 + NumLoadsMXSA: 2 + NumLoadsMXSB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 4 + NumLoadsPerpendicularMXSA: 2 + NumLoadsPerpendicularMXSB: 1 + NumThreads: 256 + NumTotalPackedLoadsA: -1 + NumTotalPackedLoadsB: -1 + NumTotalPackedLoadsMXSA: -1 + NumTotalPackedLoadsMXSB: -1 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGROverBarrier: 0 + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 13 + 
SolutionNameMin: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT256x128x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT8_4_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM16_WGMXCC2_WGMXCCGn1 + SourceSwap: false + SpaceFillingAlgo: [] + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 4 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SubGroupMXSA: 8 + SubGroupMXSB: 32 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: 0 + TailloopInNll: false + ThreadTile: [1, 1] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + ThreadTileMXSA: 32 + ThreadTileMXSB: 4 + TransposeLDS: 1 + TransposeLDSMetadata: -1 + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMXSA: true + UnrollMajorLDSMXSB: true + Use64bShadowLimit: false + Use64bShadowLimitMX: false + UseCustomMainLoopSchedule: 0 + UseDirect32XEmulation: false + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: false + UseGeneralizedNLCOneA: false + UseGeneralizedNLCOneB: false + UseGeneralizedNLCOneMXSA: false + UseGeneralizedNLCOneMXSB: false + UseGeneralizedNLCOneMetadata: false + UseInstOffsetForGRO: 0 + UseMFMAF32XEmulation: false + UsePLRPack: 0 + UseSgprForGRO: -1 + UseSubtileImpl: true + Valid: true + VectorStore: -1 + VectorWidthA: 1 + 
VectorWidthB: 1 + VectorWidthMXSA: 1 + VectorWidthMXSB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMXSA: 0 + WaveSeparateGlobalReadMXSB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 16 + WorkGroupMappingXCC: 2 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 256 + _DepthUA: 256 + _DepthUB: 256 + _DepthUMXSA: 8 + _DepthUMXSB: 8 + _DepthUMetadata: 256 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + enableLDSTrMXSA: false + enableLDSTrMXSB: false + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + tailLoopOptMXSA: false + tailLoopOptMXSB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AdaptiveGemm: 0 + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1DivByMT1LowbitGT1: 0 + AssertFree1ElementMultiple: 1 + AssertKRingShiftTailWrapOnly: 0 + AssertSummationElementMultiple: 256 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BAddrInterleave: false + BaseName: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT19--ULatdGkaEyMxugPoO0m7KB92W8g0H0S_IE47lq-mE= + BufferLoad: true + BufferStore: 1 + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 256 + DirectToLds: 1 + DirectToLdsA: true + DirectToLdsB: true + DirectToLdsMXSA: true + DirectToLdsMXSB: true + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprMXSA: false + DirectToVgprMXSB: false + DirectToVgprSparseMetadata: false + 
DtlPlusLdsBuf: 0 + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: false + ExpertSchedulingMode: 0 + ExtraLatencyForLR: 0 + ExtraMiLatencyLeft: -1 + ForceDisableShadowInit: false + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 32 + GlobalReadVectorWidthB: 32 + GlobalReadVectorWidthMXSA: 6 + GlobalReadVectorWidthMXSB: 4 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + KRingShift: false + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT192x256x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT6_8_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 256 + LSCB: 256 + LSCMXSA: 8 + LSCMXSB: 8 + LSPA: 32 + LSPB: 32 + LSPMXSA: 192 + LSPMXSB: 128 + LVCA: 8 + LVCB: 8 + LVCMXSA: 2 + LVCMXSB: 2 + LVPA: 1 + LVPB: 1 + LVPMXSA: 32 + LVPMXSB: 32 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsBlockSizePerPadMXSA: 0 + LdsBlockSizePerPadMXSB: 0 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 126464 + LdsInitCVgprs: false + LdsNumBytes: 126464 + 
LdsNumElementsAlignedA: 24576 + LdsNumElementsAlignedB: 32768 + LdsNumElementsAlignedMXSA: 1536 + LdsNumElementsAlignedMXSB: 2048 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 28160 + LdsOffsetB_Blk: 93696 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMXSA: 24576 + LdsOffsetMXSA_Blk: 90112 + LdsOffsetMXSB: 26112 + LdsOffsetMXSB_Blk: 91648 + LdsOffsetMetadata: 28160 + LdsOffsetMetadata_Blk: 93696 + LdsPadA: 0 + LdsPadB: 0 + LdsPadMXSA: 0 + LdsPadMXSB: 0 + LdsPadMetadata: 0 + LocalReadVectorWidth: -1 + LocalReadVectorWidthA: 32 + LocalReadVectorWidthB: 32 + LocalReadVectorWidthMXSA: 1 + LocalReadVectorWidthMXSB: 1 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LocalWriteUseSgprMXSA: true + LocalWriteUseSgprMXSB: true + LoopIters: 2 + LoopUnroll: 256 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMXSA: 1 + MIInputPerThreadMXSB: 1 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [6, 8] + MIWaveTileA: 6 + MIWaveTileB: 8 + MIWaveTileMXSA: 6 + MIWaveTileMXSB: 8 + MIWaveTileMetadata: 0 + MacroTile0: 192 + MacroTile1: 256 + MacroTileA: 192 + MacroTileB: 256 + MacroTileMXSA: 192 + MacroTileMXSB: 256 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + MinGRIncPerMfma: 3 + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonDTLTailLoopMXSA: true + NonDTLTailLoopMXSB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 4 + NonTemporalE: 
0 + NonTemporalMXSA: 0 + NonTemporalMXSB: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 192 + NumGlobalWriteVectorsPerThread: 192 + NumLdsBlk: 2 + NumLoadsA: 6 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsCoalescedMXSA: 1 + NumLoadsCoalescedMXSB: 1 + NumLoadsMXSA: 1 + NumLoadsMXSB: 2 + NumLoadsPerpendicularA: 6 + NumLoadsPerpendicularB: 8 + NumLoadsPerpendicularMXSA: 1 + NumLoadsPerpendicularMXSB: 2 + NumThreads: 256 + NumTotalPackedLoadsA: -1 + NumTotalPackedLoadsB: -1 + NumTotalPackedLoadsMXSA: -1 + NumTotalPackedLoadsMXSB: -1 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGROverBarrier: 0 + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 14 + SolutionNameMin: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT192x256x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT6_8_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM16_WGMXCC2_WGMXCCGn1 + SourceSwap: false + SpaceFillingAlgo: [] + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 4 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + 
SubGroupMXSA: 8 + SubGroupMXSB: 32 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: 0 + TailloopInNll: false + ThreadTile: [1, 1] + ThreadTile0: 24 + ThreadTile1: 8 + ThreadTileA: 24 + ThreadTileB: 8 + ThreadTileMXSA: 24 + ThreadTileMXSB: 8 + TransposeLDS: 1 + TransposeLDSMetadata: -1 + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMXSA: true + UnrollMajorLDSMXSB: true + Use64bShadowLimit: false + Use64bShadowLimitMX: false + UseCustomMainLoopSchedule: 0 + UseDirect32XEmulation: false + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: false + UseGeneralizedNLCOneA: false + UseGeneralizedNLCOneB: false + UseGeneralizedNLCOneMXSA: false + UseGeneralizedNLCOneMXSB: false + UseGeneralizedNLCOneMetadata: false + UseInstOffsetForGRO: 0 + UseMFMAF32XEmulation: false + UsePLRPack: 0 + UseSgprForGRO: -1 + UseSubtileImpl: true + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 1 + VectorWidthMXSA: 1 + VectorWidthMXSB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMXSA: 0 + WaveSeparateGlobalReadMXSB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 16 + WorkGroupMappingXCC: 2 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 256 + _DepthUA: 256 + _DepthUB: 256 + _DepthUMXSA: 8 + _DepthUMXSB: 8 + _DepthUMetadata: 256 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + enableLDSTrMXSA: false + enableLDSTrMXSB: false + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + tailLoopOptMXSA: false + tailLoopOptMXSB: false + - 1LDSBuffer: 0 + 
ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AdaptiveGemm: 0 + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1DivByMT1LowbitGT1: 0 + AssertFree1ElementMultiple: 1 + AssertKRingShiftTailWrapOnly: 0 + AssertSummationElementMultiple: 256 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BAddrInterleave: false + BaseName: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT25H-6yQcCGxSKwn5gmGJAHA93oK-ekIRJm5JqDOya5ASk= + BufferLoad: true + BufferStore: 1 + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 256 + DirectToLds: 1 + DirectToLdsA: true + DirectToLdsB: true + DirectToLdsMXSA: true + DirectToLdsMXSB: true + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprMXSA: false + DirectToVgprMXSB: false + DirectToVgprSparseMetadata: false + DtlPlusLdsBuf: 0 + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: false + ExpertSchedulingMode: 0 + ExtraLatencyForLR: 0 + ExtraMiLatencyLeft: -1 + ForceDisableShadowInit: false + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 32 + GlobalReadVectorWidthB: 32 + GlobalReadVectorWidthMXSA: 4 + GlobalReadVectorWidthMXSB: 6 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + KRingShift: false + Kernel: true + KernelLanguage: Assembly + KernelNameMin: 
Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT256x192x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT8_6_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 256 + LSCB: 256 + LSCMXSA: 8 + LSCMXSB: 8 + LSPA: 32 + LSPB: 32 + LSPMXSA: 128 + LSPMXSB: 192 + LVCA: 8 + LVCB: 8 + LVCMXSA: 2 + LVCMXSB: 2 + LVPA: 1 + LVPB: 1 + LVPMXSA: 32 + LVPMXSB: 32 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsBlockSizePerPadMXSA: 0 + LdsBlockSizePerPadMXSB: 0 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 126464 + LdsInitCVgprs: false + LdsNumBytes: 126464 + LdsNumElementsAlignedA: 32768 + LdsNumElementsAlignedB: 24576 + LdsNumElementsAlignedMXSA: 2048 + LdsNumElementsAlignedMXSB: 1536 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 36352 + LdsOffsetB_Blk: 101888 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMXSA: 32768 + LdsOffsetMXSA_Blk: 98304 + LdsOffsetMXSB: 34816 + LdsOffsetMXSB_Blk: 100352 + LdsOffsetMetadata: 36352 + LdsOffsetMetadata_Blk: 101888 + LdsPadA: 0 + LdsPadB: 0 + LdsPadMXSA: 0 + LdsPadMXSB: 0 + LdsPadMetadata: 0 + LocalReadVectorWidth: -1 + LocalReadVectorWidthA: 32 + LocalReadVectorWidthB: 32 + LocalReadVectorWidthMXSA: 1 + LocalReadVectorWidthMXSB: 1 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LocalWriteUseSgprMXSA: true + LocalWriteUseSgprMXSB: true + LoopIters: 2 + LoopUnroll: 256 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + 
MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMXSA: 1 + MIInputPerThreadMXSB: 1 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [8, 6] + MIWaveTileA: 8 + MIWaveTileB: 6 + MIWaveTileMXSA: 8 + MIWaveTileMXSB: 6 + MIWaveTileMetadata: 0 + MacroTile0: 256 + MacroTile1: 192 + MacroTileA: 256 + MacroTileB: 192 + MacroTileMXSA: 256 + MacroTileMXSB: 192 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + MinGRIncPerMfma: 3 + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonDTLTailLoopMXSA: true + NonDTLTailLoopMXSB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 4 + NonTemporalE: 0 + NonTemporalMXSA: 0 + NonTemporalMXSB: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 192 + NumGlobalWriteVectorsPerThread: 192 + NumLdsBlk: 2 + NumLoadsA: 8 + NumLoadsB: 6 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsCoalescedMXSA: 1 + NumLoadsCoalescedMXSB: 1 + NumLoadsMXSA: 2 + NumLoadsMXSB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 6 + NumLoadsPerpendicularMXSA: 2 + NumLoadsPerpendicularMXSB: 1 + NumThreads: 256 + NumTotalPackedLoadsA: -1 + NumTotalPackedLoadsB: -1 + NumTotalPackedLoadsMXSA: -1 + NumTotalPackedLoadsMXSB: -1 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGROverBarrier: 0 + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 15 + 
SolutionNameMin: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT256x192x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT8_6_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM16_WGMXCC2_WGMXCCGn1 + SourceSwap: false + SpaceFillingAlgo: [] + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 4 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SubGroupMXSA: 8 + SubGroupMXSB: 32 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: 0 + TailloopInNll: false + ThreadTile: [1, 1] + ThreadTile0: 32 + ThreadTile1: 6 + ThreadTileA: 32 + ThreadTileB: 6 + ThreadTileMXSA: 32 + ThreadTileMXSB: 6 + TransposeLDS: 1 + TransposeLDSMetadata: -1 + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMXSA: true + UnrollMajorLDSMXSB: true + Use64bShadowLimit: false + Use64bShadowLimitMX: false + UseCustomMainLoopSchedule: 0 + UseDirect32XEmulation: false + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: false + UseGeneralizedNLCOneA: false + UseGeneralizedNLCOneB: false + UseGeneralizedNLCOneMXSA: false + UseGeneralizedNLCOneMXSB: false + UseGeneralizedNLCOneMetadata: false + UseInstOffsetForGRO: 0 + UseMFMAF32XEmulation: false + UsePLRPack: 0 + UseSgprForGRO: -1 + UseSubtileImpl: true + Valid: true + VectorStore: -1 + VectorWidthA: 1 + 
VectorWidthB: 1 + VectorWidthMXSA: 1 + VectorWidthMXSB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMXSA: 0 + WaveSeparateGlobalReadMXSB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 16 + WorkGroupMappingXCC: 2 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 256 + _DepthUA: 256 + _DepthUB: 256 + _DepthUMXSA: 8 + _DepthUMXSB: 8 + _DepthUMetadata: 256 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + enableLDSTrMXSA: false + enableLDSTrMXSB: false + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + tailLoopOptMXSA: false + tailLoopOptMXSB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AdaptiveGemm: 0 + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1DivByMT1LowbitGT1: 0 + AssertFree1ElementMultiple: 1 + AssertKRingShiftTailWrapOnly: 0 + AssertSummationElementMultiple: 256 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BAddrInterleave: false + BaseName: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT649Xx58paO0aNamHRyhpgRWxjDG0kBNJ5sAnNiMy2USyI= + BufferLoad: true + BufferStore: 1 + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 256 + DirectToLds: 1 + DirectToLdsA: true + DirectToLdsB: true + DirectToLdsMXSA: true + DirectToLdsMXSB: true + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprMXSA: false + DirectToVgprMXSB: false + DirectToVgprSparseMetadata: false + 
DtlPlusLdsBuf: 0 + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: false + ExpertSchedulingMode: 0 + ExtraLatencyForLR: 0 + ExtraMiLatencyLeft: -1 + ForceDisableShadowInit: false + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 32 + GlobalReadVectorWidthB: 32 + GlobalReadVectorWidthMXSA: 2 + GlobalReadVectorWidthMXSB: 10 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + KRingShift: false + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT64x320x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT2_10_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 256 + LSCB: 256 + LSCMXSA: 8 + LSCMXSB: 8 + LSPA: 32 + LSPB: 32 + LSPMXSA: 64 + LSPMXSB: 320 + LVCA: 8 + LVCB: 8 + LVCMXSA: 4 + LVCMXSB: 1 + LVPA: 1 + LVPB: 1 + LVPMXSA: 32 + LVPMXSB: 32 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsBlockSizePerPadMXSA: 0 + LdsBlockSizePerPadMXSB: 0 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 117760 + LdsInitCVgprs: false + LdsNumBytes: 117760 + 
LdsNumElementsAlignedA: 8192 + LdsNumElementsAlignedB: 40960 + LdsNumElementsAlignedMXSA: 512 + LdsNumElementsAlignedMXSB: 2560 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 11264 + LdsOffsetB_Blk: 76800 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMXSA: 8192 + LdsOffsetMXSA_Blk: 73728 + LdsOffsetMXSB: 8704 + LdsOffsetMXSB_Blk: 74240 + LdsOffsetMetadata: 11264 + LdsOffsetMetadata_Blk: 76800 + LdsPadA: 0 + LdsPadB: 0 + LdsPadMXSA: 0 + LdsPadMXSB: 0 + LdsPadMetadata: 0 + LocalReadVectorWidth: -1 + LocalReadVectorWidthA: 32 + LocalReadVectorWidthB: 32 + LocalReadVectorWidthMXSA: 1 + LocalReadVectorWidthMXSB: 1 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LocalWriteUseSgprMXSA: true + LocalWriteUseSgprMXSB: true + LoopIters: 2 + LoopUnroll: 256 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMXSA: 1 + MIInputPerThreadMXSB: 1 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [2, 10] + MIWaveTileA: 2 + MIWaveTileB: 10 + MIWaveTileMXSA: 2 + MIWaveTileMXSB: 10 + MIWaveTileMetadata: 0 + MacroTile0: 64 + MacroTile1: 320 + MacroTileA: 64 + MacroTileB: 320 + MacroTileMXSA: 64 + MacroTileMXSB: 320 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + MinGRIncPerMfma: 3 + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonDTLTailLoopMXSA: true + NonDTLTailLoopMXSB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 4 + NonTemporalE: 0 + 
NonTemporalMXSA: 0 + NonTemporalMXSB: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 80 + NumGlobalWriteVectorsPerThread: 80 + NumLdsBlk: 2 + NumLoadsA: 2 + NumLoadsB: 10 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsCoalescedMXSA: 1 + NumLoadsCoalescedMXSB: 1 + NumLoadsMXSA: 1 + NumLoadsMXSB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 10 + NumLoadsPerpendicularMXSA: 1 + NumLoadsPerpendicularMXSB: 1 + NumThreads: 256 + NumTotalPackedLoadsA: -1 + NumTotalPackedLoadsB: -1 + NumTotalPackedLoadsMXSA: -1 + NumTotalPackedLoadsMXSB: -1 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGROverBarrier: 0 + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 16 + SolutionNameMin: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT64x320x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT2_10_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM16_WGMXCC2_WGMXCCGn1 + SourceSwap: false + SpaceFillingAlgo: [] + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 4 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SubGroupMXSA: 
8 + SubGroupMXSB: 32 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: 0 + TailloopInNll: false + ThreadTile: [1, 1] + ThreadTile0: 8 + ThreadTile1: 10 + ThreadTileA: 8 + ThreadTileB: 10 + ThreadTileMXSA: 8 + ThreadTileMXSB: 10 + TransposeLDS: 1 + TransposeLDSMetadata: -1 + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMXSA: true + UnrollMajorLDSMXSB: true + Use64bShadowLimit: false + Use64bShadowLimitMX: false + UseCustomMainLoopSchedule: 0 + UseDirect32XEmulation: false + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: false + UseGeneralizedNLCOneA: false + UseGeneralizedNLCOneB: false + UseGeneralizedNLCOneMXSA: false + UseGeneralizedNLCOneMXSB: false + UseGeneralizedNLCOneMetadata: false + UseInstOffsetForGRO: 0 + UseMFMAF32XEmulation: false + UsePLRPack: 0 + UseSgprForGRO: -1 + UseSubtileImpl: true + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 1 + VectorWidthMXSA: 1 + VectorWidthMXSB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMXSA: 0 + WaveSeparateGlobalReadMXSB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 16 + WorkGroupMappingXCC: 2 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 256 + _DepthUA: 256 + _DepthUB: 256 + _DepthUMXSA: 8 + _DepthUMXSB: 8 + _DepthUMetadata: 256 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + enableLDSTrMXSA: false + enableLDSTrMXSB: false + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + tailLoopOptMXSA: false + tailLoopOptMXSB: false + - 1LDSBuffer: 0 + ActivationAlt: false + 
ActivationFuncCall: false + ActivationFused: true + AdaptiveGemm: 0 + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1DivByMT1LowbitGT1: 0 + AssertFree1ElementMultiple: 1 + AssertKRingShiftTailWrapOnly: 0 + AssertSummationElementMultiple: 256 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BAddrInterleave: false + BaseName: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT32yn2lSZhgGG39-O0tN9XB-c6Nr5kcbdLKPNrKxqSEhVE= + BufferLoad: true + BufferStore: 1 + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 256 + DirectToLds: 1 + DirectToLdsA: true + DirectToLdsB: true + DirectToLdsMXSA: true + DirectToLdsMXSB: true + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprMXSA: false + DirectToVgprMXSB: false + DirectToVgprSparseMetadata: false + DtlPlusLdsBuf: 0 + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: false + ExpertSchedulingMode: 0 + ExtraLatencyForLR: 0 + ExtraMiLatencyLeft: -1 + ForceDisableShadowInit: false + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 32 + GlobalReadVectorWidthB: 32 + GlobalReadVectorWidthMXSA: 10 + GlobalReadVectorWidthMXSB: 2 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + KRingShift: false + Kernel: true + KernelLanguage: Assembly + KernelNameMin: 
Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT320x64x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT10_2_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 256 + LSCB: 256 + LSCMXSA: 8 + LSCMXSB: 8 + LSPA: 32 + LSPB: 32 + LSPMXSA: 320 + LSPMXSB: 64 + LVCA: 8 + LVCB: 8 + LVCMXSA: 1 + LVCMXSB: 4 + LVPA: 1 + LVPB: 1 + LVPMXSA: 32 + LVPMXSB: 32 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsBlockSizePerPadMXSA: 0 + LdsBlockSizePerPadMXSB: 0 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 117760 + LdsInitCVgprs: false + LdsNumBytes: 117760 + LdsNumElementsAlignedA: 40960 + LdsNumElementsAlignedB: 8192 + LdsNumElementsAlignedMXSA: 2560 + LdsNumElementsAlignedMXSB: 512 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 44032 + LdsOffsetB_Blk: 109568 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMXSA: 40960 + LdsOffsetMXSA_Blk: 106496 + LdsOffsetMXSB: 43520 + LdsOffsetMXSB_Blk: 109056 + LdsOffsetMetadata: 44032 + LdsOffsetMetadata_Blk: 109568 + LdsPadA: 0 + LdsPadB: 0 + LdsPadMXSA: 0 + LdsPadMXSB: 0 + LdsPadMetadata: 0 + LocalReadVectorWidth: -1 + LocalReadVectorWidthA: 32 + LocalReadVectorWidthB: 32 + LocalReadVectorWidthMXSA: 1 + LocalReadVectorWidthMXSB: 1 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LocalWriteUseSgprMXSA: true + LocalWriteUseSgprMXSB: true + LoopIters: 2 + LoopUnroll: 256 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + 
MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMXSA: 1 + MIInputPerThreadMXSB: 1 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [10, 2] + MIWaveTileA: 10 + MIWaveTileB: 2 + MIWaveTileMXSA: 10 + MIWaveTileMXSB: 2 + MIWaveTileMetadata: 0 + MacroTile0: 320 + MacroTile1: 64 + MacroTileA: 320 + MacroTileB: 64 + MacroTileMXSA: 320 + MacroTileMXSB: 64 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + MinGRIncPerMfma: 3 + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonDTLTailLoopMXSA: true + NonDTLTailLoopMXSB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 4 + NonTemporalE: 0 + NonTemporalMXSA: 0 + NonTemporalMXSB: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 80 + NumGlobalWriteVectorsPerThread: 80 + NumLdsBlk: 2 + NumLoadsA: 10 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsCoalescedMXSA: 1 + NumLoadsCoalescedMXSB: 1 + NumLoadsMXSA: 1 + NumLoadsMXSB: 1 + NumLoadsPerpendicularA: 10 + NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularMXSA: 1 + NumLoadsPerpendicularMXSB: 1 + NumThreads: 256 + NumTotalPackedLoadsA: -1 + NumTotalPackedLoadsB: -1 + NumTotalPackedLoadsMXSA: -1 + NumTotalPackedLoadsMXSB: -1 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGROverBarrier: 0 + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 17 + 
SolutionNameMin: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT320x64x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT10_2_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM16_WGMXCC2_WGMXCCGn1 + SourceSwap: false + SpaceFillingAlgo: [] + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 4 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SubGroupMXSA: 8 + SubGroupMXSB: 32 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: 0 + TailloopInNll: false + ThreadTile: [1, 1] + ThreadTile0: 40 + ThreadTile1: 2 + ThreadTileA: 40 + ThreadTileB: 2 + ThreadTileMXSA: 40 + ThreadTileMXSB: 2 + TransposeLDS: 1 + TransposeLDSMetadata: -1 + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMXSA: true + UnrollMajorLDSMXSB: true + Use64bShadowLimit: false + Use64bShadowLimitMX: false + UseCustomMainLoopSchedule: 0 + UseDirect32XEmulation: false + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: false + UseGeneralizedNLCOneA: false + UseGeneralizedNLCOneB: false + UseGeneralizedNLCOneMXSA: false + UseGeneralizedNLCOneMXSB: false + UseGeneralizedNLCOneMetadata: false + UseInstOffsetForGRO: 0 + UseMFMAF32XEmulation: false + UsePLRPack: 0 + UseSgprForGRO: -1 + UseSubtileImpl: true + Valid: true + VectorStore: -1 + VectorWidthA: 1 + 
VectorWidthB: 1 + VectorWidthMXSA: 1 + VectorWidthMXSB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMXSA: 0 + WaveSeparateGlobalReadMXSB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 16 + WorkGroupMappingXCC: 2 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 256 + _DepthUA: 256 + _DepthUB: 256 + _DepthUMXSA: 8 + _DepthUMXSB: 8 + _DepthUMetadata: 256 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + enableLDSTrMXSA: false + enableLDSTrMXSB: false + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + tailLoopOptMXSA: false + tailLoopOptMXSB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AdaptiveGemm: 0 + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1DivByMT1LowbitGT1: 0 + AssertFree1ElementMultiple: 1 + AssertKRingShiftTailWrapOnly: 0 + AssertSummationElementMultiple: 256 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BAddrInterleave: false + BaseName: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT64cx8jpYzIAVvV24Gqg4hoX8QdixxbIDmUlvaleHLuajs= + BufferLoad: true + BufferStore: 1 + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 256 + DirectToLds: 1 + DirectToLdsA: true + DirectToLdsB: true + DirectToLdsMXSA: true + DirectToLdsMXSB: true + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprMXSA: false + DirectToVgprMXSB: false + DirectToVgprSparseMetadata: false + 
DtlPlusLdsBuf: 0 + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: false + ExpertSchedulingMode: 0 + ExtraLatencyForLR: 0 + ExtraMiLatencyLeft: -1 + ForceDisableShadowInit: false + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 32 + GlobalReadVectorWidthB: 32 + GlobalReadVectorWidthMXSA: 2 + GlobalReadVectorWidthMXSB: 12 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + KRingShift: false + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT64x384x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT2_12_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 256 + LSCB: 256 + LSCMXSA: 8 + LSCMXSB: 8 + LSPA: 32 + LSPB: 32 + LSPMXSA: 64 + LSPMXSB: 384 + LVCA: 8 + LVCB: 8 + LVCMXSA: 4 + LVCMXSB: 1 + LVPA: 1 + LVPB: 1 + LVPMXSA: 32 + LVPMXSB: 32 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsBlockSizePerPadMXSA: 0 + LdsBlockSizePerPadMXSB: 0 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 126464 + LdsInitCVgprs: false + LdsNumBytes: 126464 + 
LdsNumElementsAlignedA: 8192 + LdsNumElementsAlignedB: 49152 + LdsNumElementsAlignedMXSA: 512 + LdsNumElementsAlignedMXSB: 3072 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 11776 + LdsOffsetB_Blk: 77312 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMXSA: 8192 + LdsOffsetMXSA_Blk: 73728 + LdsOffsetMXSB: 8704 + LdsOffsetMXSB_Blk: 74240 + LdsOffsetMetadata: 11776 + LdsOffsetMetadata_Blk: 77312 + LdsPadA: 0 + LdsPadB: 0 + LdsPadMXSA: 0 + LdsPadMXSB: 0 + LdsPadMetadata: 0 + LocalReadVectorWidth: -1 + LocalReadVectorWidthA: 32 + LocalReadVectorWidthB: 32 + LocalReadVectorWidthMXSA: 1 + LocalReadVectorWidthMXSB: 1 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LocalWriteUseSgprMXSA: true + LocalWriteUseSgprMXSB: true + LoopIters: 2 + LoopUnroll: 256 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMXSA: 1 + MIInputPerThreadMXSB: 1 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [2, 12] + MIWaveTileA: 2 + MIWaveTileB: 12 + MIWaveTileMXSA: 2 + MIWaveTileMXSB: 12 + MIWaveTileMetadata: 0 + MacroTile0: 64 + MacroTile1: 384 + MacroTileA: 64 + MacroTileB: 384 + MacroTileMXSA: 64 + MacroTileMXSB: 384 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + MinGRIncPerMfma: 3 + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonDTLTailLoopMXSA: true + NonDTLTailLoopMXSB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 4 + NonTemporalE: 0 + 
NonTemporalMXSA: 0 + NonTemporalMXSB: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 96 + NumGlobalWriteVectorsPerThread: 96 + NumLdsBlk: 2 + NumLoadsA: 2 + NumLoadsB: 12 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsCoalescedMXSA: 1 + NumLoadsCoalescedMXSB: 1 + NumLoadsMXSA: 1 + NumLoadsMXSB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 12 + NumLoadsPerpendicularMXSA: 1 + NumLoadsPerpendicularMXSB: 1 + NumThreads: 256 + NumTotalPackedLoadsA: -1 + NumTotalPackedLoadsB: -1 + NumTotalPackedLoadsMXSA: -1 + NumTotalPackedLoadsMXSB: -1 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGROverBarrier: 0 + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 18 + SolutionNameMin: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT64x384x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT2_12_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM16_WGMXCC2_WGMXCCGn1 + SourceSwap: false + SpaceFillingAlgo: [] + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 4 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SubGroupMXSA: 
8 + SubGroupMXSB: 32 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: 0 + TailloopInNll: false + ThreadTile: [1, 1] + ThreadTile0: 8 + ThreadTile1: 12 + ThreadTileA: 8 + ThreadTileB: 12 + ThreadTileMXSA: 8 + ThreadTileMXSB: 12 + TransposeLDS: 1 + TransposeLDSMetadata: -1 + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMXSA: true + UnrollMajorLDSMXSB: true + Use64bShadowLimit: false + Use64bShadowLimitMX: false + UseCustomMainLoopSchedule: 0 + UseDirect32XEmulation: false + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: false + UseGeneralizedNLCOneA: false + UseGeneralizedNLCOneB: false + UseGeneralizedNLCOneMXSA: false + UseGeneralizedNLCOneMXSB: false + UseGeneralizedNLCOneMetadata: false + UseInstOffsetForGRO: 0 + UseMFMAF32XEmulation: false + UsePLRPack: 0 + UseSgprForGRO: -1 + UseSubtileImpl: true + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 1 + VectorWidthMXSA: 1 + VectorWidthMXSB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMXSA: 0 + WaveSeparateGlobalReadMXSB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 16 + WorkGroupMappingXCC: 2 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 256 + _DepthUA: 256 + _DepthUB: 256 + _DepthUMXSA: 8 + _DepthUMXSB: 8 + _DepthUMetadata: 256 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + enableLDSTrMXSA: false + enableLDSTrMXSB: false + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + tailLoopOptMXSA: false + tailLoopOptMXSB: false + - 1LDSBuffer: 0 + ActivationAlt: false + 
ActivationFuncCall: false + ActivationFused: true + AdaptiveGemm: 0 + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1DivByMT1LowbitGT1: 0 + AssertFree1ElementMultiple: 1 + AssertKRingShiftTailWrapOnly: 0 + AssertSummationElementMultiple: 256 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BAddrInterleave: false + BaseName: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT38m0Bm4hH2y_fb6-eo4XZRKb5dkkBhjv6q0Rd-CAzTq-Q= + BufferLoad: true + BufferStore: 1 + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 256 + DirectToLds: 1 + DirectToLdsA: true + DirectToLdsB: true + DirectToLdsMXSA: true + DirectToLdsMXSB: true + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprMXSA: false + DirectToVgprMXSB: false + DirectToVgprSparseMetadata: false + DtlPlusLdsBuf: 0 + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: false + ExpertSchedulingMode: 0 + ExtraLatencyForLR: 0 + ExtraMiLatencyLeft: -1 + ForceDisableShadowInit: false + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 32 + GlobalReadVectorWidthB: 32 + GlobalReadVectorWidthMXSA: 12 + GlobalReadVectorWidthMXSB: 2 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + KRingShift: false + Kernel: true + KernelLanguage: Assembly + KernelNameMin: 
Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT384x64x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT12_2_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 256 + LSCB: 256 + LSCMXSA: 8 + LSCMXSB: 8 + LSPA: 32 + LSPB: 32 + LSPMXSA: 384 + LSPMXSB: 64 + LVCA: 8 + LVCB: 8 + LVCMXSA: 1 + LVCMXSB: 4 + LVPA: 1 + LVPB: 1 + LVPMXSA: 32 + LVPMXSB: 32 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsBlockSizePerPadMXSA: 0 + LdsBlockSizePerPadMXSB: 0 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 126464 + LdsInitCVgprs: false + LdsNumBytes: 126464 + LdsNumElementsAlignedA: 49152 + LdsNumElementsAlignedB: 8192 + LdsNumElementsAlignedMXSA: 3072 + LdsNumElementsAlignedMXSB: 512 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 52736 + LdsOffsetB_Blk: 118272 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMXSA: 49152 + LdsOffsetMXSA_Blk: 114688 + LdsOffsetMXSB: 52224 + LdsOffsetMXSB_Blk: 117760 + LdsOffsetMetadata: 52736 + LdsOffsetMetadata_Blk: 118272 + LdsPadA: 0 + LdsPadB: 0 + LdsPadMXSA: 0 + LdsPadMXSB: 0 + LdsPadMetadata: 0 + LocalReadVectorWidth: -1 + LocalReadVectorWidthA: 32 + LocalReadVectorWidthB: 32 + LocalReadVectorWidthMXSA: 1 + LocalReadVectorWidthMXSB: 1 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LocalWriteUseSgprMXSA: true + LocalWriteUseSgprMXSB: true + LoopIters: 2 + LoopUnroll: 256 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + 
MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMXSA: 1 + MIInputPerThreadMXSB: 1 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [12, 2] + MIWaveTileA: 12 + MIWaveTileB: 2 + MIWaveTileMXSA: 12 + MIWaveTileMXSB: 2 + MIWaveTileMetadata: 0 + MacroTile0: 384 + MacroTile1: 64 + MacroTileA: 384 + MacroTileB: 64 + MacroTileMXSA: 384 + MacroTileMXSB: 64 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + MinGRIncPerMfma: 3 + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonDTLTailLoopMXSA: true + NonDTLTailLoopMXSB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 4 + NonTemporalE: 0 + NonTemporalMXSA: 0 + NonTemporalMXSB: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 96 + NumGlobalWriteVectorsPerThread: 96 + NumLdsBlk: 2 + NumLoadsA: 12 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsCoalescedMXSA: 1 + NumLoadsCoalescedMXSB: 1 + NumLoadsMXSA: 1 + NumLoadsMXSB: 1 + NumLoadsPerpendicularA: 12 + NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularMXSA: 1 + NumLoadsPerpendicularMXSB: 1 + NumThreads: 256 + NumTotalPackedLoadsA: -1 + NumTotalPackedLoadsB: -1 + NumTotalPackedLoadsMXSA: -1 + NumTotalPackedLoadsMXSB: -1 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGROverBarrier: 0 + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 19 + 
SolutionNameMin: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT384x64x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT12_2_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM16_WGMXCC2_WGMXCCGn1 + SourceSwap: false + SpaceFillingAlgo: [] + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 4 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SubGroupMXSA: 8 + SubGroupMXSB: 32 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: 0 + TailloopInNll: false + ThreadTile: [1, 1] + ThreadTile0: 48 + ThreadTile1: 2 + ThreadTileA: 48 + ThreadTileB: 2 + ThreadTileMXSA: 48 + ThreadTileMXSB: 2 + TransposeLDS: 1 + TransposeLDSMetadata: -1 + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMXSA: true + UnrollMajorLDSMXSB: true + Use64bShadowLimit: false + Use64bShadowLimitMX: false + UseCustomMainLoopSchedule: 0 + UseDirect32XEmulation: false + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: false + UseGeneralizedNLCOneA: false + UseGeneralizedNLCOneB: false + UseGeneralizedNLCOneMXSA: false + UseGeneralizedNLCOneMXSB: false + UseGeneralizedNLCOneMetadata: false + UseInstOffsetForGRO: 0 + UseMFMAF32XEmulation: false + UsePLRPack: 0 + UseSgprForGRO: -1 + UseSubtileImpl: true + Valid: true + VectorStore: -1 + VectorWidthA: 1 + 
VectorWidthB: 1 + VectorWidthMXSA: 1 + VectorWidthMXSB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMXSA: 0 + WaveSeparateGlobalReadMXSB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 16 + WorkGroupMappingXCC: 2 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 256 + _DepthUA: 256 + _DepthUB: 256 + _DepthUMXSA: 8 + _DepthUMXSB: 8 + _DepthUMetadata: 256 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + enableLDSTrMXSA: false + enableLDSTrMXSB: false + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + tailLoopOptMXSA: false + tailLoopOptMXSB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AdaptiveGemm: 0 + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1DivByMT1LowbitGT1: 0 + AssertFree1ElementMultiple: 1 + AssertKRingShiftTailWrapOnly: 0 + AssertSummationElementMultiple: 256 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BAddrInterleave: false + BaseName: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT64ghBzLJyXgx9oTwgzYUUty2EUDAM9Q1qjBKNRjSDH1Qg= + BufferLoad: true + BufferStore: 1 + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 256 + DirectToLds: 1 + DirectToLdsA: true + DirectToLdsB: true + DirectToLdsMXSA: true + DirectToLdsMXSB: true + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprMXSA: false + DirectToVgprMXSB: false + DirectToVgprSparseMetadata: false + 
DtlPlusLdsBuf: 0 + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: false + ExpertSchedulingMode: 0 + ExtraLatencyForLR: 0 + ExtraMiLatencyLeft: -1 + ForceDisableShadowInit: false + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 32 + GlobalReadVectorWidthB: 32 + GlobalReadVectorWidthMXSA: 2 + GlobalReadVectorWidthMXSB: 16 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + KRingShift: false + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT64x512x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT2_16_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 256 + LSCB: 256 + LSCMXSA: 8 + LSCMXSB: 8 + LSPA: 32 + LSPB: 32 + LSPMXSA: 64 + LSPMXSB: 512 + LVCA: 8 + LVCB: 8 + LVCMXSA: 4 + LVCMXSB: 1 + LVPA: 1 + LVPB: 1 + LVPMXSA: 32 + LVPMXSB: 32 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsBlockSizePerPadMXSA: 0 + LdsBlockSizePerPadMXSB: 0 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 156672 + LdsInitCVgprs: false + LdsNumBytes: 156672 + 
LdsNumElementsAlignedA: 8192 + LdsNumElementsAlignedB: 65536 + LdsNumElementsAlignedMXSA: 512 + LdsNumElementsAlignedMXSB: 4096 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 78336 + LdsOffsetB: 12800 + LdsOffsetB_Blk: 91136 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMXSA: 8192 + LdsOffsetMXSA_Blk: 86528 + LdsOffsetMXSB: 8704 + LdsOffsetMXSB_Blk: 87040 + LdsOffsetMetadata: 12800 + LdsOffsetMetadata_Blk: 91136 + LdsPadA: 0 + LdsPadB: 0 + LdsPadMXSA: 0 + LdsPadMXSB: 0 + LdsPadMetadata: 0 + LocalReadVectorWidth: -1 + LocalReadVectorWidthA: 32 + LocalReadVectorWidthB: 32 + LocalReadVectorWidthMXSA: 1 + LocalReadVectorWidthMXSB: 1 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LocalWriteUseSgprMXSA: true + LocalWriteUseSgprMXSB: true + LoopIters: 2 + LoopUnroll: 256 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMXSA: 1 + MIInputPerThreadMXSB: 1 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [2, 16] + MIWaveTileA: 2 + MIWaveTileB: 16 + MIWaveTileMXSA: 2 + MIWaveTileMXSB: 16 + MIWaveTileMetadata: 0 + MacroTile0: 64 + MacroTile1: 512 + MacroTileA: 64 + MacroTileB: 512 + MacroTileMXSA: 64 + MacroTileMXSB: 512 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + MinGRIncPerMfma: 3 + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonDTLTailLoopMXSA: true + NonDTLTailLoopMXSB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 4 + NonTemporalE: 0 + 
NonTemporalMXSA: 0 + NonTemporalMXSB: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 128 + NumLdsBlk: 2 + NumLoadsA: 2 + NumLoadsB: 16 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsCoalescedMXSA: 1 + NumLoadsCoalescedMXSB: 1 + NumLoadsMXSA: 1 + NumLoadsMXSB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 16 + NumLoadsPerpendicularMXSA: 1 + NumLoadsPerpendicularMXSB: 1 + NumThreads: 256 + NumTotalPackedLoadsA: -1 + NumTotalPackedLoadsB: -1 + NumTotalPackedLoadsMXSA: -1 + NumTotalPackedLoadsMXSB: -1 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGROverBarrier: 0 + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 20 + SolutionNameMin: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT64x512x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT2_16_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM16_WGMXCC2_WGMXCCGn1 + SourceSwap: false + SpaceFillingAlgo: [] + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: true + StoreSyncOpt: 0 + StoreVectorWidth: 4 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + 
SubGroupMXSA: 8 + SubGroupMXSB: 32 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: 0 + TailloopInNll: false + ThreadTile: [1, 1] + ThreadTile0: 8 + ThreadTile1: 16 + ThreadTileA: 8 + ThreadTileB: 16 + ThreadTileMXSA: 8 + ThreadTileMXSB: 16 + TransposeLDS: 1 + TransposeLDSMetadata: -1 + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMXSA: true + UnrollMajorLDSMXSB: true + Use64bShadowLimit: false + Use64bShadowLimitMX: false + UseCustomMainLoopSchedule: 0 + UseDirect32XEmulation: false + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: false + UseGeneralizedNLCOneA: false + UseGeneralizedNLCOneB: false + UseGeneralizedNLCOneMXSA: false + UseGeneralizedNLCOneMXSB: false + UseGeneralizedNLCOneMetadata: false + UseInstOffsetForGRO: 0 + UseMFMAF32XEmulation: false + UsePLRPack: 0 + UseSgprForGRO: -1 + UseSubtileImpl: true + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 1 + VectorWidthMXSA: 1 + VectorWidthMXSB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMXSA: 0 + WaveSeparateGlobalReadMXSB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 16 + WorkGroupMappingXCC: 2 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 256 + _DepthUA: 256 + _DepthUB: 256 + _DepthUMXSA: 8 + _DepthUMXSB: 8 + _DepthUMetadata: 256 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + enableLDSTrMXSA: false + enableLDSTrMXSB: false + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + tailLoopOptMXSA: false + tailLoopOptMXSB: false + - 1LDSBuffer: 0 + 
ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AdaptiveGemm: 0 + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1DivByMT1LowbitGT1: 0 + AssertFree1ElementMultiple: 1 + AssertKRingShiftTailWrapOnly: 0 + AssertSummationElementMultiple: 256 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BAddrInterleave: false + BaseName: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT51N7xP8nenZXqX1PoWvPtuxa4rTpP69DwZBgRtHN8axrQ= + BufferLoad: true + BufferStore: 1 + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 256 + DirectToLds: 1 + DirectToLdsA: true + DirectToLdsB: true + DirectToLdsMXSA: true + DirectToLdsMXSB: true + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprMXSA: false + DirectToVgprMXSB: false + DirectToVgprSparseMetadata: false + DtlPlusLdsBuf: 0 + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: false + ExpertSchedulingMode: 0 + ExtraLatencyForLR: 0 + ExtraMiLatencyLeft: -1 + ForceDisableShadowInit: false + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 32 + GlobalReadVectorWidthB: 32 + GlobalReadVectorWidthMXSA: 16 + GlobalReadVectorWidthMXSB: 2 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + KRingShift: false + Kernel: true + KernelLanguage: Assembly + KernelNameMin: 
Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT512x64x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT16_2_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 256 + LSCB: 256 + LSCMXSA: 8 + LSCMXSB: 8 + LSPA: 32 + LSPB: 32 + LSPMXSA: 512 + LSPMXSB: 64 + LVCA: 8 + LVCB: 8 + LVCMXSA: 1 + LVCMXSB: 4 + LVPA: 1 + LVPB: 1 + LVPMXSA: 32 + LVPMXSB: 32 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsBlockSizePerPadMXSA: 0 + LdsBlockSizePerPadMXSB: 0 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 156672 + LdsInitCVgprs: false + LdsNumBytes: 156672 + LdsNumElementsAlignedA: 65536 + LdsNumElementsAlignedB: 8192 + LdsNumElementsAlignedMXSA: 4096 + LdsNumElementsAlignedMXSB: 512 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 78336 + LdsOffsetB: 70144 + LdsOffsetB_Blk: 148480 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMXSA: 65536 + LdsOffsetMXSA_Blk: 143872 + LdsOffsetMXSB: 69632 + LdsOffsetMXSB_Blk: 147968 + LdsOffsetMetadata: 70144 + LdsOffsetMetadata_Blk: 148480 + LdsPadA: 0 + LdsPadB: 0 + LdsPadMXSA: 0 + LdsPadMXSB: 0 + LdsPadMetadata: 0 + LocalReadVectorWidth: -1 + LocalReadVectorWidthA: 32 + LocalReadVectorWidthB: 32 + LocalReadVectorWidthMXSA: 1 + LocalReadVectorWidthMXSB: 1 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LocalWriteUseSgprMXSA: true + LocalWriteUseSgprMXSB: true + LoopIters: 2 + LoopUnroll: 256 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + 
MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMXSA: 1 + MIInputPerThreadMXSB: 1 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [16, 2] + MIWaveTileA: 16 + MIWaveTileB: 2 + MIWaveTileMXSA: 16 + MIWaveTileMXSB: 2 + MIWaveTileMetadata: 0 + MacroTile0: 512 + MacroTile1: 64 + MacroTileA: 512 + MacroTileB: 64 + MacroTileMXSA: 512 + MacroTileMXSB: 64 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + MinGRIncPerMfma: 3 + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonDTLTailLoopMXSA: true + NonDTLTailLoopMXSB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 4 + NonTemporalE: 0 + NonTemporalMXSA: 0 + NonTemporalMXSB: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 128 + NumLdsBlk: 2 + NumLoadsA: 16 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsCoalescedMXSA: 1 + NumLoadsCoalescedMXSB: 1 + NumLoadsMXSA: 1 + NumLoadsMXSB: 1 + NumLoadsPerpendicularA: 16 + NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularMXSA: 1 + NumLoadsPerpendicularMXSB: 1 + NumThreads: 256 + NumTotalPackedLoadsA: -1 + NumTotalPackedLoadsB: -1 + NumTotalPackedLoadsMXSA: -1 + NumTotalPackedLoadsMXSB: -1 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGROverBarrier: 0 + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 21 + 
SolutionNameMin: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT512x64x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT16_2_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM16_WGMXCC2_WGMXCCGn1 + SourceSwap: false + SpaceFillingAlgo: [] + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: true + StoreSyncOpt: 0 + StoreVectorWidth: 4 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SubGroupMXSA: 8 + SubGroupMXSB: 32 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: 0 + TailloopInNll: false + ThreadTile: [1, 1] + ThreadTile0: 64 + ThreadTile1: 2 + ThreadTileA: 64 + ThreadTileB: 2 + ThreadTileMXSA: 64 + ThreadTileMXSB: 2 + TransposeLDS: 1 + TransposeLDSMetadata: -1 + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMXSA: true + UnrollMajorLDSMXSB: true + Use64bShadowLimit: false + Use64bShadowLimitMX: false + UseCustomMainLoopSchedule: 0 + UseDirect32XEmulation: false + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: false + UseGeneralizedNLCOneA: false + UseGeneralizedNLCOneB: false + UseGeneralizedNLCOneMXSA: false + UseGeneralizedNLCOneMXSB: false + UseGeneralizedNLCOneMetadata: false + UseInstOffsetForGRO: 0 + UseMFMAF32XEmulation: false + UsePLRPack: 0 + UseSgprForGRO: -1 + UseSubtileImpl: true + Valid: true + VectorStore: -1 + VectorWidthA: 1 + 
VectorWidthB: 1 + VectorWidthMXSA: 1 + VectorWidthMXSB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMXSA: 0 + WaveSeparateGlobalReadMXSB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 16 + WorkGroupMappingXCC: 2 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 256 + _DepthUA: 256 + _DepthUB: 256 + _DepthUMXSA: 8 + _DepthUMXSB: 8 + _DepthUMetadata: 256 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + enableLDSTrMXSA: false + enableLDSTrMXSB: false + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + tailLoopOptMXSA: false + tailLoopOptMXSB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AdaptiveGemm: 0 + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1DivByMT1LowbitGT1: 0 + AssertFree1ElementMultiple: 1 + AssertKRingShiftTailWrapOnly: 0 + AssertSummationElementMultiple: 256 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BAddrInterleave: false + BaseName: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT12Tzx41lFmgkHO5_GDlXlBjYPnuHfWnU6iLujoAD27WHo= + BufferLoad: true + BufferStore: 1 + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 256 + DirectToLds: 1 + DirectToLdsA: true + DirectToLdsB: true + DirectToLdsMXSA: true + DirectToLdsMXSB: true + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprMXSA: false + DirectToVgprMXSB: false + DirectToVgprSparseMetadata: false + 
DtlPlusLdsBuf: 0 + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: false + ExpertSchedulingMode: 0 + ExtraLatencyForLR: 0 + ExtraMiLatencyLeft: -1 + ForceDisableShadowInit: false + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 32 + GlobalReadVectorWidthB: 32 + GlobalReadVectorWidthMXSA: 4 + GlobalReadVectorWidthMXSB: 10 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + KRingShift: false + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT128x320x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT4_10_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 256 + LSCB: 256 + LSCMXSA: 8 + LSCMXSB: 8 + LSPA: 32 + LSPB: 32 + LSPMXSA: 128 + LSPMXSB: 320 + LVCA: 8 + LVCB: 8 + LVCMXSA: 2 + LVCMXSB: 1 + LVPA: 1 + LVPB: 1 + LVPMXSA: 32 + LVPMXSB: 32 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsBlockSizePerPadMXSA: 0 + LdsBlockSizePerPadMXSB: 0 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 126464 + LdsInitCVgprs: false + LdsNumBytes: 126464 + 
LdsNumElementsAlignedA: 16384 + LdsNumElementsAlignedB: 40960 + LdsNumElementsAlignedMXSA: 1024 + LdsNumElementsAlignedMXSB: 2560 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 19968 + LdsOffsetB_Blk: 85504 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMXSA: 16384 + LdsOffsetMXSA_Blk: 81920 + LdsOffsetMXSB: 17408 + LdsOffsetMXSB_Blk: 82944 + LdsOffsetMetadata: 19968 + LdsOffsetMetadata_Blk: 85504 + LdsPadA: 0 + LdsPadB: 0 + LdsPadMXSA: 0 + LdsPadMXSB: 0 + LdsPadMetadata: 0 + LocalReadVectorWidth: -1 + LocalReadVectorWidthA: 32 + LocalReadVectorWidthB: 32 + LocalReadVectorWidthMXSA: 1 + LocalReadVectorWidthMXSB: 1 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LocalWriteUseSgprMXSA: true + LocalWriteUseSgprMXSB: true + LoopIters: 2 + LoopUnroll: 256 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMXSA: 1 + MIInputPerThreadMXSB: 1 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 10] + MIWaveTileA: 4 + MIWaveTileB: 10 + MIWaveTileMXSA: 4 + MIWaveTileMXSB: 10 + MIWaveTileMetadata: 0 + MacroTile0: 128 + MacroTile1: 320 + MacroTileA: 128 + MacroTileB: 320 + MacroTileMXSA: 128 + MacroTileMXSB: 320 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + MinGRIncPerMfma: 3 + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonDTLTailLoopMXSA: true + NonDTLTailLoopMXSB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 4 + 
NonTemporalE: 0 + NonTemporalMXSA: 0 + NonTemporalMXSB: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 160 + NumGlobalWriteVectorsPerThread: 160 + NumLdsBlk: 2 + NumLoadsA: 4 + NumLoadsB: 10 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsCoalescedMXSA: 1 + NumLoadsCoalescedMXSB: 1 + NumLoadsMXSA: 1 + NumLoadsMXSB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 10 + NumLoadsPerpendicularMXSA: 1 + NumLoadsPerpendicularMXSB: 1 + NumThreads: 256 + NumTotalPackedLoadsA: -1 + NumTotalPackedLoadsB: -1 + NumTotalPackedLoadsMXSA: -1 + NumTotalPackedLoadsMXSB: -1 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGROverBarrier: 0 + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 22 + SolutionNameMin: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT128x320x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT4_10_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM16_WGMXCC2_WGMXCCGn1 + SourceSwap: false + SpaceFillingAlgo: [] + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 4 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + 
SubGroupB: 32 + SubGroupMXSA: 8 + SubGroupMXSB: 32 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: 0 + TailloopInNll: false + ThreadTile: [1, 1] + ThreadTile0: 16 + ThreadTile1: 10 + ThreadTileA: 16 + ThreadTileB: 10 + ThreadTileMXSA: 16 + ThreadTileMXSB: 10 + TransposeLDS: 1 + TransposeLDSMetadata: -1 + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMXSA: true + UnrollMajorLDSMXSB: true + Use64bShadowLimit: false + Use64bShadowLimitMX: false + UseCustomMainLoopSchedule: 0 + UseDirect32XEmulation: false + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: false + UseGeneralizedNLCOneA: false + UseGeneralizedNLCOneB: false + UseGeneralizedNLCOneMXSA: false + UseGeneralizedNLCOneMXSB: false + UseGeneralizedNLCOneMetadata: false + UseInstOffsetForGRO: 0 + UseMFMAF32XEmulation: false + UsePLRPack: 0 + UseSgprForGRO: -1 + UseSubtileImpl: true + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 1 + VectorWidthMXSA: 1 + VectorWidthMXSB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMXSA: 0 + WaveSeparateGlobalReadMXSB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 16 + WorkGroupMappingXCC: 2 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 256 + _DepthUA: 256 + _DepthUB: 256 + _DepthUMXSA: 8 + _DepthUMXSB: 8 + _DepthUMetadata: 256 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + enableLDSTrMXSA: false + enableLDSTrMXSB: false + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + tailLoopOptMXSA: false + tailLoopOptMXSB: false + - 
1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AdaptiveGemm: 0 + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1DivByMT1LowbitGT1: 0 + AssertFree1ElementMultiple: 1 + AssertKRingShiftTailWrapOnly: 0 + AssertSummationElementMultiple: 256 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BAddrInterleave: false + BaseName: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT329TK1O9xqAG2cUdkJ_6Jkg-kbMKNtFWKGnhC0gVHbf3c= + BufferLoad: true + BufferStore: 1 + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 256 + DirectToLds: 1 + DirectToLdsA: true + DirectToLdsB: true + DirectToLdsMXSA: true + DirectToLdsMXSB: true + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprMXSA: false + DirectToVgprMXSB: false + DirectToVgprSparseMetadata: false + DtlPlusLdsBuf: 0 + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: false + ExpertSchedulingMode: 0 + ExtraLatencyForLR: 0 + ExtraMiLatencyLeft: -1 + ForceDisableShadowInit: false + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 32 + GlobalReadVectorWidthB: 32 + GlobalReadVectorWidthMXSA: 10 + GlobalReadVectorWidthMXSB: 4 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + KRingShift: false + Kernel: true + KernelLanguage: Assembly + KernelNameMin: 
Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT320x128x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT10_4_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 256 + LSCB: 256 + LSCMXSA: 8 + LSCMXSB: 8 + LSPA: 32 + LSPB: 32 + LSPMXSA: 320 + LSPMXSB: 128 + LVCA: 8 + LVCB: 8 + LVCMXSA: 1 + LVCMXSB: 2 + LVPA: 1 + LVPB: 1 + LVPMXSA: 32 + LVPMXSB: 32 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsBlockSizePerPadMXSA: 0 + LdsBlockSizePerPadMXSB: 0 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 126464 + LdsInitCVgprs: false + LdsNumBytes: 126464 + LdsNumElementsAlignedA: 40960 + LdsNumElementsAlignedB: 16384 + LdsNumElementsAlignedMXSA: 2560 + LdsNumElementsAlignedMXSB: 1024 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 44544 + LdsOffsetB_Blk: 110080 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMXSA: 40960 + LdsOffsetMXSA_Blk: 106496 + LdsOffsetMXSB: 43520 + LdsOffsetMXSB_Blk: 109056 + LdsOffsetMetadata: 44544 + LdsOffsetMetadata_Blk: 110080 + LdsPadA: 0 + LdsPadB: 0 + LdsPadMXSA: 0 + LdsPadMXSB: 0 + LdsPadMetadata: 0 + LocalReadVectorWidth: -1 + LocalReadVectorWidthA: 32 + LocalReadVectorWidthB: 32 + LocalReadVectorWidthMXSA: 1 + LocalReadVectorWidthMXSB: 1 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LocalWriteUseSgprMXSA: true + LocalWriteUseSgprMXSB: true + LoopIters: 2 + LoopUnroll: 256 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + 
MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMXSA: 1 + MIInputPerThreadMXSB: 1 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [10, 4] + MIWaveTileA: 10 + MIWaveTileB: 4 + MIWaveTileMXSA: 10 + MIWaveTileMXSB: 4 + MIWaveTileMetadata: 0 + MacroTile0: 320 + MacroTile1: 128 + MacroTileA: 320 + MacroTileB: 128 + MacroTileMXSA: 320 + MacroTileMXSB: 128 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + MinGRIncPerMfma: 3 + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonDTLTailLoopMXSA: true + NonDTLTailLoopMXSB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 4 + NonTemporalE: 0 + NonTemporalMXSA: 0 + NonTemporalMXSB: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 160 + NumGlobalWriteVectorsPerThread: 160 + NumLdsBlk: 2 + NumLoadsA: 10 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsCoalescedMXSA: 1 + NumLoadsCoalescedMXSB: 1 + NumLoadsMXSA: 1 + NumLoadsMXSB: 1 + NumLoadsPerpendicularA: 10 + NumLoadsPerpendicularB: 4 + NumLoadsPerpendicularMXSA: 1 + NumLoadsPerpendicularMXSB: 1 + NumThreads: 256 + NumTotalPackedLoadsA: -1 + NumTotalPackedLoadsB: -1 + NumTotalPackedLoadsMXSA: -1 + NumTotalPackedLoadsMXSB: -1 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGROverBarrier: 0 + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 23 + 
SolutionNameMin: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT320x128x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT10_4_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM16_WGMXCC2_WGMXCCGn1 + SourceSwap: false + SpaceFillingAlgo: [] + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 4 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SubGroupMXSA: 8 + SubGroupMXSB: 32 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: 0 + TailloopInNll: false + ThreadTile: [1, 1] + ThreadTile0: 40 + ThreadTile1: 4 + ThreadTileA: 40 + ThreadTileB: 4 + ThreadTileMXSA: 40 + ThreadTileMXSB: 4 + TransposeLDS: 1 + TransposeLDSMetadata: -1 + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMXSA: true + UnrollMajorLDSMXSB: true + Use64bShadowLimit: false + Use64bShadowLimitMX: false + UseCustomMainLoopSchedule: 0 + UseDirect32XEmulation: false + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: false + UseGeneralizedNLCOneA: false + UseGeneralizedNLCOneB: false + UseGeneralizedNLCOneMXSA: false + UseGeneralizedNLCOneMXSB: false + UseGeneralizedNLCOneMetadata: false + UseInstOffsetForGRO: 0 + UseMFMAF32XEmulation: false + UsePLRPack: 0 + UseSgprForGRO: -1 + UseSubtileImpl: true + Valid: true + VectorStore: -1 + VectorWidthA: 1 + 
VectorWidthB: 1 + VectorWidthMXSA: 1 + VectorWidthMXSB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMXSA: 0 + WaveSeparateGlobalReadMXSB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 16 + WorkGroupMappingXCC: 2 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 256 + _DepthUA: 256 + _DepthUB: 256 + _DepthUMXSA: 8 + _DepthUMXSB: 8 + _DepthUMetadata: 256 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + enableLDSTrMXSA: false + enableLDSTrMXSB: false + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + tailLoopOptMXSA: false + tailLoopOptMXSB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AdaptiveGemm: 0 + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1DivByMT1LowbitGT1: 0 + AssertFree1ElementMultiple: 1 + AssertKRingShiftTailWrapOnly: 0 + AssertSummationElementMultiple: 256 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BAddrInterleave: false + BaseName: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT12Ko14x9Bm17skxIK2J-9eSMhxZf1WQTws5W6lZ4ij_RU= + BufferLoad: true + BufferStore: 1 + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 256 + DirectToLds: 1 + DirectToLdsA: true + DirectToLdsB: true + DirectToLdsMXSA: true + DirectToLdsMXSB: true + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprMXSA: false + DirectToVgprMXSB: false + DirectToVgprSparseMetadata: false + 
DtlPlusLdsBuf: 0 + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: false + ExpertSchedulingMode: 0 + ExtraLatencyForLR: 0 + ExtraMiLatencyLeft: -1 + ForceDisableShadowInit: false + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 32 + GlobalReadVectorWidthB: 32 + GlobalReadVectorWidthMXSA: 4 + GlobalReadVectorWidthMXSB: 12 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + KRingShift: false + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT128x384x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT4_12_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 256 + LSCB: 256 + LSCMXSA: 8 + LSCMXSB: 8 + LSPA: 32 + LSPB: 32 + LSPMXSA: 128 + LSPMXSB: 384 + LVCA: 8 + LVCB: 8 + LVCMXSA: 2 + LVCMXSB: 1 + LVPA: 1 + LVPB: 1 + LVPMXSA: 32 + LVPMXSB: 32 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsBlockSizePerPadMXSA: 0 + LdsBlockSizePerPadMXSB: 0 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 139264 + LdsInitCVgprs: false + LdsNumBytes: 139264 + 
LdsNumElementsAlignedA: 16384 + LdsNumElementsAlignedB: 49152 + LdsNumElementsAlignedMXSA: 1024 + LdsNumElementsAlignedMXSB: 3072 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 69632 + LdsOffsetB: 20480 + LdsOffsetB_Blk: 90112 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMXSA: 16384 + LdsOffsetMXSA_Blk: 86016 + LdsOffsetMXSB: 17408 + LdsOffsetMXSB_Blk: 87040 + LdsOffsetMetadata: 20480 + LdsOffsetMetadata_Blk: 90112 + LdsPadA: 0 + LdsPadB: 0 + LdsPadMXSA: 0 + LdsPadMXSB: 0 + LdsPadMetadata: 0 + LocalReadVectorWidth: -1 + LocalReadVectorWidthA: 32 + LocalReadVectorWidthB: 32 + LocalReadVectorWidthMXSA: 1 + LocalReadVectorWidthMXSB: 1 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LocalWriteUseSgprMXSA: true + LocalWriteUseSgprMXSB: true + LoopIters: 2 + LoopUnroll: 256 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMXSA: 1 + MIInputPerThreadMXSB: 1 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 12] + MIWaveTileA: 4 + MIWaveTileB: 12 + MIWaveTileMXSA: 4 + MIWaveTileMXSB: 12 + MIWaveTileMetadata: 0 + MacroTile0: 128 + MacroTile1: 384 + MacroTileA: 128 + MacroTileB: 384 + MacroTileMXSA: 128 + MacroTileMXSB: 384 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + MinGRIncPerMfma: 3 + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonDTLTailLoopMXSA: true + NonDTLTailLoopMXSB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 4 + 
NonTemporalE: 0 + NonTemporalMXSA: 0 + NonTemporalMXSB: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 192 + NumGlobalWriteVectorsPerThread: 192 + NumLdsBlk: 2 + NumLoadsA: 4 + NumLoadsB: 12 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsCoalescedMXSA: 1 + NumLoadsCoalescedMXSB: 1 + NumLoadsMXSA: 1 + NumLoadsMXSB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 12 + NumLoadsPerpendicularMXSA: 1 + NumLoadsPerpendicularMXSB: 1 + NumThreads: 256 + NumTotalPackedLoadsA: -1 + NumTotalPackedLoadsB: -1 + NumTotalPackedLoadsMXSA: -1 + NumTotalPackedLoadsMXSB: -1 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGROverBarrier: 0 + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 24 + SolutionNameMin: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT128x384x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT4_12_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM16_WGMXCC2_WGMXCCGn1 + SourceSwap: false + SpaceFillingAlgo: [] + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: true + StoreSyncOpt: 0 + StoreVectorWidth: 4 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + 
SubGroupB: 32 + SubGroupMXSA: 8 + SubGroupMXSB: 32 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: 0 + TailloopInNll: false + ThreadTile: [1, 1] + ThreadTile0: 16 + ThreadTile1: 12 + ThreadTileA: 16 + ThreadTileB: 12 + ThreadTileMXSA: 16 + ThreadTileMXSB: 12 + TransposeLDS: 1 + TransposeLDSMetadata: -1 + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMXSA: true + UnrollMajorLDSMXSB: true + Use64bShadowLimit: false + Use64bShadowLimitMX: false + UseCustomMainLoopSchedule: 0 + UseDirect32XEmulation: false + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: false + UseGeneralizedNLCOneA: false + UseGeneralizedNLCOneB: false + UseGeneralizedNLCOneMXSA: false + UseGeneralizedNLCOneMXSB: false + UseGeneralizedNLCOneMetadata: false + UseInstOffsetForGRO: 0 + UseMFMAF32XEmulation: false + UsePLRPack: 0 + UseSgprForGRO: -1 + UseSubtileImpl: true + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 1 + VectorWidthMXSA: 1 + VectorWidthMXSB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMXSA: 0 + WaveSeparateGlobalReadMXSB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 16 + WorkGroupMappingXCC: 2 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 256 + _DepthUA: 256 + _DepthUB: 256 + _DepthUMXSA: 8 + _DepthUMXSB: 8 + _DepthUMetadata: 256 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + enableLDSTrMXSA: false + enableLDSTrMXSB: false + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + tailLoopOptMXSA: false + tailLoopOptMXSB: false + - 
1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AdaptiveGemm: 0 + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1DivByMT1LowbitGT1: 0 + AssertFree1ElementMultiple: 1 + AssertKRingShiftTailWrapOnly: 0 + AssertSummationElementMultiple: 256 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BAddrInterleave: false + BaseName: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT38PfCBP4cNoa67MDHKbRKQKP0baWWAor2QL1_1-ioSnxM= + BufferLoad: true + BufferStore: 1 + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 256 + DirectToLds: 1 + DirectToLdsA: true + DirectToLdsB: true + DirectToLdsMXSA: true + DirectToLdsMXSB: true + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprMXSA: false + DirectToVgprMXSB: false + DirectToVgprSparseMetadata: false + DtlPlusLdsBuf: 0 + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: false + ExpertSchedulingMode: 0 + ExtraLatencyForLR: 0 + ExtraMiLatencyLeft: -1 + ForceDisableShadowInit: false + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 32 + GlobalReadVectorWidthB: 32 + GlobalReadVectorWidthMXSA: 12 + GlobalReadVectorWidthMXSB: 4 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + KRingShift: false + Kernel: true + KernelLanguage: Assembly + KernelNameMin: 
Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT384x128x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT12_4_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 256 + LSCB: 256 + LSCMXSA: 8 + LSCMXSB: 8 + LSPA: 32 + LSPB: 32 + LSPMXSA: 384 + LSPMXSB: 128 + LVCA: 8 + LVCB: 8 + LVCMXSA: 1 + LVCMXSB: 2 + LVPA: 1 + LVPB: 1 + LVPMXSA: 32 + LVPMXSB: 32 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsBlockSizePerPadMXSA: 0 + LdsBlockSizePerPadMXSB: 0 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 139264 + LdsInitCVgprs: false + LdsNumBytes: 139264 + LdsNumElementsAlignedA: 49152 + LdsNumElementsAlignedB: 16384 + LdsNumElementsAlignedMXSA: 3072 + LdsNumElementsAlignedMXSB: 1024 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 69632 + LdsOffsetB: 53248 + LdsOffsetB_Blk: 122880 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMXSA: 49152 + LdsOffsetMXSA_Blk: 118784 + LdsOffsetMXSB: 52224 + LdsOffsetMXSB_Blk: 121856 + LdsOffsetMetadata: 53248 + LdsOffsetMetadata_Blk: 122880 + LdsPadA: 0 + LdsPadB: 0 + LdsPadMXSA: 0 + LdsPadMXSB: 0 + LdsPadMetadata: 0 + LocalReadVectorWidth: -1 + LocalReadVectorWidthA: 32 + LocalReadVectorWidthB: 32 + LocalReadVectorWidthMXSA: 1 + LocalReadVectorWidthMXSB: 1 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LocalWriteUseSgprMXSA: true + LocalWriteUseSgprMXSB: true + LoopIters: 2 + LoopUnroll: 256 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + 
MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMXSA: 1 + MIInputPerThreadMXSB: 1 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [12, 4] + MIWaveTileA: 12 + MIWaveTileB: 4 + MIWaveTileMXSA: 12 + MIWaveTileMXSB: 4 + MIWaveTileMetadata: 0 + MacroTile0: 384 + MacroTile1: 128 + MacroTileA: 384 + MacroTileB: 128 + MacroTileMXSA: 384 + MacroTileMXSB: 128 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + MinGRIncPerMfma: 3 + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonDTLTailLoopMXSA: true + NonDTLTailLoopMXSB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 4 + NonTemporalE: 0 + NonTemporalMXSA: 0 + NonTemporalMXSB: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 192 + NumGlobalWriteVectorsPerThread: 192 + NumLdsBlk: 2 + NumLoadsA: 12 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsCoalescedMXSA: 1 + NumLoadsCoalescedMXSB: 1 + NumLoadsMXSA: 1 + NumLoadsMXSB: 1 + NumLoadsPerpendicularA: 12 + NumLoadsPerpendicularB: 4 + NumLoadsPerpendicularMXSA: 1 + NumLoadsPerpendicularMXSB: 1 + NumThreads: 256 + NumTotalPackedLoadsA: -1 + NumTotalPackedLoadsB: -1 + NumTotalPackedLoadsMXSA: -1 + NumTotalPackedLoadsMXSB: -1 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGROverBarrier: 0 + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 25 + 
SolutionNameMin: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT384x128x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT12_4_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM16_WGMXCC2_WGMXCCGn1 + SourceSwap: false + SpaceFillingAlgo: [] + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: true + StoreSyncOpt: 0 + StoreVectorWidth: 4 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SubGroupMXSA: 8 + SubGroupMXSB: 32 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: 0 + TailloopInNll: false + ThreadTile: [1, 1] + ThreadTile0: 48 + ThreadTile1: 4 + ThreadTileA: 48 + ThreadTileB: 4 + ThreadTileMXSA: 48 + ThreadTileMXSB: 4 + TransposeLDS: 1 + TransposeLDSMetadata: -1 + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMXSA: true + UnrollMajorLDSMXSB: true + Use64bShadowLimit: false + Use64bShadowLimitMX: false + UseCustomMainLoopSchedule: 0 + UseDirect32XEmulation: false + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: false + UseGeneralizedNLCOneA: false + UseGeneralizedNLCOneB: false + UseGeneralizedNLCOneMXSA: false + UseGeneralizedNLCOneMXSB: false + UseGeneralizedNLCOneMetadata: false + UseInstOffsetForGRO: 0 + UseMFMAF32XEmulation: false + UsePLRPack: 0 + UseSgprForGRO: -1 + UseSubtileImpl: true + Valid: true + VectorStore: -1 + VectorWidthA: 1 + 
VectorWidthB: 1 + VectorWidthMXSA: 1 + VectorWidthMXSB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMXSA: 0 + WaveSeparateGlobalReadMXSB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 16 + WorkGroupMappingXCC: 2 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 256 + _DepthUA: 256 + _DepthUB: 256 + _DepthUMXSA: 8 + _DepthUMXSB: 8 + _DepthUMetadata: 256 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + enableLDSTrMXSA: false + enableLDSTrMXSB: false + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + tailLoopOptMXSA: false + tailLoopOptMXSB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AdaptiveGemm: 0 + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1DivByMT1LowbitGT1: 0 + AssertFree1ElementMultiple: 1 + AssertKRingShiftTailWrapOnly: 0 + AssertSummationElementMultiple: 256 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BAddrInterleave: false + BaseName: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT32GHMtx7cclUEZq8HXp3EV83TsDigTCOy4HpQzcwTdNNQ= + BufferLoad: true + BufferStore: 1 + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 256 + DirectToLds: 1 + DirectToLdsA: true + DirectToLdsB: true + DirectToLdsMXSA: true + DirectToLdsMXSB: true + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprMXSA: false + DirectToVgprMXSB: false + DirectToVgprSparseMetadata: false + 
DtlPlusLdsBuf: 0 + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: false + ExpertSchedulingMode: 0 + ExtraLatencyForLR: 0 + ExtraMiLatencyLeft: -1 + ForceDisableShadowInit: false + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 32 + GlobalReadVectorWidthB: 32 + GlobalReadVectorWidthMXSA: 10 + GlobalReadVectorWidthMXSB: 4 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + KRingShift: false + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT320x256x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT10_8_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 256 + LSCB: 256 + LSCMXSA: 8 + LSCMXSB: 8 + LSPA: 32 + LSPB: 32 + LSPMXSA: 320 + LSPMXSB: 128 + LVCA: 8 + LVCB: 8 + LVCMXSA: 1 + LVCMXSB: 2 + LVPA: 1 + LVPB: 1 + LVPMXSA: 32 + LVPMXSB: 32 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsBlockSizePerPadMXSA: 0 + LdsBlockSizePerPadMXSB: 0 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 156672 + LdsInitCVgprs: false + LdsNumBytes: 156672 + 
LdsNumElementsAlignedA: 40960 + LdsNumElementsAlignedB: 32768 + LdsNumElementsAlignedMXSA: 2560 + LdsNumElementsAlignedMXSB: 2048 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 78336 + LdsOffsetB: 45568 + LdsOffsetB_Blk: 123904 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMXSA: 40960 + LdsOffsetMXSA_Blk: 119296 + LdsOffsetMXSB: 43520 + LdsOffsetMXSB_Blk: 121856 + LdsOffsetMetadata: 45568 + LdsOffsetMetadata_Blk: 123904 + LdsPadA: 0 + LdsPadB: 0 + LdsPadMXSA: 0 + LdsPadMXSB: 0 + LdsPadMetadata: 0 + LocalReadVectorWidth: -1 + LocalReadVectorWidthA: 32 + LocalReadVectorWidthB: 32 + LocalReadVectorWidthMXSA: 1 + LocalReadVectorWidthMXSB: 1 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LocalWriteUseSgprMXSA: true + LocalWriteUseSgprMXSB: true + LoopIters: 2 + LoopUnroll: 256 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMXSA: 1 + MIInputPerThreadMXSB: 1 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [10, 8] + MIWaveTileA: 10 + MIWaveTileB: 8 + MIWaveTileMXSA: 10 + MIWaveTileMXSB: 8 + MIWaveTileMetadata: 0 + MacroTile0: 320 + MacroTile1: 256 + MacroTileA: 320 + MacroTileB: 256 + MacroTileMXSA: 320 + MacroTileMXSB: 256 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + MinGRIncPerMfma: 3 + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonDTLTailLoopMXSA: true + NonDTLTailLoopMXSB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 4 + 
NonTemporalE: 0 + NonTemporalMXSA: 0 + NonTemporalMXSB: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 320 + NumGlobalWriteVectorsPerThread: 320 + NumLdsBlk: 2 + NumLoadsA: 10 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsCoalescedMXSA: 1 + NumLoadsCoalescedMXSB: 1 + NumLoadsMXSA: 1 + NumLoadsMXSB: 2 + NumLoadsPerpendicularA: 10 + NumLoadsPerpendicularB: 8 + NumLoadsPerpendicularMXSA: 1 + NumLoadsPerpendicularMXSB: 2 + NumThreads: 256 + NumTotalPackedLoadsA: -1 + NumTotalPackedLoadsB: -1 + NumTotalPackedLoadsMXSA: -1 + NumTotalPackedLoadsMXSB: -1 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGROverBarrier: 0 + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 26 + SolutionNameMin: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT320x256x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT10_8_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM16_WGMXCC2_WGMXCCGn1 + SourceSwap: false + SpaceFillingAlgo: [] + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: true + StoreSyncOpt: 0 + StoreVectorWidth: 4 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + 
SubGroupB: 32 + SubGroupMXSA: 8 + SubGroupMXSB: 32 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: 0 + TailloopInNll: false + ThreadTile: [1, 1] + ThreadTile0: 40 + ThreadTile1: 8 + ThreadTileA: 40 + ThreadTileB: 8 + ThreadTileMXSA: 40 + ThreadTileMXSB: 8 + TransposeLDS: 1 + TransposeLDSMetadata: -1 + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMXSA: true + UnrollMajorLDSMXSB: true + Use64bShadowLimit: false + Use64bShadowLimitMX: false + UseCustomMainLoopSchedule: 0 + UseDirect32XEmulation: false + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: false + UseGeneralizedNLCOneA: false + UseGeneralizedNLCOneB: false + UseGeneralizedNLCOneMXSA: false + UseGeneralizedNLCOneMXSB: false + UseGeneralizedNLCOneMetadata: false + UseInstOffsetForGRO: 0 + UseMFMAF32XEmulation: false + UsePLRPack: 0 + UseSgprForGRO: -1 + UseSubtileImpl: true + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 1 + VectorWidthMXSA: 1 + VectorWidthMXSB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMXSA: 0 + WaveSeparateGlobalReadMXSB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 16 + WorkGroupMappingXCC: 2 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 256 + _DepthUA: 256 + _DepthUB: 256 + _DepthUMXSA: 8 + _DepthUMXSB: 8 + _DepthUMetadata: 256 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + enableLDSTrMXSA: false + enableLDSTrMXSB: false + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + tailLoopOptMXSA: false + tailLoopOptMXSB: false + - 1LDSBuffer: 
0 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AdaptiveGemm: 0 + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1DivByMT1LowbitGT1: 0 + AssertFree1ElementMultiple: 1 + AssertKRingShiftTailWrapOnly: 0 + AssertSummationElementMultiple: 256 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BAddrInterleave: false + BaseName: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT25yENsNJL4B9Vb5GlLD7ofBX9eYUJtt3JL_VzKdaPOs5g= + BufferLoad: true + BufferStore: 1 + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 256 + DirectToLds: 1 + DirectToLdsA: true + DirectToLdsB: true + DirectToLdsMXSA: true + DirectToLdsMXSB: true + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprMXSA: false + DirectToVgprMXSB: false + DirectToVgprSparseMetadata: false + DtlPlusLdsBuf: 0 + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: false + ExpertSchedulingMode: 0 + ExtraLatencyForLR: 0 + ExtraMiLatencyLeft: -1 + ForceDisableShadowInit: false + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 32 + GlobalReadVectorWidthB: 32 + GlobalReadVectorWidthMXSA: 4 + GlobalReadVectorWidthMXSB: 10 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + KRingShift: false + Kernel: true + KernelLanguage: Assembly + KernelNameMin: 
Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT256x320x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT8_10_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 256 + LSCB: 256 + LSCMXSA: 8 + LSCMXSB: 8 + LSPA: 32 + LSPB: 32 + LSPMXSA: 128 + LSPMXSB: 320 + LVCA: 8 + LVCB: 8 + LVCMXSA: 2 + LVCMXSB: 1 + LVPA: 1 + LVPB: 1 + LVPMXSA: 32 + LVPMXSB: 32 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsBlockSizePerPadMXSA: 0 + LdsBlockSizePerPadMXSB: 0 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 156672 + LdsInitCVgprs: false + LdsNumBytes: 156672 + LdsNumElementsAlignedA: 32768 + LdsNumElementsAlignedB: 40960 + LdsNumElementsAlignedMXSA: 2048 + LdsNumElementsAlignedMXSB: 2560 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 78336 + LdsOffsetB: 37376 + LdsOffsetB_Blk: 115712 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMXSA: 32768 + LdsOffsetMXSA_Blk: 111104 + LdsOffsetMXSB: 34816 + LdsOffsetMXSB_Blk: 113152 + LdsOffsetMetadata: 37376 + LdsOffsetMetadata_Blk: 115712 + LdsPadA: 0 + LdsPadB: 0 + LdsPadMXSA: 0 + LdsPadMXSB: 0 + LdsPadMetadata: 0 + LocalReadVectorWidth: -1 + LocalReadVectorWidthA: 32 + LocalReadVectorWidthB: 32 + LocalReadVectorWidthMXSA: 1 + LocalReadVectorWidthMXSB: 1 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LocalWriteUseSgprMXSA: true + LocalWriteUseSgprMXSB: true + LoopIters: 2 + LoopUnroll: 256 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + 
MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMXSA: 1 + MIInputPerThreadMXSB: 1 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [8, 10] + MIWaveTileA: 8 + MIWaveTileB: 10 + MIWaveTileMXSA: 8 + MIWaveTileMXSB: 10 + MIWaveTileMetadata: 0 + MacroTile0: 256 + MacroTile1: 320 + MacroTileA: 256 + MacroTileB: 320 + MacroTileMXSA: 256 + MacroTileMXSB: 320 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + MinGRIncPerMfma: 3 + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonDTLTailLoopMXSA: true + NonDTLTailLoopMXSB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 4 + NonTemporalE: 0 + NonTemporalMXSA: 0 + NonTemporalMXSB: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 320 + NumGlobalWriteVectorsPerThread: 320 + NumLdsBlk: 2 + NumLoadsA: 8 + NumLoadsB: 10 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsCoalescedMXSA: 1 + NumLoadsCoalescedMXSB: 1 + NumLoadsMXSA: 2 + NumLoadsMXSB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 10 + NumLoadsPerpendicularMXSA: 2 + NumLoadsPerpendicularMXSB: 1 + NumThreads: 256 + NumTotalPackedLoadsA: -1 + NumTotalPackedLoadsB: -1 + NumTotalPackedLoadsMXSA: -1 + NumTotalPackedLoadsMXSB: -1 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGROverBarrier: 0 + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 27 + 
SolutionNameMin: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT256x320x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT8_10_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM16_WGMXCC2_WGMXCCGn1 + SourceSwap: false + SpaceFillingAlgo: [] + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: true + StoreSyncOpt: 0 + StoreVectorWidth: 4 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SubGroupMXSA: 8 + SubGroupMXSB: 32 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: 0 + TailloopInNll: false + ThreadTile: [1, 1] + ThreadTile0: 32 + ThreadTile1: 10 + ThreadTileA: 32 + ThreadTileB: 10 + ThreadTileMXSA: 32 + ThreadTileMXSB: 10 + TransposeLDS: 1 + TransposeLDSMetadata: -1 + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMXSA: true + UnrollMajorLDSMXSB: true + Use64bShadowLimit: false + Use64bShadowLimitMX: false + UseCustomMainLoopSchedule: 0 + UseDirect32XEmulation: false + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: false + UseGeneralizedNLCOneA: false + UseGeneralizedNLCOneB: false + UseGeneralizedNLCOneMXSA: false + UseGeneralizedNLCOneMXSB: false + UseGeneralizedNLCOneMetadata: false + UseInstOffsetForGRO: 0 + UseMFMAF32XEmulation: false + UsePLRPack: 0 + UseSgprForGRO: -1 + UseSubtileImpl: true + Valid: true + VectorStore: -1 + VectorWidthA: 1 
+ VectorWidthB: 1 + VectorWidthMXSA: 1 + VectorWidthMXSB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMXSA: 0 + WaveSeparateGlobalReadMXSB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 16 + WorkGroupMappingXCC: 2 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 256 + _DepthUA: 256 + _DepthUB: 256 + _DepthUMXSA: 8 + _DepthUMXSB: 8 + _DepthUMetadata: 256 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + enableLDSTrMXSA: false + enableLDSTrMXSB: false + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + tailLoopOptMXSA: false + tailLoopOptMXSB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AdaptiveGemm: 0 + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1DivByMT1LowbitGT1: 0 + AssertFree1ElementMultiple: 1 + AssertKRingShiftTailWrapOnly: 0 + AssertSummationElementMultiple: 256 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BAddrInterleave: false + BaseName: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT64IqbWxZkPcZWy3vA8g9HsDHfQ7n72vJptvmQ79eUQ7kk= + BufferLoad: true + BufferStore: 1 + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 256 + DirectToLds: 1 + DirectToLdsA: true + DirectToLdsB: true + DirectToLdsMXSA: true + DirectToLdsMXSB: true + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprMXSA: false + DirectToVgprMXSB: false + DirectToVgprSparseMetadata: false 
+ DtlPlusLdsBuf: 0 + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: false + ExpertSchedulingMode: 0 + ExtraLatencyForLR: 0 + ExtraMiLatencyLeft: -1 + ForceDisableShadowInit: false + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 32 + GlobalReadVectorWidthB: 32 + GlobalReadVectorWidthMXSA: 2 + GlobalReadVectorWidthMXSB: 14 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + KRingShift: false + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT64x448x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT2_14_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 256 + LSCB: 256 + LSCMXSA: 8 + LSCMXSB: 8 + LSPA: 32 + LSPB: 32 + LSPMXSA: 64 + LSPMXSB: 448 + LVCA: 8 + LVCB: 8 + LVCMXSA: 4 + LVCMXSB: 1 + LVPA: 1 + LVPB: 1 + LVPMXSA: 32 + LVPMXSB: 32 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsBlockSizePerPadMXSA: 0 + LdsBlockSizePerPadMXSB: 0 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 139264 + LdsInitCVgprs: false + LdsNumBytes: 139264 + 
LdsNumElementsAlignedA: 8192 + LdsNumElementsAlignedB: 57344 + LdsNumElementsAlignedMXSA: 512 + LdsNumElementsAlignedMXSB: 3584 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 69632 + LdsOffsetB: 12288 + LdsOffsetB_Blk: 81920 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMXSA: 8192 + LdsOffsetMXSA_Blk: 77824 + LdsOffsetMXSB: 8704 + LdsOffsetMXSB_Blk: 78336 + LdsOffsetMetadata: 12288 + LdsOffsetMetadata_Blk: 81920 + LdsPadA: 0 + LdsPadB: 0 + LdsPadMXSA: 0 + LdsPadMXSB: 0 + LdsPadMetadata: 0 + LocalReadVectorWidth: -1 + LocalReadVectorWidthA: 32 + LocalReadVectorWidthB: 32 + LocalReadVectorWidthMXSA: 1 + LocalReadVectorWidthMXSB: 1 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LocalWriteUseSgprMXSA: true + LocalWriteUseSgprMXSB: true + LoopIters: 2 + LoopUnroll: 256 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMXSA: 1 + MIInputPerThreadMXSB: 1 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [2, 14] + MIWaveTileA: 2 + MIWaveTileB: 14 + MIWaveTileMXSA: 2 + MIWaveTileMXSB: 14 + MIWaveTileMetadata: 0 + MacroTile0: 64 + MacroTile1: 448 + MacroTileA: 64 + MacroTileB: 448 + MacroTileMXSA: 64 + MacroTileMXSB: 448 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + MinGRIncPerMfma: 3 + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonDTLTailLoopMXSA: true + NonDTLTailLoopMXSB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 4 + NonTemporalE: 0 + 
NonTemporalMXSA: 0 + NonTemporalMXSB: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 112 + NumGlobalWriteVectorsPerThread: 112 + NumLdsBlk: 2 + NumLoadsA: 2 + NumLoadsB: 14 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsCoalescedMXSA: 1 + NumLoadsCoalescedMXSB: 1 + NumLoadsMXSA: 1 + NumLoadsMXSB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 14 + NumLoadsPerpendicularMXSA: 1 + NumLoadsPerpendicularMXSB: 1 + NumThreads: 256 + NumTotalPackedLoadsA: -1 + NumTotalPackedLoadsB: -1 + NumTotalPackedLoadsMXSA: -1 + NumTotalPackedLoadsMXSB: -1 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGROverBarrier: 0 + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 28 + SolutionNameMin: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT64x448x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT2_14_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM16_WGMXCC2_WGMXCCGn1 + SourceSwap: false + SpaceFillingAlgo: [] + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: true + StoreSyncOpt: 0 + StoreVectorWidth: 4 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + 
SubGroupMXSA: 8 + SubGroupMXSB: 32 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: 0 + TailloopInNll: false + ThreadTile: [1, 1] + ThreadTile0: 8 + ThreadTile1: 14 + ThreadTileA: 8 + ThreadTileB: 14 + ThreadTileMXSA: 8 + ThreadTileMXSB: 14 + TransposeLDS: 1 + TransposeLDSMetadata: -1 + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMXSA: true + UnrollMajorLDSMXSB: true + Use64bShadowLimit: false + Use64bShadowLimitMX: false + UseCustomMainLoopSchedule: 0 + UseDirect32XEmulation: false + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: false + UseGeneralizedNLCOneA: false + UseGeneralizedNLCOneB: false + UseGeneralizedNLCOneMXSA: false + UseGeneralizedNLCOneMXSB: false + UseGeneralizedNLCOneMetadata: false + UseInstOffsetForGRO: 0 + UseMFMAF32XEmulation: false + UsePLRPack: 0 + UseSgprForGRO: -1 + UseSubtileImpl: true + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 1 + VectorWidthMXSA: 1 + VectorWidthMXSB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMXSA: 0 + WaveSeparateGlobalReadMXSB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 16 + WorkGroupMappingXCC: 2 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 256 + _DepthUA: 256 + _DepthUB: 256 + _DepthUMXSA: 8 + _DepthUMXSB: 8 + _DepthUMetadata: 256 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + enableLDSTrMXSA: false + enableLDSTrMXSB: false + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + tailLoopOptMXSA: false + tailLoopOptMXSB: false + - 1LDSBuffer: 0 + 
ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AdaptiveGemm: 0 + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1DivByMT1LowbitGT1: 0 + AssertFree1ElementMultiple: 1 + AssertKRingShiftTailWrapOnly: 0 + AssertSummationElementMultiple: 256 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BAddrInterleave: false + BaseName: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT44JIOkXLbpjhS22SFIEupmgY9bhtzcXu69kfhwBJBO5wo= + BufferLoad: true + BufferStore: 1 + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 256 + DirectToLds: 1 + DirectToLdsA: true + DirectToLdsB: true + DirectToLdsMXSA: true + DirectToLdsMXSB: true + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprMXSA: false + DirectToVgprMXSB: false + DirectToVgprSparseMetadata: false + DtlPlusLdsBuf: 0 + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: false + ExpertSchedulingMode: 0 + ExtraLatencyForLR: 0 + ExtraMiLatencyLeft: -1 + ForceDisableShadowInit: false + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 32 + GlobalReadVectorWidthB: 32 + GlobalReadVectorWidthMXSA: 14 + GlobalReadVectorWidthMXSB: 2 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + KRingShift: false + Kernel: true + KernelLanguage: Assembly + KernelNameMin: 
Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT448x64x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT14_2_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 256 + LSCB: 256 + LSCMXSA: 8 + LSCMXSB: 8 + LSPA: 32 + LSPB: 32 + LSPMXSA: 448 + LSPMXSB: 64 + LVCA: 8 + LVCB: 8 + LVCMXSA: 1 + LVCMXSB: 4 + LVPA: 1 + LVPB: 1 + LVPMXSA: 32 + LVPMXSB: 32 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsBlockSizePerPadMXSA: 0 + LdsBlockSizePerPadMXSB: 0 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 139264 + LdsInitCVgprs: false + LdsNumBytes: 139264 + LdsNumElementsAlignedA: 57344 + LdsNumElementsAlignedB: 8192 + LdsNumElementsAlignedMXSA: 3584 + LdsNumElementsAlignedMXSB: 512 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 69632 + LdsOffsetB: 61440 + LdsOffsetB_Blk: 131072 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMXSA: 57344 + LdsOffsetMXSA_Blk: 126976 + LdsOffsetMXSB: 60928 + LdsOffsetMXSB_Blk: 130560 + LdsOffsetMetadata: 61440 + LdsOffsetMetadata_Blk: 131072 + LdsPadA: 0 + LdsPadB: 0 + LdsPadMXSA: 0 + LdsPadMXSB: 0 + LdsPadMetadata: 0 + LocalReadVectorWidth: -1 + LocalReadVectorWidthA: 32 + LocalReadVectorWidthB: 32 + LocalReadVectorWidthMXSA: 1 + LocalReadVectorWidthMXSB: 1 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LocalWriteUseSgprMXSA: true + LocalWriteUseSgprMXSB: true + LoopIters: 2 + LoopUnroll: 256 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + 
MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMXSA: 1 + MIInputPerThreadMXSB: 1 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [14, 2] + MIWaveTileA: 14 + MIWaveTileB: 2 + MIWaveTileMXSA: 14 + MIWaveTileMXSB: 2 + MIWaveTileMetadata: 0 + MacroTile0: 448 + MacroTile1: 64 + MacroTileA: 448 + MacroTileB: 64 + MacroTileMXSA: 448 + MacroTileMXSB: 64 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + MinGRIncPerMfma: 3 + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonDTLTailLoopMXSA: true + NonDTLTailLoopMXSB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 4 + NonTemporalE: 0 + NonTemporalMXSA: 0 + NonTemporalMXSB: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 112 + NumGlobalWriteVectorsPerThread: 112 + NumLdsBlk: 2 + NumLoadsA: 14 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsCoalescedMXSA: 1 + NumLoadsCoalescedMXSB: 1 + NumLoadsMXSA: 1 + NumLoadsMXSB: 1 + NumLoadsPerpendicularA: 14 + NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularMXSA: 1 + NumLoadsPerpendicularMXSB: 1 + NumThreads: 256 + NumTotalPackedLoadsA: -1 + NumTotalPackedLoadsB: -1 + NumTotalPackedLoadsMXSA: -1 + NumTotalPackedLoadsMXSB: -1 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGROverBarrier: 0 + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 29 + 
SolutionNameMin: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT448x64x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT14_2_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM16_WGMXCC2_WGMXCCGn1 + SourceSwap: false + SpaceFillingAlgo: [] + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: true + StoreSyncOpt: 0 + StoreVectorWidth: 4 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SubGroupMXSA: 8 + SubGroupMXSB: 32 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: 0 + TailloopInNll: false + ThreadTile: [1, 1] + ThreadTile0: 56 + ThreadTile1: 2 + ThreadTileA: 56 + ThreadTileB: 2 + ThreadTileMXSA: 56 + ThreadTileMXSB: 2 + TransposeLDS: 1 + TransposeLDSMetadata: -1 + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMXSA: true + UnrollMajorLDSMXSB: true + Use64bShadowLimit: false + Use64bShadowLimitMX: false + UseCustomMainLoopSchedule: 0 + UseDirect32XEmulation: false + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: false + UseGeneralizedNLCOneA: false + UseGeneralizedNLCOneB: false + UseGeneralizedNLCOneMXSA: false + UseGeneralizedNLCOneMXSB: false + UseGeneralizedNLCOneMetadata: false + UseInstOffsetForGRO: 0 + UseMFMAF32XEmulation: false + UsePLRPack: 0 + UseSgprForGRO: -1 + UseSubtileImpl: true + Valid: true + VectorStore: -1 + VectorWidthA: 1 + 
VectorWidthB: 1 + VectorWidthMXSA: 1 + VectorWidthMXSB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMXSA: 0 + WaveSeparateGlobalReadMXSB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 16 + WorkGroupMappingXCC: 2 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 256 + _DepthUA: 256 + _DepthUB: 256 + _DepthUMXSA: 8 + _DepthUMXSB: 8 + _DepthUMetadata: 256 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + enableLDSTrMXSA: false + enableLDSTrMXSB: false + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + tailLoopOptMXSA: false + tailLoopOptMXSB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AdaptiveGemm: 0 + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1DivByMT1LowbitGT1: 0 + AssertFree1ElementMultiple: 1 + AssertKRingShiftTailWrapOnly: 0 + AssertSummationElementMultiple: 256 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BAddrInterleave: false + BaseName: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT12NCUb78G7QzN6ng2b3-8rphTRtJXo20S1KvmwH6k1FLQ= + BufferLoad: true + BufferStore: 1 + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 256 + DirectToLds: 1 + DirectToLdsA: true + DirectToLdsB: true + DirectToLdsMXSA: true + DirectToLdsMXSB: true + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprMXSA: false + DirectToVgprMXSB: false + DirectToVgprSparseMetadata: false + 
DtlPlusLdsBuf: 0 + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: false + ExpertSchedulingMode: 0 + ExtraLatencyForLR: 0 + ExtraMiLatencyLeft: -1 + ForceDisableShadowInit: false + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 32 + GlobalReadVectorWidthB: 32 + GlobalReadVectorWidthMXSA: 4 + GlobalReadVectorWidthMXSB: 14 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + KRingShift: false + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT128x448x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT4_14_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 256 + LSCB: 256 + LSCMXSA: 8 + LSCMXSB: 8 + LSPA: 32 + LSPB: 32 + LSPMXSA: 128 + LSPMXSB: 448 + LVCA: 8 + LVCB: 8 + LVCMXSA: 2 + LVCMXSB: 1 + LVPA: 1 + LVPB: 1 + LVPMXSA: 32 + LVPMXSB: 32 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsBlockSizePerPadMXSA: 0 + LdsBlockSizePerPadMXSB: 0 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 156672 + LdsInitCVgprs: false + LdsNumBytes: 156672 + 
LdsNumElementsAlignedA: 16384 + LdsNumElementsAlignedB: 57344 + LdsNumElementsAlignedMXSA: 1024 + LdsNumElementsAlignedMXSB: 3584 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 78336 + LdsOffsetB: 20992 + LdsOffsetB_Blk: 99328 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMXSA: 16384 + LdsOffsetMXSA_Blk: 94720 + LdsOffsetMXSB: 17408 + LdsOffsetMXSB_Blk: 95744 + LdsOffsetMetadata: 20992 + LdsOffsetMetadata_Blk: 99328 + LdsPadA: 0 + LdsPadB: 0 + LdsPadMXSA: 0 + LdsPadMXSB: 0 + LdsPadMetadata: 0 + LocalReadVectorWidth: -1 + LocalReadVectorWidthA: 32 + LocalReadVectorWidthB: 32 + LocalReadVectorWidthMXSA: 1 + LocalReadVectorWidthMXSB: 1 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LocalWriteUseSgprMXSA: true + LocalWriteUseSgprMXSB: true + LoopIters: 2 + LoopUnroll: 256 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMXSA: 1 + MIInputPerThreadMXSB: 1 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 14] + MIWaveTileA: 4 + MIWaveTileB: 14 + MIWaveTileMXSA: 4 + MIWaveTileMXSB: 14 + MIWaveTileMetadata: 0 + MacroTile0: 128 + MacroTile1: 448 + MacroTileA: 128 + MacroTileB: 448 + MacroTileMXSA: 128 + MacroTileMXSB: 448 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + MinGRIncPerMfma: 3 + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonDTLTailLoopMXSA: true + NonDTLTailLoopMXSB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 4 + 
NonTemporalE: 0 + NonTemporalMXSA: 0 + NonTemporalMXSB: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 224 + NumGlobalWriteVectorsPerThread: 224 + NumLdsBlk: 2 + NumLoadsA: 4 + NumLoadsB: 14 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsCoalescedMXSA: 1 + NumLoadsCoalescedMXSB: 1 + NumLoadsMXSA: 1 + NumLoadsMXSB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 14 + NumLoadsPerpendicularMXSA: 1 + NumLoadsPerpendicularMXSB: 1 + NumThreads: 256 + NumTotalPackedLoadsA: -1 + NumTotalPackedLoadsB: -1 + NumTotalPackedLoadsMXSA: -1 + NumTotalPackedLoadsMXSB: -1 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGROverBarrier: 0 + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 30 + SolutionNameMin: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT128x448x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT4_14_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM16_WGMXCC2_WGMXCCGn1 + SourceSwap: false + SpaceFillingAlgo: [] + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: true + StoreSyncOpt: 0 + StoreVectorWidth: 4 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + 
SubGroupB: 32 + SubGroupMXSA: 8 + SubGroupMXSB: 32 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: 0 + TailloopInNll: false + ThreadTile: [1, 1] + ThreadTile0: 16 + ThreadTile1: 14 + ThreadTileA: 16 + ThreadTileB: 14 + ThreadTileMXSA: 16 + ThreadTileMXSB: 14 + TransposeLDS: 1 + TransposeLDSMetadata: -1 + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMXSA: true + UnrollMajorLDSMXSB: true + Use64bShadowLimit: false + Use64bShadowLimitMX: false + UseCustomMainLoopSchedule: 0 + UseDirect32XEmulation: false + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: false + UseGeneralizedNLCOneA: false + UseGeneralizedNLCOneB: false + UseGeneralizedNLCOneMXSA: false + UseGeneralizedNLCOneMXSB: false + UseGeneralizedNLCOneMetadata: false + UseInstOffsetForGRO: 0 + UseMFMAF32XEmulation: false + UsePLRPack: 0 + UseSgprForGRO: -1 + UseSubtileImpl: true + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 1 + VectorWidthMXSA: 1 + VectorWidthMXSB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMXSA: 0 + WaveSeparateGlobalReadMXSB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 16 + WorkGroupMappingXCC: 2 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 256 + _DepthUA: 256 + _DepthUB: 256 + _DepthUMXSA: 8 + _DepthUMXSB: 8 + _DepthUMetadata: 256 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + enableLDSTrMXSA: false + enableLDSTrMXSB: false + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + tailLoopOptMXSA: false + tailLoopOptMXSB: false + - 
1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AdaptiveGemm: 0 + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1DivByMT1LowbitGT1: 0 + AssertFree1ElementMultiple: 1 + AssertKRingShiftTailWrapOnly: 0 + AssertSummationElementMultiple: 256 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BAddrInterleave: false + BaseName: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT44K2rcrzW2gkNghqcoHIxk3lU0xSXGNO6q0t-lzqdm54M= + BufferLoad: true + BufferStore: 1 + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 256 + DirectToLds: 1 + DirectToLdsA: true + DirectToLdsB: true + DirectToLdsMXSA: true + DirectToLdsMXSB: true + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprMXSA: false + DirectToVgprMXSB: false + DirectToVgprSparseMetadata: false + DtlPlusLdsBuf: 0 + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: false + ExpertSchedulingMode: 0 + ExtraLatencyForLR: 0 + ExtraMiLatencyLeft: -1 + ForceDisableShadowInit: false + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 32 + GlobalReadVectorWidthB: 32 + GlobalReadVectorWidthMXSA: 14 + GlobalReadVectorWidthMXSB: 4 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + KRingShift: false + Kernel: true + KernelLanguage: Assembly + KernelNameMin: 
Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT448x128x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT14_4_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 256 + LSCB: 256 + LSCMXSA: 8 + LSCMXSB: 8 + LSPA: 32 + LSPB: 32 + LSPMXSA: 448 + LSPMXSB: 128 + LVCA: 8 + LVCB: 8 + LVCMXSA: 1 + LVCMXSB: 2 + LVPA: 1 + LVPB: 1 + LVPMXSA: 32 + LVPMXSB: 32 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsBlockSizePerPadMXSA: 0 + LdsBlockSizePerPadMXSB: 0 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 156672 + LdsInitCVgprs: false + LdsNumBytes: 156672 + LdsNumElementsAlignedA: 57344 + LdsNumElementsAlignedB: 16384 + LdsNumElementsAlignedMXSA: 3584 + LdsNumElementsAlignedMXSB: 1024 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 78336 + LdsOffsetB: 61952 + LdsOffsetB_Blk: 140288 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMXSA: 57344 + LdsOffsetMXSA_Blk: 135680 + LdsOffsetMXSB: 60928 + LdsOffsetMXSB_Blk: 139264 + LdsOffsetMetadata: 61952 + LdsOffsetMetadata_Blk: 140288 + LdsPadA: 0 + LdsPadB: 0 + LdsPadMXSA: 0 + LdsPadMXSB: 0 + LdsPadMetadata: 0 + LocalReadVectorWidth: -1 + LocalReadVectorWidthA: 32 + LocalReadVectorWidthB: 32 + LocalReadVectorWidthMXSA: 1 + LocalReadVectorWidthMXSB: 1 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LocalWriteUseSgprMXSA: true + LocalWriteUseSgprMXSB: true + LoopIters: 2 + LoopUnroll: 256 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + 
MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMXSA: 1 + MIInputPerThreadMXSB: 1 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [14, 4] + MIWaveTileA: 14 + MIWaveTileB: 4 + MIWaveTileMXSA: 14 + MIWaveTileMXSB: 4 + MIWaveTileMetadata: 0 + MacroTile0: 448 + MacroTile1: 128 + MacroTileA: 448 + MacroTileB: 128 + MacroTileMXSA: 448 + MacroTileMXSB: 128 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + MinGRIncPerMfma: 3 + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonDTLTailLoopMXSA: true + NonDTLTailLoopMXSB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 4 + NonTemporalE: 0 + NonTemporalMXSA: 0 + NonTemporalMXSB: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 224 + NumGlobalWriteVectorsPerThread: 224 + NumLdsBlk: 2 + NumLoadsA: 14 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsCoalescedMXSA: 1 + NumLoadsCoalescedMXSB: 1 + NumLoadsMXSA: 1 + NumLoadsMXSB: 1 + NumLoadsPerpendicularA: 14 + NumLoadsPerpendicularB: 4 + NumLoadsPerpendicularMXSA: 1 + NumLoadsPerpendicularMXSB: 1 + NumThreads: 256 + NumTotalPackedLoadsA: -1 + NumTotalPackedLoadsB: -1 + NumTotalPackedLoadsMXSA: -1 + NumTotalPackedLoadsMXSB: -1 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGROverBarrier: 0 + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 31 + 
SolutionNameMin: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT448x128x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT14_4_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM16_WGMXCC2_WGMXCCGn1 + SourceSwap: false + SpaceFillingAlgo: [] + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: true + StoreSyncOpt: 0 + StoreVectorWidth: 4 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SubGroupMXSA: 8 + SubGroupMXSB: 32 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: 0 + TailloopInNll: false + ThreadTile: [1, 1] + ThreadTile0: 56 + ThreadTile1: 4 + ThreadTileA: 56 + ThreadTileB: 4 + ThreadTileMXSA: 56 + ThreadTileMXSB: 4 + TransposeLDS: 1 + TransposeLDSMetadata: -1 + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMXSA: true + UnrollMajorLDSMXSB: true + Use64bShadowLimit: false + Use64bShadowLimitMX: false + UseCustomMainLoopSchedule: 0 + UseDirect32XEmulation: false + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: false + UseGeneralizedNLCOneA: false + UseGeneralizedNLCOneB: false + UseGeneralizedNLCOneMXSA: false + UseGeneralizedNLCOneMXSB: false + UseGeneralizedNLCOneMetadata: false + UseInstOffsetForGRO: 0 + UseMFMAF32XEmulation: false + UsePLRPack: 0 + UseSgprForGRO: -1 + UseSubtileImpl: true + Valid: true + VectorStore: -1 + VectorWidthA: 1 + 
VectorWidthB: 1 + VectorWidthMXSA: 1 + VectorWidthMXSB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMXSA: 0 + WaveSeparateGlobalReadMXSB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 16 + WorkGroupMappingXCC: 2 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 256 + _DepthUA: 256 + _DepthUB: 256 + _DepthUMXSA: 8 + _DepthUMXSB: 8 + _DepthUMetadata: 256 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + enableLDSTrMXSA: false + enableLDSTrMXSB: false + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + tailLoopOptMXSA: false + tailLoopOptMXSB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AdaptiveGemm: 0 + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1DivByMT1LowbitGT1: 0 + AssertFree1ElementMultiple: 1 + AssertKRingShiftTailWrapOnly: 0 + AssertSummationElementMultiple: 256 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BAddrInterleave: false + BaseName: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT197rqXtfRTrgckooWIHV9QRZkAUvBYDajyR7fyXPSaOfI= + BufferLoad: true + BufferStore: 1 + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 256 + DirectToLds: 1 + DirectToLdsA: true + DirectToLdsB: true + DirectToLdsMXSA: true + DirectToLdsMXSB: true + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprMXSA: false + DirectToVgprMXSB: false + DirectToVgprSparseMetadata: false + 
DtlPlusLdsBuf: 0 + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: false + ExpertSchedulingMode: 0 + ExtraLatencyForLR: 0 + ExtraMiLatencyLeft: -1 + ForceDisableShadowInit: false + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 32 + GlobalReadVectorWidthB: 32 + GlobalReadVectorWidthMXSA: 6 + GlobalReadVectorWidthMXSB: 12 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + KRingShift: false + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT192x384x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT6_12_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 256 + LSCB: 256 + LSCMXSA: 8 + LSCMXSB: 8 + LSPA: 32 + LSPB: 32 + LSPMXSA: 192 + LSPMXSB: 384 + LVCA: 8 + LVCB: 8 + LVCMXSA: 2 + LVCMXSB: 1 + LVPA: 1 + LVPB: 1 + LVPMXSA: 32 + LVPMXSB: 32 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsBlockSizePerPadMXSA: 0 + LdsBlockSizePerPadMXSB: 0 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 156672 + LdsInitCVgprs: false + LdsNumBytes: 156672 + 
LdsNumElementsAlignedA: 24576 + LdsNumElementsAlignedB: 49152 + LdsNumElementsAlignedMXSA: 1536 + LdsNumElementsAlignedMXSB: 3072 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 78336 + LdsOffsetB: 29184 + LdsOffsetB_Blk: 107520 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMXSA: 24576 + LdsOffsetMXSA_Blk: 102912 + LdsOffsetMXSB: 26112 + LdsOffsetMXSB_Blk: 104448 + LdsOffsetMetadata: 29184 + LdsOffsetMetadata_Blk: 107520 + LdsPadA: 0 + LdsPadB: 0 + LdsPadMXSA: 0 + LdsPadMXSB: 0 + LdsPadMetadata: 0 + LocalReadVectorWidth: -1 + LocalReadVectorWidthA: 32 + LocalReadVectorWidthB: 32 + LocalReadVectorWidthMXSA: 1 + LocalReadVectorWidthMXSB: 1 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LocalWriteUseSgprMXSA: true + LocalWriteUseSgprMXSB: true + LoopIters: 2 + LoopUnroll: 256 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMXSA: 1 + MIInputPerThreadMXSB: 1 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [6, 12] + MIWaveTileA: 6 + MIWaveTileB: 12 + MIWaveTileMXSA: 6 + MIWaveTileMXSB: 12 + MIWaveTileMetadata: 0 + MacroTile0: 192 + MacroTile1: 384 + MacroTileA: 192 + MacroTileB: 384 + MacroTileMXSA: 192 + MacroTileMXSB: 384 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + MinGRIncPerMfma: 3 + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonDTLTailLoopMXSA: true + NonDTLTailLoopMXSB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 4 + 
NonTemporalE: 0 + NonTemporalMXSA: 0 + NonTemporalMXSB: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 288 + NumGlobalWriteVectorsPerThread: 288 + NumLdsBlk: 2 + NumLoadsA: 6 + NumLoadsB: 12 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsCoalescedMXSA: 1 + NumLoadsCoalescedMXSB: 1 + NumLoadsMXSA: 1 + NumLoadsMXSB: 1 + NumLoadsPerpendicularA: 6 + NumLoadsPerpendicularB: 12 + NumLoadsPerpendicularMXSA: 1 + NumLoadsPerpendicularMXSB: 1 + NumThreads: 256 + NumTotalPackedLoadsA: -1 + NumTotalPackedLoadsB: -1 + NumTotalPackedLoadsMXSA: -1 + NumTotalPackedLoadsMXSB: -1 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGROverBarrier: 0 + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 32 + SolutionNameMin: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT192x384x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT6_12_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM16_WGMXCC2_WGMXCCGn1 + SourceSwap: false + SpaceFillingAlgo: [] + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: true + StoreSyncOpt: 0 + StoreVectorWidth: 4 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + 
SubGroupB: 32 + SubGroupMXSA: 8 + SubGroupMXSB: 32 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: 0 + TailloopInNll: false + ThreadTile: [1, 1] + ThreadTile0: 24 + ThreadTile1: 12 + ThreadTileA: 24 + ThreadTileB: 12 + ThreadTileMXSA: 24 + ThreadTileMXSB: 12 + TransposeLDS: 1 + TransposeLDSMetadata: -1 + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMXSA: true + UnrollMajorLDSMXSB: true + Use64bShadowLimit: false + Use64bShadowLimitMX: false + UseCustomMainLoopSchedule: 0 + UseDirect32XEmulation: false + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: false + UseGeneralizedNLCOneA: false + UseGeneralizedNLCOneB: false + UseGeneralizedNLCOneMXSA: false + UseGeneralizedNLCOneMXSB: false + UseGeneralizedNLCOneMetadata: false + UseInstOffsetForGRO: 0 + UseMFMAF32XEmulation: false + UsePLRPack: 0 + UseSgprForGRO: -1 + UseSubtileImpl: true + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 1 + VectorWidthMXSA: 1 + VectorWidthMXSB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMXSA: 0 + WaveSeparateGlobalReadMXSB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 16 + WorkGroupMappingXCC: 2 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 256 + _DepthUA: 256 + _DepthUB: 256 + _DepthUMXSA: 8 + _DepthUMXSB: 8 + _DepthUMetadata: 256 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + enableLDSTrMXSA: false + enableLDSTrMXSB: false + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + tailLoopOptMXSA: false + tailLoopOptMXSB: false + - 
1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AdaptiveGemm: 0 + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1DivByMT1LowbitGT1: 0 + AssertFree1ElementMultiple: 1 + AssertKRingShiftTailWrapOnly: 0 + AssertSummationElementMultiple: 256 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BAddrInterleave: false + BaseName: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT38M4Hg58ewW76BBGjsIKCCfd9HK3CXsd22WqnddQrMdIo= + BufferLoad: true + BufferStore: 1 + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 256 + DirectToLds: 1 + DirectToLdsA: true + DirectToLdsB: true + DirectToLdsMXSA: true + DirectToLdsMXSB: true + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprMXSA: false + DirectToVgprMXSB: false + DirectToVgprSparseMetadata: false + DtlPlusLdsBuf: 0 + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: false + ExpertSchedulingMode: 0 + ExtraLatencyForLR: 0 + ExtraMiLatencyLeft: -1 + ForceDisableShadowInit: false + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 32 + GlobalReadVectorWidthB: 32 + GlobalReadVectorWidthMXSA: 12 + GlobalReadVectorWidthMXSB: 6 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + KRingShift: false + Kernel: true + KernelLanguage: Assembly + KernelNameMin: 
Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT384x192x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT12_6_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 256 + LSCB: 256 + LSCMXSA: 8 + LSCMXSB: 8 + LSPA: 32 + LSPB: 32 + LSPMXSA: 384 + LSPMXSB: 192 + LVCA: 8 + LVCB: 8 + LVCMXSA: 1 + LVCMXSB: 2 + LVPA: 1 + LVPB: 1 + LVPMXSA: 32 + LVPMXSB: 32 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsBlockSizePerPadMXSA: 0 + LdsBlockSizePerPadMXSB: 0 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 156672 + LdsInitCVgprs: false + LdsNumBytes: 156672 + LdsNumElementsAlignedA: 49152 + LdsNumElementsAlignedB: 24576 + LdsNumElementsAlignedMXSA: 3072 + LdsNumElementsAlignedMXSB: 1536 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 78336 + LdsOffsetB: 53760 + LdsOffsetB_Blk: 132096 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMXSA: 49152 + LdsOffsetMXSA_Blk: 127488 + LdsOffsetMXSB: 52224 + LdsOffsetMXSB_Blk: 130560 + LdsOffsetMetadata: 53760 + LdsOffsetMetadata_Blk: 132096 + LdsPadA: 0 + LdsPadB: 0 + LdsPadMXSA: 0 + LdsPadMXSB: 0 + LdsPadMetadata: 0 + LocalReadVectorWidth: -1 + LocalReadVectorWidthA: 32 + LocalReadVectorWidthB: 32 + LocalReadVectorWidthMXSA: 1 + LocalReadVectorWidthMXSB: 1 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LocalWriteUseSgprMXSA: true + LocalWriteUseSgprMXSB: true + LoopIters: 2 + LoopUnroll: 256 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + 
MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMXSA: 1 + MIInputPerThreadMXSB: 1 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [12, 6] + MIWaveTileA: 12 + MIWaveTileB: 6 + MIWaveTileMXSA: 12 + MIWaveTileMXSB: 6 + MIWaveTileMetadata: 0 + MacroTile0: 384 + MacroTile1: 192 + MacroTileA: 384 + MacroTileB: 192 + MacroTileMXSA: 384 + MacroTileMXSB: 192 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + MinGRIncPerMfma: 3 + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonDTLTailLoopMXSA: true + NonDTLTailLoopMXSB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 4 + NonTemporalE: 0 + NonTemporalMXSA: 0 + NonTemporalMXSB: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 288 + NumGlobalWriteVectorsPerThread: 288 + NumLdsBlk: 2 + NumLoadsA: 12 + NumLoadsB: 6 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsCoalescedMXSA: 1 + NumLoadsCoalescedMXSB: 1 + NumLoadsMXSA: 1 + NumLoadsMXSB: 1 + NumLoadsPerpendicularA: 12 + NumLoadsPerpendicularB: 6 + NumLoadsPerpendicularMXSA: 1 + NumLoadsPerpendicularMXSB: 1 + NumThreads: 256 + NumTotalPackedLoadsA: -1 + NumTotalPackedLoadsB: -1 + NumTotalPackedLoadsMXSA: -1 + NumTotalPackedLoadsMXSB: -1 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGROverBarrier: 0 + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 33 + 
SolutionNameMin: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT384x192x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT12_6_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM16_WGMXCC2_WGMXCCGn1 + SourceSwap: false + SpaceFillingAlgo: [] + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: true + StoreSyncOpt: 0 + StoreVectorWidth: 4 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SubGroupMXSA: 8 + SubGroupMXSB: 32 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: 0 + TailloopInNll: false + ThreadTile: [1, 1] + ThreadTile0: 48 + ThreadTile1: 6 + ThreadTileA: 48 + ThreadTileB: 6 + ThreadTileMXSA: 48 + ThreadTileMXSB: 6 + TransposeLDS: 1 + TransposeLDSMetadata: -1 + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMXSA: true + UnrollMajorLDSMXSB: true + Use64bShadowLimit: false + Use64bShadowLimitMX: false + UseCustomMainLoopSchedule: 0 + UseDirect32XEmulation: false + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: false + UseGeneralizedNLCOneA: false + UseGeneralizedNLCOneB: false + UseGeneralizedNLCOneMXSA: false + UseGeneralizedNLCOneMXSB: false + UseGeneralizedNLCOneMetadata: false + UseInstOffsetForGRO: 0 + UseMFMAF32XEmulation: false + UsePLRPack: 0 + UseSgprForGRO: -1 + UseSubtileImpl: true + Valid: true + VectorStore: -1 + VectorWidthA: 1 + 
VectorWidthB: 1 + VectorWidthMXSA: 1 + VectorWidthMXSB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMXSA: 0 + WaveSeparateGlobalReadMXSB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 16 + WorkGroupMappingXCC: 2 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 256 + _DepthUA: 256 + _DepthUB: 256 + _DepthUMXSA: 8 + _DepthUMXSB: 8 + _DepthUMetadata: 256 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + enableLDSTrMXSA: false + enableLDSTrMXSB: false + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + tailLoopOptMXSA: false + tailLoopOptMXSB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AdaptiveGemm: 0 + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1DivByMT1LowbitGT1: 0 + AssertFree1ElementMultiple: 1 + AssertKRingShiftTailWrapOnly: 0 + AssertSummationElementMultiple: 256 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BAddrInterleave: false + BaseName: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT12pSBqfd5dzj78eX1t5EQJzYZ5aHFVtQ9uJHS361sw3qY= + BufferLoad: true + BufferStore: 1 + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 256 + DirectToLds: 1 + DirectToLdsA: true + DirectToLdsB: true + DirectToLdsMXSA: true + DirectToLdsMXSB: true + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprMXSA: false + DirectToVgprMXSB: false + DirectToVgprSparseMetadata: false + 
DtlPlusLdsBuf: 0 + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: false + ExpertSchedulingMode: 0 + ExtraLatencyForLR: 0 + ExtraMiLatencyLeft: -1 + ForceDisableShadowInit: false + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 32 + GlobalReadVectorWidthB: 32 + GlobalReadVectorWidthMXSA: 4 + GlobalReadVectorWidthMXSB: 1 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + KRingShift: false + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT128x32x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT2_2_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1 + LDSTrInst: false + LSCA: 256 + LSCB: 256 + LSCMXSA: 8 + LSCMXSB: 8 + LSPA: 32 + LSPB: 32 + LSPMXSA: 128 + LSPMXSB: 32 + LVCA: 8 + LVCB: 8 + LVCMXSA: 2 + LVCMXSB: 8 + LVPA: 1 + LVPB: 1 + LVPMXSA: 32 + LVPMXSB: 32 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsBlockSizePerPadMXSA: 0 + LdsBlockSizePerPadMXSB: 0 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 54528 + LdsInitCVgprs: false + LdsNumBytes: 54528 + 
LdsNumElementsAlignedA: 16384 + LdsNumElementsAlignedB: 4096 + LdsNumElementsAlignedMXSA: 1024 + LdsNumElementsAlignedMXSB: 256 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 32768 + LdsOffsetB: 17664 + LdsOffsetB_Blk: 50432 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMXSA: 16384 + LdsOffsetMXSA_Blk: 49152 + LdsOffsetMXSB: 17408 + LdsOffsetMXSB_Blk: 50176 + LdsOffsetMetadata: 17664 + LdsOffsetMetadata_Blk: 50432 + LdsPadA: 0 + LdsPadB: 0 + LdsPadMXSA: 0 + LdsPadMXSB: 0 + LdsPadMetadata: 0 + LocalReadVectorWidth: -1 + LocalReadVectorWidthA: 32 + LocalReadVectorWidthB: 32 + LocalReadVectorWidthMXSA: 1 + LocalReadVectorWidthMXSB: 1 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LocalWriteUseSgprMXSA: true + LocalWriteUseSgprMXSB: true + LoopIters: 2 + LoopUnroll: 256 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMXSA: 1 + MIInputPerThreadMXSB: 1 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [4, 1] + MIWaveTile: [2, 2] + MIWaveTileA: 2 + MIWaveTileB: 2 + MIWaveTileMXSA: 2 + MIWaveTileMXSB: 2 + MIWaveTileMetadata: 0 + MacroTile0: 128 + MacroTile1: 32 + MacroTileA: 128 + MacroTileB: 32 + MacroTileMXSA: 128 + MacroTileMXSB: 32 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + MinGRIncPerMfma: 3 + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonDTLTailLoopMXSA: true + NonDTLTailLoopMXSB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 4 + NonTemporalE: 0 + 
NonTemporalMXSA: 0 + NonTemporalMXSB: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 16 + NumLdsBlk: 2 + NumLoadsA: 4 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsCoalescedMXSA: 1 + NumLoadsCoalescedMXSB: 1 + NumLoadsMXSA: 1 + NumLoadsMXSB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularMXSA: 1 + NumLoadsPerpendicularMXSB: 1 + NumThreads: 256 + NumTotalPackedLoadsA: -1 + NumTotalPackedLoadsB: -1 + NumTotalPackedLoadsMXSA: -1 + NumTotalPackedLoadsMXSB: -1 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGROverBarrier: 0 + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 34 + SolutionNameMin: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT128x32x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT2_2_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM16_WGMXCC2_WGMXCCGn1 + SourceSwap: false + SpaceFillingAlgo: [] + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 4 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SubGroupMXSA: 
16 + SubGroupMXSB: 16 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: 0 + TailloopInNll: false + ThreadTile: [1, 1] + ThreadTile0: 8 + ThreadTile1: 2 + ThreadTileA: 8 + ThreadTileB: 2 + ThreadTileMXSA: 8 + ThreadTileMXSB: 2 + TransposeLDS: 1 + TransposeLDSMetadata: -1 + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMXSA: true + UnrollMajorLDSMXSB: true + Use64bShadowLimit: false + Use64bShadowLimitMX: false + UseCustomMainLoopSchedule: 0 + UseDirect32XEmulation: false + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: false + UseGeneralizedNLCOneA: false + UseGeneralizedNLCOneB: false + UseGeneralizedNLCOneMXSA: false + UseGeneralizedNLCOneMXSB: false + UseGeneralizedNLCOneMetadata: false + UseInstOffsetForGRO: 0 + UseMFMAF32XEmulation: false + UsePLRPack: 0 + UseSgprForGRO: -1 + UseSubtileImpl: true + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 1 + VectorWidthMXSA: 1 + VectorWidthMXSB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMXSA: 0 + WaveSeparateGlobalReadMXSB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [64, 4, 1] + WorkGroupMapping: 16 + WorkGroupMappingXCC: 2 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 256 + _DepthUA: 256 + _DepthUB: 256 + _DepthUMXSA: 8 + _DepthUMXSB: 8 + _DepthUMetadata: 256 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + enableLDSTrMXSA: false + enableLDSTrMXSB: false + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + tailLoopOptMXSA: false + tailLoopOptMXSB: false + - 1LDSBuffer: 0 + ActivationAlt: false + 
ActivationFuncCall: false + ActivationFused: true + AdaptiveGemm: 0 + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1DivByMT1LowbitGT1: 0 + AssertFree1ElementMultiple: 1 + AssertKRingShiftTailWrapOnly: 0 + AssertSummationElementMultiple: 256 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BAddrInterleave: false + BaseName: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT12lYJO4P9-MC57Obi2kvcZAOwe_1ysdtWgX7-VIBNgQpw= + BufferLoad: true + BufferStore: 1 + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 256 + DirectToLds: 1 + DirectToLdsA: true + DirectToLdsB: true + DirectToLdsMXSA: true + DirectToLdsMXSB: true + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprMXSA: false + DirectToVgprMXSB: false + DirectToVgprSparseMetadata: false + DtlPlusLdsBuf: 0 + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: false + ExpertSchedulingMode: 0 + ExtraLatencyForLR: 0 + ExtraMiLatencyLeft: -1 + ForceDisableShadowInit: false + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 32 + GlobalReadVectorWidthB: 32 + GlobalReadVectorWidthMXSA: 4 + GlobalReadVectorWidthMXSB: 3 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + KRingShift: false + Kernel: true + KernelLanguage: Assembly + KernelNameMin: 
Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT128x96x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT2_6_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1 + LDSTrInst: false + LSCA: 256 + LSCB: 256 + LSCMXSA: 8 + LSCMXSB: 8 + LSPA: 32 + LSPB: 32 + LSPMXSA: 128 + LSPMXSB: 96 + LVCA: 8 + LVCB: 8 + LVCMXSA: 2 + LVCMXSB: 3 + LVPA: 1 + LVPB: 1 + LVPMXSA: 32 + LVPMXSB: 32 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsBlockSizePerPadMXSA: 0 + LdsBlockSizePerPadMXSB: 0 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 63232 + LdsInitCVgprs: false + LdsNumBytes: 63232 + LdsNumElementsAlignedA: 16384 + LdsNumElementsAlignedB: 12288 + LdsNumElementsAlignedMXSA: 1024 + LdsNumElementsAlignedMXSB: 768 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 32768 + LdsOffsetB: 18176 + LdsOffsetB_Blk: 50944 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMXSA: 16384 + LdsOffsetMXSA_Blk: 49152 + LdsOffsetMXSB: 17408 + LdsOffsetMXSB_Blk: 50176 + LdsOffsetMetadata: 18176 + LdsOffsetMetadata_Blk: 50944 + LdsPadA: 0 + LdsPadB: 0 + LdsPadMXSA: 0 + LdsPadMXSB: 0 + LdsPadMetadata: 0 + LocalReadVectorWidth: -1 + LocalReadVectorWidthA: 32 + LocalReadVectorWidthB: 32 + LocalReadVectorWidthMXSA: 1 + LocalReadVectorWidthMXSB: 1 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LocalWriteUseSgprMXSA: true + LocalWriteUseSgprMXSB: true + LoopIters: 2 + LoopUnroll: 256 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + 
MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMXSA: 1 + MIInputPerThreadMXSB: 1 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [4, 1] + MIWaveTile: [2, 6] + MIWaveTileA: 2 + MIWaveTileB: 6 + MIWaveTileMXSA: 2 + MIWaveTileMXSB: 6 + MIWaveTileMetadata: 0 + MacroTile0: 128 + MacroTile1: 96 + MacroTileA: 128 + MacroTileB: 96 + MacroTileMXSA: 128 + MacroTileMXSB: 96 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + MinGRIncPerMfma: 3 + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonDTLTailLoopMXSA: true + NonDTLTailLoopMXSB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 4 + NonTemporalE: 0 + NonTemporalMXSA: 0 + NonTemporalMXSB: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 48 + NumGlobalWriteVectorsPerThread: 48 + NumLdsBlk: 2 + NumLoadsA: 4 + NumLoadsB: 3 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsCoalescedMXSA: 1 + NumLoadsCoalescedMXSB: 1 + NumLoadsMXSA: 1 + NumLoadsMXSB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 3 + NumLoadsPerpendicularMXSA: 1 + NumLoadsPerpendicularMXSB: 1 + NumThreads: 256 + NumTotalPackedLoadsA: -1 + NumTotalPackedLoadsB: -1 + NumTotalPackedLoadsMXSA: -1 + NumTotalPackedLoadsMXSB: -1 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGROverBarrier: 0 + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 35 + SolutionNameMin: 
Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT128x96x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT2_6_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM16_WGMXCC2_WGMXCCGn1 + SourceSwap: false + SpaceFillingAlgo: [] + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 4 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SubGroupMXSA: 16 + SubGroupMXSB: 16 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: 0 + TailloopInNll: false + ThreadTile: [1, 1] + ThreadTile0: 8 + ThreadTile1: 6 + ThreadTileA: 8 + ThreadTileB: 6 + ThreadTileMXSA: 8 + ThreadTileMXSB: 6 + TransposeLDS: 1 + TransposeLDSMetadata: -1 + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMXSA: true + UnrollMajorLDSMXSB: true + Use64bShadowLimit: false + Use64bShadowLimitMX: false + UseCustomMainLoopSchedule: 0 + UseDirect32XEmulation: false + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: false + UseGeneralizedNLCOneA: false + UseGeneralizedNLCOneB: false + UseGeneralizedNLCOneMXSA: false + UseGeneralizedNLCOneMXSB: false + UseGeneralizedNLCOneMetadata: false + UseInstOffsetForGRO: 0 + UseMFMAF32XEmulation: false + UsePLRPack: 0 + UseSgprForGRO: -1 + UseSubtileImpl: true + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 1 + 
VectorWidthMXSA: 1 + VectorWidthMXSB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMXSA: 0 + WaveSeparateGlobalReadMXSB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [64, 4, 1] + WorkGroupMapping: 16 + WorkGroupMappingXCC: 2 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 256 + _DepthUA: 256 + _DepthUB: 256 + _DepthUMXSA: 8 + _DepthUMXSB: 8 + _DepthUMetadata: 256 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + enableLDSTrMXSA: false + enableLDSTrMXSB: false + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + tailLoopOptMXSA: false + tailLoopOptMXSB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AdaptiveGemm: 0 + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1DivByMT1LowbitGT1: 0 + AssertFree1ElementMultiple: 1 + AssertKRingShiftTailWrapOnly: 0 + AssertSummationElementMultiple: 256 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BAddrInterleave: false + BaseName: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT25ccxCqQ2dvGclrJYn0QqExpGJSq56GiHUJPL0jzB0qVw= + BufferLoad: true + BufferStore: 1 + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 256 + DirectToLds: 1 + DirectToLdsA: true + DirectToLdsB: true + DirectToLdsMXSA: true + DirectToLdsMXSB: true + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprMXSA: false + DirectToVgprMXSB: false + DirectToVgprSparseMetadata: false + DtlPlusLdsBuf: 0 
+ EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: false + ExpertSchedulingMode: 0 + ExtraLatencyForLR: 0 + ExtraMiLatencyLeft: -1 + ForceDisableShadowInit: false + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 32 + GlobalReadVectorWidthB: 32 + GlobalReadVectorWidthMXSA: 4 + GlobalReadVectorWidthMXSB: 1 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + KRingShift: false + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT256x32x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT4_2_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1 + LDSTrInst: false + LSCA: 256 + LSCB: 256 + LSCMXSA: 8 + LSCMXSB: 8 + LSPA: 32 + LSPB: 32 + LSPMXSA: 128 + LSPMXSB: 32 + LVCA: 8 + LVCB: 8 + LVCMXSA: 2 + LVCMXSB: 8 + LVPA: 1 + LVPB: 1 + LVPMXSA: 32 + LVPMXSB: 32 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsBlockSizePerPadMXSA: 0 + LdsBlockSizePerPadMXSB: 0 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 104704 + LdsInitCVgprs: false + LdsNumBytes: 104704 + LdsNumElementsAlignedA: 32768 + 
LdsNumElementsAlignedB: 4096 + LdsNumElementsAlignedMXSA: 2048 + LdsNumElementsAlignedMXSB: 256 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 35072 + LdsOffsetB_Blk: 100608 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMXSA: 32768 + LdsOffsetMXSA_Blk: 98304 + LdsOffsetMXSB: 34816 + LdsOffsetMXSB_Blk: 100352 + LdsOffsetMetadata: 35072 + LdsOffsetMetadata_Blk: 100608 + LdsPadA: 0 + LdsPadB: 0 + LdsPadMXSA: 0 + LdsPadMXSB: 0 + LdsPadMetadata: 0 + LocalReadVectorWidth: -1 + LocalReadVectorWidthA: 32 + LocalReadVectorWidthB: 32 + LocalReadVectorWidthMXSA: 1 + LocalReadVectorWidthMXSB: 1 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LocalWriteUseSgprMXSA: true + LocalWriteUseSgprMXSB: true + LoopIters: 2 + LoopUnroll: 256 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMXSA: 1 + MIInputPerThreadMXSB: 1 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [4, 1] + MIWaveTile: [4, 2] + MIWaveTileA: 4 + MIWaveTileB: 2 + MIWaveTileMXSA: 4 + MIWaveTileMXSB: 2 + MIWaveTileMetadata: 0 + MacroTile0: 256 + MacroTile1: 32 + MacroTileA: 256 + MacroTileB: 32 + MacroTileMXSA: 256 + MacroTileMXSB: 32 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + MinGRIncPerMfma: 3 + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonDTLTailLoopMXSA: true + NonDTLTailLoopMXSB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 4 + NonTemporalE: 0 + NonTemporalMXSA: 0 + 
NonTemporalMXSB: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 32 + NumLdsBlk: 2 + NumLoadsA: 8 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsCoalescedMXSA: 1 + NumLoadsCoalescedMXSB: 1 + NumLoadsMXSA: 2 + NumLoadsMXSB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularMXSA: 2 + NumLoadsPerpendicularMXSB: 1 + NumThreads: 256 + NumTotalPackedLoadsA: -1 + NumTotalPackedLoadsB: -1 + NumTotalPackedLoadsMXSA: -1 + NumTotalPackedLoadsMXSB: -1 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGROverBarrier: 0 + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 36 + SolutionNameMin: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT256x32x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT4_2_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM16_WGMXCC2_WGMXCCGn1 + SourceSwap: false + SpaceFillingAlgo: [] + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 4 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SubGroupMXSA: 16 + SubGroupMXSB: 16 
+ SuppressNoLoadLoop: false + SwapGlobalReadOrder: 0 + TailloopInNll: false + ThreadTile: [1, 1] + ThreadTile0: 16 + ThreadTile1: 2 + ThreadTileA: 16 + ThreadTileB: 2 + ThreadTileMXSA: 16 + ThreadTileMXSB: 2 + TransposeLDS: 1 + TransposeLDSMetadata: -1 + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMXSA: true + UnrollMajorLDSMXSB: true + Use64bShadowLimit: false + Use64bShadowLimitMX: false + UseCustomMainLoopSchedule: 0 + UseDirect32XEmulation: false + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: false + UseGeneralizedNLCOneA: false + UseGeneralizedNLCOneB: false + UseGeneralizedNLCOneMXSA: false + UseGeneralizedNLCOneMXSB: false + UseGeneralizedNLCOneMetadata: false + UseInstOffsetForGRO: 0 + UseMFMAF32XEmulation: false + UsePLRPack: 0 + UseSgprForGRO: -1 + UseSubtileImpl: true + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 1 + VectorWidthMXSA: 1 + VectorWidthMXSB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMXSA: 0 + WaveSeparateGlobalReadMXSB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [64, 4, 1] + WorkGroupMapping: 16 + WorkGroupMappingXCC: 2 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 256 + _DepthUA: 256 + _DepthUB: 256 + _DepthUMXSA: 8 + _DepthUMXSB: 8 + _DepthUMetadata: 256 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + enableLDSTrMXSA: false + enableLDSTrMXSB: false + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + tailLoopOptMXSA: false + tailLoopOptMXSB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: 
false + ActivationFused: true + AdaptiveGemm: 0 + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1DivByMT1LowbitGT1: 0 + AssertFree1ElementMultiple: 1 + AssertKRingShiftTailWrapOnly: 0 + AssertSummationElementMultiple: 256 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BAddrInterleave: false + BaseName: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT25LD47Rd1FCRkyTzzAeZkhMOPqncN77I3VtqjftUOo3W0= + BufferLoad: true + BufferStore: 1 + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 256 + DirectToLds: 1 + DirectToLdsA: true + DirectToLdsB: true + DirectToLdsMXSA: true + DirectToLdsMXSB: true + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprMXSA: false + DirectToVgprMXSB: false + DirectToVgprSparseMetadata: false + DtlPlusLdsBuf: 0 + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: false + ExpertSchedulingMode: 0 + ExtraLatencyForLR: 0 + ExtraMiLatencyLeft: -1 + ForceDisableShadowInit: false + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 32 + GlobalReadVectorWidthB: 32 + GlobalReadVectorWidthMXSA: 4 + GlobalReadVectorWidthMXSB: 3 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + KRingShift: false + Kernel: true + KernelLanguage: Assembly + KernelNameMin: 
Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT256x96x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT4_6_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1 + LDSTrInst: false + LSCA: 256 + LSCB: 256 + LSCMXSA: 8 + LSCMXSB: 8 + LSPA: 32 + LSPB: 32 + LSPMXSA: 128 + LSPMXSB: 96 + LVCA: 8 + LVCB: 8 + LVCMXSA: 2 + LVCMXSB: 3 + LVPA: 1 + LVPB: 1 + LVPMXSA: 32 + LVPMXSB: 32 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsBlockSizePerPadMXSA: 0 + LdsBlockSizePerPadMXSB: 0 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 113408 + LdsInitCVgprs: false + LdsNumBytes: 113408 + LdsNumElementsAlignedA: 32768 + LdsNumElementsAlignedB: 12288 + LdsNumElementsAlignedMXSA: 2048 + LdsNumElementsAlignedMXSB: 768 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 35584 + LdsOffsetB_Blk: 101120 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMXSA: 32768 + LdsOffsetMXSA_Blk: 98304 + LdsOffsetMXSB: 34816 + LdsOffsetMXSB_Blk: 100352 + LdsOffsetMetadata: 35584 + LdsOffsetMetadata_Blk: 101120 + LdsPadA: 0 + LdsPadB: 0 + LdsPadMXSA: 0 + LdsPadMXSB: 0 + LdsPadMetadata: 0 + LocalReadVectorWidth: -1 + LocalReadVectorWidthA: 32 + LocalReadVectorWidthB: 32 + LocalReadVectorWidthMXSA: 1 + LocalReadVectorWidthMXSB: 1 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LocalWriteUseSgprMXSA: true + LocalWriteUseSgprMXSB: true + LoopIters: 2 + LoopUnroll: 256 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + 
MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMXSA: 1 + MIInputPerThreadMXSB: 1 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [4, 1] + MIWaveTile: [4, 6] + MIWaveTileA: 4 + MIWaveTileB: 6 + MIWaveTileMXSA: 4 + MIWaveTileMXSB: 6 + MIWaveTileMetadata: 0 + MacroTile0: 256 + MacroTile1: 96 + MacroTileA: 256 + MacroTileB: 96 + MacroTileMXSA: 256 + MacroTileMXSB: 96 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + MinGRIncPerMfma: 3 + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonDTLTailLoopMXSA: true + NonDTLTailLoopMXSB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 4 + NonTemporalE: 0 + NonTemporalMXSA: 0 + NonTemporalMXSB: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 96 + NumGlobalWriteVectorsPerThread: 96 + NumLdsBlk: 2 + NumLoadsA: 8 + NumLoadsB: 3 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsCoalescedMXSA: 1 + NumLoadsCoalescedMXSB: 1 + NumLoadsMXSA: 2 + NumLoadsMXSB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 3 + NumLoadsPerpendicularMXSA: 2 + NumLoadsPerpendicularMXSB: 1 + NumThreads: 256 + NumTotalPackedLoadsA: -1 + NumTotalPackedLoadsB: -1 + NumTotalPackedLoadsMXSA: -1 + NumTotalPackedLoadsMXSB: -1 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGROverBarrier: 0 + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 37 + SolutionNameMin: 
Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT256x96x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT4_6_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM16_WGMXCC2_WGMXCCGn1 + SourceSwap: false + SpaceFillingAlgo: [] + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 4 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SubGroupMXSA: 16 + SubGroupMXSB: 16 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: 0 + TailloopInNll: false + ThreadTile: [1, 1] + ThreadTile0: 16 + ThreadTile1: 6 + ThreadTileA: 16 + ThreadTileB: 6 + ThreadTileMXSA: 16 + ThreadTileMXSB: 6 + TransposeLDS: 1 + TransposeLDSMetadata: -1 + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMXSA: true + UnrollMajorLDSMXSB: true + Use64bShadowLimit: false + Use64bShadowLimitMX: false + UseCustomMainLoopSchedule: 0 + UseDirect32XEmulation: false + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: false + UseGeneralizedNLCOneA: false + UseGeneralizedNLCOneB: false + UseGeneralizedNLCOneMXSA: false + UseGeneralizedNLCOneMXSB: false + UseGeneralizedNLCOneMetadata: false + UseInstOffsetForGRO: 0 + UseMFMAF32XEmulation: false + UsePLRPack: 0 + UseSgprForGRO: -1 + UseSubtileImpl: true + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 1 
+ VectorWidthMXSA: 1 + VectorWidthMXSB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMXSA: 0 + WaveSeparateGlobalReadMXSB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [64, 4, 1] + WorkGroupMapping: 16 + WorkGroupMappingXCC: 2 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 256 + _DepthUA: 256 + _DepthUB: 256 + _DepthUMXSA: 8 + _DepthUMXSB: 8 + _DepthUMetadata: 256 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + enableLDSTrMXSA: false + enableLDSTrMXSB: false + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + tailLoopOptMXSA: false + tailLoopOptMXSB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AdaptiveGemm: 0 + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1DivByMT1LowbitGT1: 0 + AssertFree1ElementMultiple: 1 + AssertKRingShiftTailWrapOnly: 0 + AssertSummationElementMultiple: 256 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BAddrInterleave: false + BaseName: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT252kruXsNK48kysY3byfZ8nkfbVQgYyySEfltsnGOOr7I= + BufferLoad: true + BufferStore: 1 + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 256 + DirectToLds: 1 + DirectToLdsA: true + DirectToLdsB: true + DirectToLdsMXSA: true + DirectToLdsMXSB: true + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprMXSA: false + DirectToVgprMXSB: false + DirectToVgprSparseMetadata: false + DtlPlusLdsBuf: 
0 + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: false + ExpertSchedulingMode: 0 + ExtraLatencyForLR: 0 + ExtraMiLatencyLeft: -1 + ForceDisableShadowInit: false + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 32 + GlobalReadVectorWidthB: 32 + GlobalReadVectorWidthMXSA: 4 + GlobalReadVectorWidthMXSB: 5 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + KRingShift: false + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT256x160x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT4_10_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1 + LDSTrInst: false + LSCA: 256 + LSCB: 256 + LSCMXSA: 8 + LSCMXSB: 8 + LSPA: 32 + LSPB: 32 + LSPMXSA: 128 + LSPMXSB: 160 + LVCA: 8 + LVCB: 8 + LVCMXSA: 2 + LVCMXSB: 2 + LVPA: 1 + LVPB: 1 + LVPMXSA: 32 + LVPMXSB: 32 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsBlockSizePerPadMXSA: 0 + LdsBlockSizePerPadMXSB: 0 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 122112 + LdsInitCVgprs: false + LdsNumBytes: 122112 + LdsNumElementsAlignedA: 
32768 + LdsNumElementsAlignedB: 20480 + LdsNumElementsAlignedMXSA: 2048 + LdsNumElementsAlignedMXSB: 1280 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 36096 + LdsOffsetB_Blk: 101632 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMXSA: 32768 + LdsOffsetMXSA_Blk: 98304 + LdsOffsetMXSB: 34816 + LdsOffsetMXSB_Blk: 100352 + LdsOffsetMetadata: 36096 + LdsOffsetMetadata_Blk: 101632 + LdsPadA: 0 + LdsPadB: 0 + LdsPadMXSA: 0 + LdsPadMXSB: 0 + LdsPadMetadata: 0 + LocalReadVectorWidth: -1 + LocalReadVectorWidthA: 32 + LocalReadVectorWidthB: 32 + LocalReadVectorWidthMXSA: 1 + LocalReadVectorWidthMXSB: 1 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LocalWriteUseSgprMXSA: true + LocalWriteUseSgprMXSB: true + LoopIters: 2 + LoopUnroll: 256 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMXSA: 1 + MIInputPerThreadMXSB: 1 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [4, 1] + MIWaveTile: [4, 10] + MIWaveTileA: 4 + MIWaveTileB: 10 + MIWaveTileMXSA: 4 + MIWaveTileMXSB: 10 + MIWaveTileMetadata: 0 + MacroTile0: 256 + MacroTile1: 160 + MacroTileA: 256 + MacroTileB: 160 + MacroTileMXSA: 256 + MacroTileMXSB: 160 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + MinGRIncPerMfma: 3 + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonDTLTailLoopMXSA: true + NonDTLTailLoopMXSB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 4 + NonTemporalE: 0 + 
NonTemporalMXSA: 0 + NonTemporalMXSB: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 160 + NumGlobalWriteVectorsPerThread: 160 + NumLdsBlk: 2 + NumLoadsA: 8 + NumLoadsB: 5 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsCoalescedMXSA: 1 + NumLoadsCoalescedMXSB: 1 + NumLoadsMXSA: 2 + NumLoadsMXSB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 5 + NumLoadsPerpendicularMXSA: 2 + NumLoadsPerpendicularMXSB: 1 + NumThreads: 256 + NumTotalPackedLoadsA: -1 + NumTotalPackedLoadsB: -1 + NumTotalPackedLoadsMXSA: -1 + NumTotalPackedLoadsMXSB: -1 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGROverBarrier: 0 + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 38 + SolutionNameMin: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT256x160x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT4_10_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM16_WGMXCC2_WGMXCCGn1 + SourceSwap: false + SpaceFillingAlgo: [] + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 4 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + 
SubGroupMXSA: 16 + SubGroupMXSB: 16 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: 0 + TailloopInNll: false + ThreadTile: [1, 1] + ThreadTile0: 16 + ThreadTile1: 10 + ThreadTileA: 16 + ThreadTileB: 10 + ThreadTileMXSA: 16 + ThreadTileMXSB: 10 + TransposeLDS: 1 + TransposeLDSMetadata: -1 + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMXSA: true + UnrollMajorLDSMXSB: true + Use64bShadowLimit: false + Use64bShadowLimitMX: false + UseCustomMainLoopSchedule: 0 + UseDirect32XEmulation: false + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: false + UseGeneralizedNLCOneA: false + UseGeneralizedNLCOneB: false + UseGeneralizedNLCOneMXSA: false + UseGeneralizedNLCOneMXSB: false + UseGeneralizedNLCOneMetadata: false + UseInstOffsetForGRO: 0 + UseMFMAF32XEmulation: false + UsePLRPack: 0 + UseSgprForGRO: -1 + UseSubtileImpl: true + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 1 + VectorWidthMXSA: 1 + VectorWidthMXSB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMXSA: 0 + WaveSeparateGlobalReadMXSB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [64, 4, 1] + WorkGroupMapping: 16 + WorkGroupMappingXCC: 2 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 256 + _DepthUA: 256 + _DepthUB: 256 + _DepthUMXSA: 8 + _DepthUMXSB: 8 + _DepthUMetadata: 256 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + enableLDSTrMXSA: false + enableLDSTrMXSB: false + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + tailLoopOptMXSA: false + tailLoopOptMXSB: false + - 1LDSBuffer: 0 + 
ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AdaptiveGemm: 0 + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1DivByMT1LowbitGT1: 0 + AssertFree1ElementMultiple: 1 + AssertKRingShiftTailWrapOnly: 0 + AssertSummationElementMultiple: 256 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BAddrInterleave: false + BaseName: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT25K1mXD-Z64cKGl1n64cEhRIvEdarIIYY6qzM6iXFNutU= + BufferLoad: true + BufferStore: 1 + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 256 + DirectToLds: 1 + DirectToLdsA: true + DirectToLdsB: true + DirectToLdsMXSA: true + DirectToLdsMXSB: true + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprMXSA: false + DirectToVgprMXSB: false + DirectToVgprSparseMetadata: false + DtlPlusLdsBuf: 0 + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: false + ExpertSchedulingMode: 0 + ExtraLatencyForLR: 0 + ExtraMiLatencyLeft: -1 + ForceDisableShadowInit: false + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 32 + GlobalReadVectorWidthB: 32 + GlobalReadVectorWidthMXSA: 4 + GlobalReadVectorWidthMXSB: 9 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + KRingShift: false + Kernel: true + KernelLanguage: Assembly + KernelNameMin: 
Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT256x288x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT4_18_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1 + LDSTrInst: false + LSCA: 256 + LSCB: 256 + LSCMXSA: 8 + LSCMXSB: 8 + LSPA: 32 + LSPB: 32 + LSPMXSA: 128 + LSPMXSB: 288 + LVCA: 8 + LVCB: 8 + LVCMXSA: 2 + LVCMXSB: 1 + LVPA: 1 + LVPB: 1 + LVPMXSA: 32 + LVPMXSB: 32 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsBlockSizePerPadMXSA: 0 + LdsBlockSizePerPadMXSB: 0 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 147968 + LdsInitCVgprs: false + LdsNumBytes: 147968 + LdsNumElementsAlignedA: 32768 + LdsNumElementsAlignedB: 36864 + LdsNumElementsAlignedMXSA: 2048 + LdsNumElementsAlignedMXSB: 2304 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 73984 + LdsOffsetB: 37120 + LdsOffsetB_Blk: 111104 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMXSA: 32768 + LdsOffsetMXSA_Blk: 106752 + LdsOffsetMXSB: 34816 + LdsOffsetMXSB_Blk: 108800 + LdsOffsetMetadata: 37120 + LdsOffsetMetadata_Blk: 111104 + LdsPadA: 0 + LdsPadB: 0 + LdsPadMXSA: 0 + LdsPadMXSB: 0 + LdsPadMetadata: 0 + LocalReadVectorWidth: -1 + LocalReadVectorWidthA: 32 + LocalReadVectorWidthB: 32 + LocalReadVectorWidthMXSA: 1 + LocalReadVectorWidthMXSB: 1 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LocalWriteUseSgprMXSA: true + LocalWriteUseSgprMXSB: true + LoopIters: 2 + LoopUnroll: 256 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + 
MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMXSA: 1 + MIInputPerThreadMXSB: 1 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [4, 1] + MIWaveTile: [4, 18] + MIWaveTileA: 4 + MIWaveTileB: 18 + MIWaveTileMXSA: 4 + MIWaveTileMXSB: 18 + MIWaveTileMetadata: 0 + MacroTile0: 256 + MacroTile1: 288 + MacroTileA: 256 + MacroTileB: 288 + MacroTileMXSA: 256 + MacroTileMXSB: 288 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + MinGRIncPerMfma: 3 + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonDTLTailLoopMXSA: true + NonDTLTailLoopMXSB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 4 + NonTemporalE: 0 + NonTemporalMXSA: 0 + NonTemporalMXSB: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 288 + NumGlobalWriteVectorsPerThread: 288 + NumLdsBlk: 2 + NumLoadsA: 8 + NumLoadsB: 9 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsCoalescedMXSA: 1 + NumLoadsCoalescedMXSB: 1 + NumLoadsMXSA: 2 + NumLoadsMXSB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 9 + NumLoadsPerpendicularMXSA: 2 + NumLoadsPerpendicularMXSB: 1 + NumThreads: 256 + NumTotalPackedLoadsA: -1 + NumTotalPackedLoadsB: -1 + NumTotalPackedLoadsMXSA: -1 + NumTotalPackedLoadsMXSB: -1 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGROverBarrier: 0 + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 39 + 
SolutionNameMin: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT256x288x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT4_18_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM16_WGMXCC2_WGMXCCGn1 + SourceSwap: false + SpaceFillingAlgo: [] + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: true + StoreSyncOpt: 0 + StoreVectorWidth: 4 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SubGroupMXSA: 16 + SubGroupMXSB: 16 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: 0 + TailloopInNll: false + ThreadTile: [1, 1] + ThreadTile0: 16 + ThreadTile1: 18 + ThreadTileA: 16 + ThreadTileB: 18 + ThreadTileMXSA: 16 + ThreadTileMXSB: 18 + TransposeLDS: 1 + TransposeLDSMetadata: -1 + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMXSA: true + UnrollMajorLDSMXSB: true + Use64bShadowLimit: false + Use64bShadowLimitMX: false + UseCustomMainLoopSchedule: 0 + UseDirect32XEmulation: false + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: false + UseGeneralizedNLCOneA: false + UseGeneralizedNLCOneB: false + UseGeneralizedNLCOneMXSA: false + UseGeneralizedNLCOneMXSB: false + UseGeneralizedNLCOneMetadata: false + UseInstOffsetForGRO: 0 + UseMFMAF32XEmulation: false + UsePLRPack: 0 + UseSgprForGRO: -1 + UseSubtileImpl: true + Valid: true + VectorStore: -1 + 
VectorWidthA: 1 + VectorWidthB: 1 + VectorWidthMXSA: 1 + VectorWidthMXSB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMXSA: 0 + WaveSeparateGlobalReadMXSB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [64, 4, 1] + WorkGroupMapping: 16 + WorkGroupMappingXCC: 2 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 256 + _DepthUA: 256 + _DepthUB: 256 + _DepthUMXSA: 8 + _DepthUMXSB: 8 + _DepthUMetadata: 256 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + enableLDSTrMXSA: false + enableLDSTrMXSB: false + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + tailLoopOptMXSA: false + tailLoopOptMXSB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AdaptiveGemm: 0 + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1DivByMT1LowbitGT1: 0 + AssertFree1ElementMultiple: 1 + AssertKRingShiftTailWrapOnly: 0 + AssertSummationElementMultiple: 256 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BAddrInterleave: false + BaseName: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT382wXw8HnHr0540PBKyCMkiG1GnHC8Qq6QTA2UY90u0Us= + BufferLoad: true + BufferStore: 1 + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 256 + DirectToLds: 1 + DirectToLdsA: true + DirectToLdsB: true + DirectToLdsMXSA: true + DirectToLdsMXSB: true + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprMXSA: false + DirectToVgprMXSB: false + 
DirectToVgprSparseMetadata: false + DtlPlusLdsBuf: 0 + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: false + ExpertSchedulingMode: 0 + ExtraLatencyForLR: 0 + ExtraMiLatencyLeft: -1 + ForceDisableShadowInit: false + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 32 + GlobalReadVectorWidthB: 32 + GlobalReadVectorWidthMXSA: 12 + GlobalReadVectorWidthMXSB: 1 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + KRingShift: false + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT384x32x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT6_2_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1 + LDSTrInst: false + LSCA: 256 + LSCB: 256 + LSCMXSA: 8 + LSCMXSB: 8 + LSPA: 32 + LSPB: 32 + LSPMXSA: 384 + LSPMXSB: 32 + LVCA: 8 + LVCB: 8 + LVCMXSA: 1 + LVCMXSB: 8 + LVPA: 1 + LVPB: 1 + LVPMXSA: 32 + LVPMXSB: 32 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsBlockSizePerPadMXSA: 0 + LdsBlockSizePerPadMXSB: 0 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 122112 + LdsInitCVgprs: false + 
LdsNumBytes: 122112 + LdsNumElementsAlignedA: 49152 + LdsNumElementsAlignedB: 4096 + LdsNumElementsAlignedMXSA: 3072 + LdsNumElementsAlignedMXSB: 256 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 52480 + LdsOffsetB_Blk: 118016 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMXSA: 49152 + LdsOffsetMXSA_Blk: 114688 + LdsOffsetMXSB: 52224 + LdsOffsetMXSB_Blk: 117760 + LdsOffsetMetadata: 52480 + LdsOffsetMetadata_Blk: 118016 + LdsPadA: 0 + LdsPadB: 0 + LdsPadMXSA: 0 + LdsPadMXSB: 0 + LdsPadMetadata: 0 + LocalReadVectorWidth: -1 + LocalReadVectorWidthA: 32 + LocalReadVectorWidthB: 32 + LocalReadVectorWidthMXSA: 1 + LocalReadVectorWidthMXSB: 1 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LocalWriteUseSgprMXSA: true + LocalWriteUseSgprMXSB: true + LoopIters: 2 + LoopUnroll: 256 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMXSA: 1 + MIInputPerThreadMXSB: 1 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [4, 1] + MIWaveTile: [6, 2] + MIWaveTileA: 6 + MIWaveTileB: 2 + MIWaveTileMXSA: 6 + MIWaveTileMXSB: 2 + MIWaveTileMetadata: 0 + MacroTile0: 384 + MacroTile1: 32 + MacroTileA: 384 + MacroTileB: 32 + MacroTileMXSA: 384 + MacroTileMXSB: 32 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + MinGRIncPerMfma: 3 + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonDTLTailLoopMXSA: true + NonDTLTailLoopMXSB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + 
NonTemporalD: 4 + NonTemporalE: 0 + NonTemporalMXSA: 0 + NonTemporalMXSB: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 48 + NumGlobalWriteVectorsPerThread: 48 + NumLdsBlk: 2 + NumLoadsA: 12 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsCoalescedMXSA: 1 + NumLoadsCoalescedMXSB: 1 + NumLoadsMXSA: 1 + NumLoadsMXSB: 1 + NumLoadsPerpendicularA: 12 + NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularMXSA: 1 + NumLoadsPerpendicularMXSB: 1 + NumThreads: 256 + NumTotalPackedLoadsA: -1 + NumTotalPackedLoadsB: -1 + NumTotalPackedLoadsMXSA: -1 + NumTotalPackedLoadsMXSB: -1 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGROverBarrier: 0 + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 40 + SolutionNameMin: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT384x32x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT6_2_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM16_WGMXCC2_WGMXCCGn1 + SourceSwap: false + SpaceFillingAlgo: [] + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 4 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 16 + SubGroup1: 16 + 
SubGroupA: 16 + SubGroupB: 16 + SubGroupMXSA: 16 + SubGroupMXSB: 16 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: 0 + TailloopInNll: false + ThreadTile: [1, 1] + ThreadTile0: 24 + ThreadTile1: 2 + ThreadTileA: 24 + ThreadTileB: 2 + ThreadTileMXSA: 24 + ThreadTileMXSB: 2 + TransposeLDS: 1 + TransposeLDSMetadata: -1 + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMXSA: true + UnrollMajorLDSMXSB: true + Use64bShadowLimit: false + Use64bShadowLimitMX: false + UseCustomMainLoopSchedule: 0 + UseDirect32XEmulation: false + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: false + UseGeneralizedNLCOneA: false + UseGeneralizedNLCOneB: false + UseGeneralizedNLCOneMXSA: false + UseGeneralizedNLCOneMXSB: false + UseGeneralizedNLCOneMetadata: false + UseInstOffsetForGRO: 0 + UseMFMAF32XEmulation: false + UsePLRPack: 0 + UseSgprForGRO: -1 + UseSubtileImpl: true + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 1 + VectorWidthMXSA: 1 + VectorWidthMXSB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMXSA: 0 + WaveSeparateGlobalReadMXSB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [64, 4, 1] + WorkGroupMapping: 16 + WorkGroupMappingXCC: 2 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 256 + _DepthUA: 256 + _DepthUB: 256 + _DepthUMXSA: 8 + _DepthUMXSB: 8 + _DepthUMetadata: 256 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + enableLDSTrMXSA: false + enableLDSTrMXSB: false + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + tailLoopOptMXSA: false + tailLoopOptMXSB: false 
+ - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AdaptiveGemm: 0 + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1DivByMT1LowbitGT1: 0 + AssertFree1ElementMultiple: 1 + AssertKRingShiftTailWrapOnly: 0 + AssertSummationElementMultiple: 256 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BAddrInterleave: false + BaseName: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT38iWtaR8v4gfASTnshP-7Zt37fJ-3bIKngSBlzEMd0qgE= + BufferLoad: true + BufferStore: 1 + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 256 + DirectToLds: 1 + DirectToLdsA: true + DirectToLdsB: true + DirectToLdsMXSA: true + DirectToLdsMXSB: true + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprMXSA: false + DirectToVgprMXSB: false + DirectToVgprSparseMetadata: false + DtlPlusLdsBuf: 0 + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: false + ExpertSchedulingMode: 0 + ExtraLatencyForLR: 0 + ExtraMiLatencyLeft: -1 + ForceDisableShadowInit: false + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 32 + GlobalReadVectorWidthB: 32 + GlobalReadVectorWidthMXSA: 12 + GlobalReadVectorWidthMXSB: 3 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + KRingShift: false + Kernel: true + KernelLanguage: Assembly + KernelNameMin: 
Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT384x96x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT6_6_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1 + LDSTrInst: false + LSCA: 256 + LSCB: 256 + LSCMXSA: 8 + LSCMXSB: 8 + LSPA: 32 + LSPB: 32 + LSPMXSA: 384 + LSPMXSB: 96 + LVCA: 8 + LVCB: 8 + LVCMXSA: 1 + LVCMXSB: 3 + LVPA: 1 + LVPB: 1 + LVPMXSA: 32 + LVPMXSB: 32 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsBlockSizePerPadMXSA: 0 + LdsBlockSizePerPadMXSB: 0 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 130816 + LdsInitCVgprs: false + LdsNumBytes: 130816 + LdsNumElementsAlignedA: 49152 + LdsNumElementsAlignedB: 12288 + LdsNumElementsAlignedMXSA: 3072 + LdsNumElementsAlignedMXSB: 768 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 52992 + LdsOffsetB_Blk: 118528 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMXSA: 49152 + LdsOffsetMXSA_Blk: 114688 + LdsOffsetMXSB: 52224 + LdsOffsetMXSB_Blk: 117760 + LdsOffsetMetadata: 52992 + LdsOffsetMetadata_Blk: 118528 + LdsPadA: 0 + LdsPadB: 0 + LdsPadMXSA: 0 + LdsPadMXSB: 0 + LdsPadMetadata: 0 + LocalReadVectorWidth: -1 + LocalReadVectorWidthA: 32 + LocalReadVectorWidthB: 32 + LocalReadVectorWidthMXSA: 1 + LocalReadVectorWidthMXSB: 1 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LocalWriteUseSgprMXSA: true + LocalWriteUseSgprMXSB: true + LoopIters: 2 + LoopUnroll: 256 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + 
MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMXSA: 1 + MIInputPerThreadMXSB: 1 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [4, 1] + MIWaveTile: [6, 6] + MIWaveTileA: 6 + MIWaveTileB: 6 + MIWaveTileMXSA: 6 + MIWaveTileMXSB: 6 + MIWaveTileMetadata: 0 + MacroTile0: 384 + MacroTile1: 96 + MacroTileA: 384 + MacroTileB: 96 + MacroTileMXSA: 384 + MacroTileMXSB: 96 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + MinGRIncPerMfma: 3 + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonDTLTailLoopMXSA: true + NonDTLTailLoopMXSB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 4 + NonTemporalE: 0 + NonTemporalMXSA: 0 + NonTemporalMXSB: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 144 + NumGlobalWriteVectorsPerThread: 144 + NumLdsBlk: 2 + NumLoadsA: 12 + NumLoadsB: 3 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsCoalescedMXSA: 1 + NumLoadsCoalescedMXSB: 1 + NumLoadsMXSA: 1 + NumLoadsMXSB: 1 + NumLoadsPerpendicularA: 12 + NumLoadsPerpendicularB: 3 + NumLoadsPerpendicularMXSA: 1 + NumLoadsPerpendicularMXSB: 1 + NumThreads: 256 + NumTotalPackedLoadsA: -1 + NumTotalPackedLoadsB: -1 + NumTotalPackedLoadsMXSA: -1 + NumTotalPackedLoadsMXSB: -1 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGROverBarrier: 0 + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 41 + 
SolutionNameMin: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT384x96x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT6_6_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM16_WGMXCC2_WGMXCCGn1 + SourceSwap: false + SpaceFillingAlgo: [] + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 4 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SubGroupMXSA: 16 + SubGroupMXSB: 16 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: 0 + TailloopInNll: false + ThreadTile: [1, 1] + ThreadTile0: 24 + ThreadTile1: 6 + ThreadTileA: 24 + ThreadTileB: 6 + ThreadTileMXSA: 24 + ThreadTileMXSB: 6 + TransposeLDS: 1 + TransposeLDSMetadata: -1 + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMXSA: true + UnrollMajorLDSMXSB: true + Use64bShadowLimit: false + Use64bShadowLimitMX: false + UseCustomMainLoopSchedule: 0 + UseDirect32XEmulation: false + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: false + UseGeneralizedNLCOneA: false + UseGeneralizedNLCOneB: false + UseGeneralizedNLCOneMXSA: false + UseGeneralizedNLCOneMXSB: false + UseGeneralizedNLCOneMetadata: false + UseInstOffsetForGRO: 0 + UseMFMAF32XEmulation: false + UsePLRPack: 0 + UseSgprForGRO: -1 + UseSubtileImpl: true + Valid: true + VectorStore: -1 + VectorWidthA: 1 
+ VectorWidthB: 1 + VectorWidthMXSA: 1 + VectorWidthMXSB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMXSA: 0 + WaveSeparateGlobalReadMXSB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [64, 4, 1] + WorkGroupMapping: 16 + WorkGroupMappingXCC: 2 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 256 + _DepthUA: 256 + _DepthUB: 256 + _DepthUMXSA: 8 + _DepthUMXSB: 8 + _DepthUMetadata: 256 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + enableLDSTrMXSA: false + enableLDSTrMXSB: false + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + tailLoopOptMXSA: false + tailLoopOptMXSB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AdaptiveGemm: 0 + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1DivByMT1LowbitGT1: 0 + AssertFree1ElementMultiple: 1 + AssertKRingShiftTailWrapOnly: 0 + AssertSummationElementMultiple: 256 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BAddrInterleave: false + BaseName: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT51My4B9dUWEKuVinUMAqyTn-9nVKbOgw4LZNuajpvbgno= + BufferLoad: true + BufferStore: 1 + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 256 + DirectToLds: 1 + DirectToLdsA: true + DirectToLdsB: true + DirectToLdsMXSA: true + DirectToLdsMXSB: true + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprMXSA: false + DirectToVgprMXSB: false + DirectToVgprSparseMetadata: false 
+ DtlPlusLdsBuf: 0 + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: false + ExpertSchedulingMode: 0 + ExtraLatencyForLR: 0 + ExtraMiLatencyLeft: -1 + ForceDisableShadowInit: false + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 32 + GlobalReadVectorWidthB: 32 + GlobalReadVectorWidthMXSA: 16 + GlobalReadVectorWidthMXSB: 1 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + KRingShift: false + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT512x32x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT8_2_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1 + LDSTrInst: false + LSCA: 256 + LSCB: 256 + LSCMXSA: 8 + LSCMXSB: 8 + LSPA: 32 + LSPB: 32 + LSPMXSA: 512 + LSPMXSB: 32 + LVCA: 8 + LVCB: 8 + LVCMXSA: 1 + LVCMXSB: 8 + LVPA: 1 + LVPB: 1 + LVPMXSA: 32 + LVPMXSB: 32 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsBlockSizePerPadMXSA: 0 + LdsBlockSizePerPadMXSB: 0 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 147968 + LdsInitCVgprs: false + LdsNumBytes: 147968 + 
LdsNumElementsAlignedA: 65536 + LdsNumElementsAlignedB: 4096 + LdsNumElementsAlignedMXSA: 4096 + LdsNumElementsAlignedMXSB: 256 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 73984 + LdsOffsetB: 69888 + LdsOffsetB_Blk: 143872 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMXSA: 65536 + LdsOffsetMXSA_Blk: 139520 + LdsOffsetMXSB: 69632 + LdsOffsetMXSB_Blk: 143616 + LdsOffsetMetadata: 69888 + LdsOffsetMetadata_Blk: 143872 + LdsPadA: 0 + LdsPadB: 0 + LdsPadMXSA: 0 + LdsPadMXSB: 0 + LdsPadMetadata: 0 + LocalReadVectorWidth: -1 + LocalReadVectorWidthA: 32 + LocalReadVectorWidthB: 32 + LocalReadVectorWidthMXSA: 1 + LocalReadVectorWidthMXSB: 1 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LocalWriteUseSgprMXSA: true + LocalWriteUseSgprMXSB: true + LoopIters: 2 + LoopUnroll: 256 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMXSA: 1 + MIInputPerThreadMXSB: 1 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [4, 1] + MIWaveTile: [8, 2] + MIWaveTileA: 8 + MIWaveTileB: 2 + MIWaveTileMXSA: 8 + MIWaveTileMXSB: 2 + MIWaveTileMetadata: 0 + MacroTile0: 512 + MacroTile1: 32 + MacroTileA: 512 + MacroTileB: 32 + MacroTileMXSA: 512 + MacroTileMXSB: 32 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + MinGRIncPerMfma: 3 + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonDTLTailLoopMXSA: true + NonDTLTailLoopMXSB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 4 + NonTemporalE: 0 
+ NonTemporalMXSA: 0 + NonTemporalMXSB: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 64 + NumLdsBlk: 2 + NumLoadsA: 16 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsCoalescedMXSA: 1 + NumLoadsCoalescedMXSB: 1 + NumLoadsMXSA: 1 + NumLoadsMXSB: 1 + NumLoadsPerpendicularA: 16 + NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularMXSA: 1 + NumLoadsPerpendicularMXSB: 1 + NumThreads: 256 + NumTotalPackedLoadsA: -1 + NumTotalPackedLoadsB: -1 + NumTotalPackedLoadsMXSA: -1 + NumTotalPackedLoadsMXSB: -1 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGROverBarrier: 0 + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 42 + SolutionNameMin: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT512x32x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT8_2_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM16_WGMXCC2_WGMXCCGn1 + SourceSwap: false + SpaceFillingAlgo: [] + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: true + StoreSyncOpt: 0 + StoreVectorWidth: 4 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + 
SubGroupMXSA: 16 + SubGroupMXSB: 16 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: 0 + TailloopInNll: false + ThreadTile: [1, 1] + ThreadTile0: 32 + ThreadTile1: 2 + ThreadTileA: 32 + ThreadTileB: 2 + ThreadTileMXSA: 32 + ThreadTileMXSB: 2 + TransposeLDS: 1 + TransposeLDSMetadata: -1 + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMXSA: true + UnrollMajorLDSMXSB: true + Use64bShadowLimit: false + Use64bShadowLimitMX: false + UseCustomMainLoopSchedule: 0 + UseDirect32XEmulation: false + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: false + UseGeneralizedNLCOneA: false + UseGeneralizedNLCOneB: false + UseGeneralizedNLCOneMXSA: false + UseGeneralizedNLCOneMXSB: false + UseGeneralizedNLCOneMetadata: false + UseInstOffsetForGRO: 0 + UseMFMAF32XEmulation: false + UsePLRPack: 0 + UseSgprForGRO: -1 + UseSubtileImpl: true + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 1 + VectorWidthMXSA: 1 + VectorWidthMXSB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMXSA: 0 + WaveSeparateGlobalReadMXSB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [64, 4, 1] + WorkGroupMapping: 16 + WorkGroupMappingXCC: 2 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 256 + _DepthUA: 256 + _DepthUB: 256 + _DepthUMXSA: 8 + _DepthUMXSB: 8 + _DepthUMetadata: 256 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + enableLDSTrMXSA: false + enableLDSTrMXSB: false + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + tailLoopOptMXSA: false + tailLoopOptMXSB: false + - 1LDSBuffer: 0 + 
ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AdaptiveGemm: 0 + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1DivByMT1LowbitGT1: 0 + AssertFree1ElementMultiple: 1 + AssertKRingShiftTailWrapOnly: 0 + AssertSummationElementMultiple: 256 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BAddrInterleave: false + BaseName: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT32dv3nH_ug3zWfzXFJEJHJUvkpjXQjAQVml90QMZvwYvs= + BufferLoad: true + BufferStore: 1 + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 256 + DirectToLds: 1 + DirectToLdsA: true + DirectToLdsB: true + DirectToLdsMXSA: true + DirectToLdsMXSB: true + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprMXSA: false + DirectToVgprMXSB: false + DirectToVgprSparseMetadata: false + DtlPlusLdsBuf: 0 + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: false + ExpertSchedulingMode: 0 + ExtraLatencyForLR: 0 + ExtraMiLatencyLeft: -1 + ForceDisableShadowInit: false + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 32 + GlobalReadVectorWidthB: 32 + GlobalReadVectorWidthMXSA: 1 + GlobalReadVectorWidthMXSB: 4 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + KRingShift: false + Kernel: true + KernelLanguage: Assembly + KernelNameMin: 
Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT32x128x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT2_2_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_16_1 + LDSTrInst: false + LSCA: 256 + LSCB: 256 + LSCMXSA: 8 + LSCMXSB: 8 + LSPA: 32 + LSPB: 32 + LSPMXSA: 32 + LSPMXSB: 128 + LVCA: 8 + LVCB: 8 + LVCMXSA: 8 + LVCMXSB: 2 + LVPA: 1 + LVPB: 1 + LVPMXSA: 32 + LVPMXSB: 32 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsBlockSizePerPadMXSA: 0 + LdsBlockSizePerPadMXSB: 0 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 54528 + LdsInitCVgprs: false + LdsNumBytes: 54528 + LdsNumElementsAlignedA: 4096 + LdsNumElementsAlignedB: 16384 + LdsNumElementsAlignedMXSA: 256 + LdsNumElementsAlignedMXSB: 1024 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 32768 + LdsOffsetB: 5376 + LdsOffsetB_Blk: 38144 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMXSA: 4096 + LdsOffsetMXSA_Blk: 36864 + LdsOffsetMXSB: 4352 + LdsOffsetMXSB_Blk: 37120 + LdsOffsetMetadata: 5376 + LdsOffsetMetadata_Blk: 38144 + LdsPadA: 0 + LdsPadB: 0 + LdsPadMXSA: 0 + LdsPadMXSB: 0 + LdsPadMetadata: 0 + LocalReadVectorWidth: -1 + LocalReadVectorWidthA: 32 + LocalReadVectorWidthB: 32 + LocalReadVectorWidthMXSA: 1 + LocalReadVectorWidthMXSB: 1 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LocalWriteUseSgprMXSA: true + LocalWriteUseSgprMXSB: true + LoopIters: 2 + LoopUnroll: 256 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 
32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMXSA: 1 + MIInputPerThreadMXSB: 1 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [1, 4] + MIWaveTile: [2, 2] + MIWaveTileA: 2 + MIWaveTileB: 2 + MIWaveTileMXSA: 2 + MIWaveTileMXSB: 2 + MIWaveTileMetadata: 0 + MacroTile0: 32 + MacroTile1: 128 + MacroTileA: 32 + MacroTileB: 128 + MacroTileMXSA: 32 + MacroTileMXSB: 128 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + MinGRIncPerMfma: 3 + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonDTLTailLoopMXSA: true + NonDTLTailLoopMXSB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 4 + NonTemporalE: 0 + NonTemporalMXSA: 0 + NonTemporalMXSB: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 16 + NumLdsBlk: 2 + NumLoadsA: 1 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsCoalescedMXSA: 1 + NumLoadsCoalescedMXSB: 1 + NumLoadsMXSA: 1 + NumLoadsMXSB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 4 + NumLoadsPerpendicularMXSA: 1 + NumLoadsPerpendicularMXSB: 1 + NumThreads: 256 + NumTotalPackedLoadsA: -1 + NumTotalPackedLoadsB: -1 + NumTotalPackedLoadsMXSA: -1 + NumTotalPackedLoadsMXSB: -1 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGROverBarrier: 0 + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 43 + SolutionNameMin: 
Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT32x128x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT2_2_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM16_WGMXCC2_WGMXCCGn1 + SourceSwap: false + SpaceFillingAlgo: [] + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 4 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 4 + SubGroup1: 64 + SubGroupA: 4 + SubGroupB: 64 + SubGroupMXSA: 4 + SubGroupMXSB: 64 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: 0 + TailloopInNll: false + ThreadTile: [1, 1] + ThreadTile0: 8 + ThreadTile1: 2 + ThreadTileA: 8 + ThreadTileB: 2 + ThreadTileMXSA: 8 + ThreadTileMXSB: 2 + TransposeLDS: 1 + TransposeLDSMetadata: -1 + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMXSA: true + UnrollMajorLDSMXSB: true + Use64bShadowLimit: false + Use64bShadowLimitMX: false + UseCustomMainLoopSchedule: 0 + UseDirect32XEmulation: false + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: false + UseGeneralizedNLCOneA: false + UseGeneralizedNLCOneB: false + UseGeneralizedNLCOneMXSA: false + UseGeneralizedNLCOneMXSB: false + UseGeneralizedNLCOneMetadata: false + UseInstOffsetForGRO: 0 + UseMFMAF32XEmulation: false + UsePLRPack: 0 + UseSgprForGRO: -1 + UseSubtileImpl: true + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 1 + 
VectorWidthMXSA: 1 + VectorWidthMXSB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMXSA: 0 + WaveSeparateGlobalReadMXSB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 16 + WorkGroupMappingXCC: 2 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 256 + _DepthUA: 256 + _DepthUB: 256 + _DepthUMXSA: 8 + _DepthUMXSB: 8 + _DepthUMetadata: 256 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + enableLDSTrMXSA: false + enableLDSTrMXSB: false + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + tailLoopOptMXSA: false + tailLoopOptMXSB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AdaptiveGemm: 0 + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1DivByMT1LowbitGT1: 0 + AssertFree1ElementMultiple: 1 + AssertKRingShiftTailWrapOnly: 0 + AssertSummationElementMultiple: 256 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BAddrInterleave: false + BaseName: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT96VdJ139l5ExTu5vrbSyr9YIDnqGsFY6rZMfGBgYGwyok= + BufferLoad: true + BufferStore: 1 + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 256 + DirectToLds: 1 + DirectToLdsA: true + DirectToLdsB: true + DirectToLdsMXSA: true + DirectToLdsMXSB: true + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprMXSA: false + DirectToVgprMXSB: false + DirectToVgprSparseMetadata: false + DtlPlusLdsBuf: 0 
+ EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: false + ExpertSchedulingMode: 0 + ExtraLatencyForLR: 0 + ExtraMiLatencyLeft: -1 + ForceDisableShadowInit: false + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 32 + GlobalReadVectorWidthB: 32 + GlobalReadVectorWidthMXSA: 3 + GlobalReadVectorWidthMXSB: 4 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + KRingShift: false + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT96x128x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT6_2_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_16_1 + LDSTrInst: false + LSCA: 256 + LSCB: 256 + LSCMXSA: 8 + LSCMXSB: 8 + LSPA: 32 + LSPB: 32 + LSPMXSA: 96 + LSPMXSB: 128 + LVCA: 8 + LVCB: 8 + LVCMXSA: 3 + LVCMXSB: 2 + LVPA: 1 + LVPB: 1 + LVPMXSA: 32 + LVPMXSB: 32 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsBlockSizePerPadMXSA: 0 + LdsBlockSizePerPadMXSB: 0 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 63232 + LdsInitCVgprs: false + LdsNumBytes: 63232 + LdsNumElementsAlignedA: 12288 + 
LdsNumElementsAlignedB: 16384 + LdsNumElementsAlignedMXSA: 768 + LdsNumElementsAlignedMXSB: 1024 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 32768 + LdsOffsetB: 14080 + LdsOffsetB_Blk: 46848 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMXSA: 12288 + LdsOffsetMXSA_Blk: 45056 + LdsOffsetMXSB: 13056 + LdsOffsetMXSB_Blk: 45824 + LdsOffsetMetadata: 14080 + LdsOffsetMetadata_Blk: 46848 + LdsPadA: 0 + LdsPadB: 0 + LdsPadMXSA: 0 + LdsPadMXSB: 0 + LdsPadMetadata: 0 + LocalReadVectorWidth: -1 + LocalReadVectorWidthA: 32 + LocalReadVectorWidthB: 32 + LocalReadVectorWidthMXSA: 1 + LocalReadVectorWidthMXSB: 1 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LocalWriteUseSgprMXSA: true + LocalWriteUseSgprMXSB: true + LoopIters: 2 + LoopUnroll: 256 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMXSA: 1 + MIInputPerThreadMXSB: 1 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [1, 4] + MIWaveTile: [6, 2] + MIWaveTileA: 6 + MIWaveTileB: 2 + MIWaveTileMXSA: 6 + MIWaveTileMXSB: 2 + MIWaveTileMetadata: 0 + MacroTile0: 96 + MacroTile1: 128 + MacroTileA: 96 + MacroTileB: 128 + MacroTileMXSA: 96 + MacroTileMXSB: 128 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + MinGRIncPerMfma: 3 + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonDTLTailLoopMXSA: true + NonDTLTailLoopMXSB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 4 + NonTemporalE: 0 + NonTemporalMXSA: 0 + 
NonTemporalMXSB: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 48 + NumGlobalWriteVectorsPerThread: 48 + NumLdsBlk: 2 + NumLoadsA: 3 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsCoalescedMXSA: 1 + NumLoadsCoalescedMXSB: 1 + NumLoadsMXSA: 1 + NumLoadsMXSB: 1 + NumLoadsPerpendicularA: 3 + NumLoadsPerpendicularB: 4 + NumLoadsPerpendicularMXSA: 1 + NumLoadsPerpendicularMXSB: 1 + NumThreads: 256 + NumTotalPackedLoadsA: -1 + NumTotalPackedLoadsB: -1 + NumTotalPackedLoadsMXSA: -1 + NumTotalPackedLoadsMXSB: -1 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGROverBarrier: 0 + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 44 + SolutionNameMin: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT96x128x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT6_2_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM16_WGMXCC2_WGMXCCGn1 + SourceSwap: false + SpaceFillingAlgo: [] + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 4 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 4 + SubGroup1: 64 + SubGroupA: 4 + SubGroupB: 64 + SubGroupMXSA: 4 + SubGroupMXSB: 64 + 
SuppressNoLoadLoop: false + SwapGlobalReadOrder: 0 + TailloopInNll: false + ThreadTile: [1, 1] + ThreadTile0: 24 + ThreadTile1: 2 + ThreadTileA: 24 + ThreadTileB: 2 + ThreadTileMXSA: 24 + ThreadTileMXSB: 2 + TransposeLDS: 1 + TransposeLDSMetadata: -1 + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMXSA: true + UnrollMajorLDSMXSB: true + Use64bShadowLimit: false + Use64bShadowLimitMX: false + UseCustomMainLoopSchedule: 0 + UseDirect32XEmulation: false + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: false + UseGeneralizedNLCOneA: false + UseGeneralizedNLCOneB: false + UseGeneralizedNLCOneMXSA: false + UseGeneralizedNLCOneMXSB: false + UseGeneralizedNLCOneMetadata: false + UseInstOffsetForGRO: 0 + UseMFMAF32XEmulation: false + UsePLRPack: 0 + UseSgprForGRO: -1 + UseSubtileImpl: true + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 1 + VectorWidthMXSA: 1 + VectorWidthMXSB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMXSA: 0 + WaveSeparateGlobalReadMXSB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 16 + WorkGroupMappingXCC: 2 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 256 + _DepthUA: 256 + _DepthUB: 256 + _DepthUMXSA: 8 + _DepthUMXSB: 8 + _DepthUMetadata: 256 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + enableLDSTrMXSA: false + enableLDSTrMXSB: false + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + tailLoopOptMXSA: false + tailLoopOptMXSB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: false 
+ ActivationFused: true + AdaptiveGemm: 0 + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1DivByMT1LowbitGT1: 0 + AssertFree1ElementMultiple: 1 + AssertKRingShiftTailWrapOnly: 0 + AssertSummationElementMultiple: 256 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BAddrInterleave: false + BaseName: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT32pDTSO9Ei8iprYUZVwZ8MVK8XTu8S8ugrlkQJj7C1FJQ= + BufferLoad: true + BufferStore: 1 + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 256 + DirectToLds: 1 + DirectToLdsA: true + DirectToLdsB: true + DirectToLdsMXSA: true + DirectToLdsMXSB: true + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprMXSA: false + DirectToVgprMXSB: false + DirectToVgprSparseMetadata: false + DtlPlusLdsBuf: 0 + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: false + ExpertSchedulingMode: 0 + ExtraLatencyForLR: 0 + ExtraMiLatencyLeft: -1 + ForceDisableShadowInit: false + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 32 + GlobalReadVectorWidthB: 32 + GlobalReadVectorWidthMXSA: 1 + GlobalReadVectorWidthMXSB: 4 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + KRingShift: false + Kernel: true + KernelLanguage: Assembly + KernelNameMin: 
Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT32x256x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT2_4_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_16_1 + LDSTrInst: false + LSCA: 256 + LSCB: 256 + LSCMXSA: 8 + LSCMXSB: 8 + LSPA: 32 + LSPB: 32 + LSPMXSA: 32 + LSPMXSB: 128 + LVCA: 8 + LVCB: 8 + LVCMXSA: 8 + LVCMXSB: 2 + LVPA: 1 + LVPB: 1 + LVPMXSA: 32 + LVPMXSB: 32 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsBlockSizePerPadMXSA: 0 + LdsBlockSizePerPadMXSB: 0 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 104704 + LdsInitCVgprs: false + LdsNumBytes: 104704 + LdsNumElementsAlignedA: 4096 + LdsNumElementsAlignedB: 32768 + LdsNumElementsAlignedMXSA: 256 + LdsNumElementsAlignedMXSB: 2048 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 6400 + LdsOffsetB_Blk: 71936 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMXSA: 4096 + LdsOffsetMXSA_Blk: 69632 + LdsOffsetMXSB: 4352 + LdsOffsetMXSB_Blk: 69888 + LdsOffsetMetadata: 6400 + LdsOffsetMetadata_Blk: 71936 + LdsPadA: 0 + LdsPadB: 0 + LdsPadMXSA: 0 + LdsPadMXSB: 0 + LdsPadMetadata: 0 + LocalReadVectorWidth: -1 + LocalReadVectorWidthA: 32 + LocalReadVectorWidthB: 32 + LocalReadVectorWidthMXSA: 1 + LocalReadVectorWidthMXSB: 1 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LocalWriteUseSgprMXSA: true + LocalWriteUseSgprMXSB: true + LoopIters: 2 + LoopUnroll: 256 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + 
MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMXSA: 1 + MIInputPerThreadMXSB: 1 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [1, 4] + MIWaveTile: [2, 4] + MIWaveTileA: 2 + MIWaveTileB: 4 + MIWaveTileMXSA: 2 + MIWaveTileMXSB: 4 + MIWaveTileMetadata: 0 + MacroTile0: 32 + MacroTile1: 256 + MacroTileA: 32 + MacroTileB: 256 + MacroTileMXSA: 32 + MacroTileMXSB: 256 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + MinGRIncPerMfma: 3 + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonDTLTailLoopMXSA: true + NonDTLTailLoopMXSB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 4 + NonTemporalE: 0 + NonTemporalMXSA: 0 + NonTemporalMXSB: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 32 + NumLdsBlk: 2 + NumLoadsA: 1 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsCoalescedMXSA: 1 + NumLoadsCoalescedMXSB: 1 + NumLoadsMXSA: 1 + NumLoadsMXSB: 2 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 8 + NumLoadsPerpendicularMXSA: 1 + NumLoadsPerpendicularMXSB: 2 + NumThreads: 256 + NumTotalPackedLoadsA: -1 + NumTotalPackedLoadsB: -1 + NumTotalPackedLoadsMXSA: -1 + NumTotalPackedLoadsMXSB: -1 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGROverBarrier: 0 + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 45 + SolutionNameMin: 
Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT32x256x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT2_4_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM16_WGMXCC2_WGMXCCGn1 + SourceSwap: false + SpaceFillingAlgo: [] + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 4 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 4 + SubGroup1: 64 + SubGroupA: 4 + SubGroupB: 64 + SubGroupMXSA: 4 + SubGroupMXSB: 64 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: 0 + TailloopInNll: false + ThreadTile: [1, 1] + ThreadTile0: 8 + ThreadTile1: 4 + ThreadTileA: 8 + ThreadTileB: 4 + ThreadTileMXSA: 8 + ThreadTileMXSB: 4 + TransposeLDS: 1 + TransposeLDSMetadata: -1 + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMXSA: true + UnrollMajorLDSMXSB: true + Use64bShadowLimit: false + Use64bShadowLimitMX: false + UseCustomMainLoopSchedule: 0 + UseDirect32XEmulation: false + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: false + UseGeneralizedNLCOneA: false + UseGeneralizedNLCOneB: false + UseGeneralizedNLCOneMXSA: false + UseGeneralizedNLCOneMXSB: false + UseGeneralizedNLCOneMetadata: false + UseInstOffsetForGRO: 0 + UseMFMAF32XEmulation: false + UsePLRPack: 0 + UseSgprForGRO: -1 + UseSubtileImpl: true + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 1 + 
VectorWidthMXSA: 1 + VectorWidthMXSB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMXSA: 0 + WaveSeparateGlobalReadMXSB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 16 + WorkGroupMappingXCC: 2 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 256 + _DepthUA: 256 + _DepthUB: 256 + _DepthUMXSA: 8 + _DepthUMXSB: 8 + _DepthUMetadata: 256 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + enableLDSTrMXSA: false + enableLDSTrMXSB: false + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + tailLoopOptMXSA: false + tailLoopOptMXSB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AdaptiveGemm: 0 + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1DivByMT1LowbitGT1: 0 + AssertFree1ElementMultiple: 1 + AssertKRingShiftTailWrapOnly: 0 + AssertSummationElementMultiple: 256 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BAddrInterleave: false + BaseName: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT96Cj2Z80w1PkWDv8Q4NtOmErGoRYnlYoT1xjNw40Qn6y8= + BufferLoad: true + BufferStore: 1 + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 256 + DirectToLds: 1 + DirectToLdsA: true + DirectToLdsB: true + DirectToLdsMXSA: true + DirectToLdsMXSB: true + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprMXSA: false + DirectToVgprMXSB: false + DirectToVgprSparseMetadata: false + DtlPlusLdsBuf: 0 
+ EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: false + ExpertSchedulingMode: 0 + ExtraLatencyForLR: 0 + ExtraMiLatencyLeft: -1 + ForceDisableShadowInit: false + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 32 + GlobalReadVectorWidthB: 32 + GlobalReadVectorWidthMXSA: 3 + GlobalReadVectorWidthMXSB: 4 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + KRingShift: false + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT96x256x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT6_4_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_16_1 + LDSTrInst: false + LSCA: 256 + LSCB: 256 + LSCMXSA: 8 + LSCMXSB: 8 + LSPA: 32 + LSPB: 32 + LSPMXSA: 96 + LSPMXSB: 128 + LVCA: 8 + LVCB: 8 + LVCMXSA: 3 + LVCMXSB: 2 + LVPA: 1 + LVPB: 1 + LVPMXSA: 32 + LVPMXSB: 32 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsBlockSizePerPadMXSA: 0 + LdsBlockSizePerPadMXSB: 0 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 113408 + LdsInitCVgprs: false + LdsNumBytes: 113408 + LdsNumElementsAlignedA: 12288 + 
LdsNumElementsAlignedB: 32768 + LdsNumElementsAlignedMXSA: 768 + LdsNumElementsAlignedMXSB: 2048 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 15104 + LdsOffsetB_Blk: 80640 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMXSA: 12288 + LdsOffsetMXSA_Blk: 77824 + LdsOffsetMXSB: 13056 + LdsOffsetMXSB_Blk: 78592 + LdsOffsetMetadata: 15104 + LdsOffsetMetadata_Blk: 80640 + LdsPadA: 0 + LdsPadB: 0 + LdsPadMXSA: 0 + LdsPadMXSB: 0 + LdsPadMetadata: 0 + LocalReadVectorWidth: -1 + LocalReadVectorWidthA: 32 + LocalReadVectorWidthB: 32 + LocalReadVectorWidthMXSA: 1 + LocalReadVectorWidthMXSB: 1 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LocalWriteUseSgprMXSA: true + LocalWriteUseSgprMXSB: true + LoopIters: 2 + LoopUnroll: 256 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMXSA: 1 + MIInputPerThreadMXSB: 1 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [1, 4] + MIWaveTile: [6, 4] + MIWaveTileA: 6 + MIWaveTileB: 4 + MIWaveTileMXSA: 6 + MIWaveTileMXSB: 4 + MIWaveTileMetadata: 0 + MacroTile0: 96 + MacroTile1: 256 + MacroTileA: 96 + MacroTileB: 256 + MacroTileMXSA: 96 + MacroTileMXSB: 256 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + MinGRIncPerMfma: 3 + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonDTLTailLoopMXSA: true + NonDTLTailLoopMXSB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 4 + NonTemporalE: 0 + NonTemporalMXSA: 0 + 
NonTemporalMXSB: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 96 + NumGlobalWriteVectorsPerThread: 96 + NumLdsBlk: 2 + NumLoadsA: 3 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsCoalescedMXSA: 1 + NumLoadsCoalescedMXSB: 1 + NumLoadsMXSA: 1 + NumLoadsMXSB: 2 + NumLoadsPerpendicularA: 3 + NumLoadsPerpendicularB: 8 + NumLoadsPerpendicularMXSA: 1 + NumLoadsPerpendicularMXSB: 2 + NumThreads: 256 + NumTotalPackedLoadsA: -1 + NumTotalPackedLoadsB: -1 + NumTotalPackedLoadsMXSA: -1 + NumTotalPackedLoadsMXSB: -1 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGROverBarrier: 0 + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 46 + SolutionNameMin: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT96x256x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT6_4_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM16_WGMXCC2_WGMXCCGn1 + SourceSwap: false + SpaceFillingAlgo: [] + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 4 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 4 + SubGroup1: 64 + SubGroupA: 4 + SubGroupB: 64 + SubGroupMXSA: 4 + SubGroupMXSB: 64 + 
SuppressNoLoadLoop: false + SwapGlobalReadOrder: 0 + TailloopInNll: false + ThreadTile: [1, 1] + ThreadTile0: 24 + ThreadTile1: 4 + ThreadTileA: 24 + ThreadTileB: 4 + ThreadTileMXSA: 24 + ThreadTileMXSB: 4 + TransposeLDS: 1 + TransposeLDSMetadata: -1 + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMXSA: true + UnrollMajorLDSMXSB: true + Use64bShadowLimit: false + Use64bShadowLimitMX: false + UseCustomMainLoopSchedule: 0 + UseDirect32XEmulation: false + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: false + UseGeneralizedNLCOneA: false + UseGeneralizedNLCOneB: false + UseGeneralizedNLCOneMXSA: false + UseGeneralizedNLCOneMXSB: false + UseGeneralizedNLCOneMetadata: false + UseInstOffsetForGRO: 0 + UseMFMAF32XEmulation: false + UsePLRPack: 0 + UseSgprForGRO: -1 + UseSubtileImpl: true + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 1 + VectorWidthMXSA: 1 + VectorWidthMXSB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMXSA: 0 + WaveSeparateGlobalReadMXSB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 16 + WorkGroupMappingXCC: 2 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 256 + _DepthUA: 256 + _DepthUB: 256 + _DepthUMXSA: 8 + _DepthUMXSB: 8 + _DepthUMetadata: 256 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + enableLDSTrMXSA: false + enableLDSTrMXSB: false + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + tailLoopOptMXSA: false + tailLoopOptMXSB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: false 
+ ActivationFused: true + AdaptiveGemm: 0 + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1DivByMT1LowbitGT1: 0 + AssertFree1ElementMultiple: 1 + AssertKRingShiftTailWrapOnly: 0 + AssertSummationElementMultiple: 256 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BAddrInterleave: false + BaseName: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT16rkdIx0xJ9AdmxSLTSuPL_HwvXURN3d9JBsT0i2N2r4A= + BufferLoad: true + BufferStore: 1 + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 256 + DirectToLds: 1 + DirectToLdsA: true + DirectToLdsB: true + DirectToLdsMXSA: true + DirectToLdsMXSB: true + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprMXSA: false + DirectToVgprMXSB: false + DirectToVgprSparseMetadata: false + DtlPlusLdsBuf: 0 + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: false + ExpertSchedulingMode: 0 + ExtraLatencyForLR: 0 + ExtraMiLatencyLeft: -1 + ForceDisableShadowInit: false + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 32 + GlobalReadVectorWidthB: 32 + GlobalReadVectorWidthMXSA: 5 + GlobalReadVectorWidthMXSB: 4 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + KRingShift: false + Kernel: true + KernelLanguage: Assembly + KernelNameMin: 
Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT160x256x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT10_4_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_16_1 + LDSTrInst: false + LSCA: 256 + LSCB: 256 + LSCMXSA: 8 + LSCMXSB: 8 + LSPA: 32 + LSPB: 32 + LSPMXSA: 160 + LSPMXSB: 128 + LVCA: 8 + LVCB: 8 + LVCMXSA: 2 + LVCMXSB: 2 + LVPA: 1 + LVPB: 1 + LVPMXSA: 32 + LVPMXSB: 32 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsBlockSizePerPadMXSA: 0 + LdsBlockSizePerPadMXSB: 0 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 122112 + LdsInitCVgprs: false + LdsNumBytes: 122112 + LdsNumElementsAlignedA: 20480 + LdsNumElementsAlignedB: 32768 + LdsNumElementsAlignedMXSA: 1280 + LdsNumElementsAlignedMXSB: 2048 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 23808 + LdsOffsetB_Blk: 89344 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMXSA: 20480 + LdsOffsetMXSA_Blk: 86016 + LdsOffsetMXSB: 21760 + LdsOffsetMXSB_Blk: 87296 + LdsOffsetMetadata: 23808 + LdsOffsetMetadata_Blk: 89344 + LdsPadA: 0 + LdsPadB: 0 + LdsPadMXSA: 0 + LdsPadMXSB: 0 + LdsPadMetadata: 0 + LocalReadVectorWidth: -1 + LocalReadVectorWidthA: 32 + LocalReadVectorWidthB: 32 + LocalReadVectorWidthMXSA: 1 + LocalReadVectorWidthMXSB: 1 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LocalWriteUseSgprMXSA: true + LocalWriteUseSgprMXSB: true + LoopIters: 2 + LoopUnroll: 256 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + 
MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMXSA: 1 + MIInputPerThreadMXSB: 1 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [1, 4] + MIWaveTile: [10, 4] + MIWaveTileA: 10 + MIWaveTileB: 4 + MIWaveTileMXSA: 10 + MIWaveTileMXSB: 4 + MIWaveTileMetadata: 0 + MacroTile0: 160 + MacroTile1: 256 + MacroTileA: 160 + MacroTileB: 256 + MacroTileMXSA: 160 + MacroTileMXSB: 256 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + MinGRIncPerMfma: 3 + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonDTLTailLoopMXSA: true + NonDTLTailLoopMXSB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 4 + NonTemporalE: 0 + NonTemporalMXSA: 0 + NonTemporalMXSB: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 160 + NumGlobalWriteVectorsPerThread: 160 + NumLdsBlk: 2 + NumLoadsA: 5 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsCoalescedMXSA: 1 + NumLoadsCoalescedMXSB: 1 + NumLoadsMXSA: 1 + NumLoadsMXSB: 2 + NumLoadsPerpendicularA: 5 + NumLoadsPerpendicularB: 8 + NumLoadsPerpendicularMXSA: 1 + NumLoadsPerpendicularMXSB: 2 + NumThreads: 256 + NumTotalPackedLoadsA: -1 + NumTotalPackedLoadsB: -1 + NumTotalPackedLoadsMXSA: -1 + NumTotalPackedLoadsMXSB: -1 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGROverBarrier: 0 + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 47 + 
SolutionNameMin: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT160x256x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT10_4_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM16_WGMXCC2_WGMXCCGn1 + SourceSwap: false + SpaceFillingAlgo: [] + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 4 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 4 + SubGroup1: 64 + SubGroupA: 4 + SubGroupB: 64 + SubGroupMXSA: 4 + SubGroupMXSB: 64 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: 0 + TailloopInNll: false + ThreadTile: [1, 1] + ThreadTile0: 40 + ThreadTile1: 4 + ThreadTileA: 40 + ThreadTileB: 4 + ThreadTileMXSA: 40 + ThreadTileMXSB: 4 + TransposeLDS: 1 + TransposeLDSMetadata: -1 + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMXSA: true + UnrollMajorLDSMXSB: true + Use64bShadowLimit: false + Use64bShadowLimitMX: false + UseCustomMainLoopSchedule: 0 + UseDirect32XEmulation: false + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: false + UseGeneralizedNLCOneA: false + UseGeneralizedNLCOneB: false + UseGeneralizedNLCOneMXSA: false + UseGeneralizedNLCOneMXSB: false + UseGeneralizedNLCOneMetadata: false + UseInstOffsetForGRO: 0 + UseMFMAF32XEmulation: false + UsePLRPack: 0 + UseSgprForGRO: -1 + UseSubtileImpl: true + Valid: true + VectorStore: -1 + VectorWidthA: 1 
+ VectorWidthB: 1 + VectorWidthMXSA: 1 + VectorWidthMXSB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMXSA: 0 + WaveSeparateGlobalReadMXSB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 16 + WorkGroupMappingXCC: 2 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 256 + _DepthUA: 256 + _DepthUB: 256 + _DepthUMXSA: 8 + _DepthUMXSB: 8 + _DepthUMetadata: 256 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + enableLDSTrMXSA: false + enableLDSTrMXSB: false + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + tailLoopOptMXSA: false + tailLoopOptMXSB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AdaptiveGemm: 0 + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1DivByMT1LowbitGT1: 0 + AssertFree1ElementMultiple: 1 + AssertKRingShiftTailWrapOnly: 0 + AssertSummationElementMultiple: 256 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BAddrInterleave: false + BaseName: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT28HX9AX04H5gx2Hnt9CFkwE4Io5BNr8Him7sxPkQ6FxSI= + BufferLoad: true + BufferStore: 1 + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 256 + DirectToLds: 1 + DirectToLdsA: true + DirectToLdsB: true + DirectToLdsMXSA: true + DirectToLdsMXSB: true + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprMXSA: false + DirectToVgprMXSB: false + DirectToVgprSparseMetadata: 
false + DtlPlusLdsBuf: 0 + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: false + ExpertSchedulingMode: 0 + ExtraLatencyForLR: 0 + ExtraMiLatencyLeft: -1 + ForceDisableShadowInit: false + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 32 + GlobalReadVectorWidthB: 32 + GlobalReadVectorWidthMXSA: 9 + GlobalReadVectorWidthMXSB: 4 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + KRingShift: false + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT288x256x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT18_4_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_16_1 + LDSTrInst: false + LSCA: 256 + LSCB: 256 + LSCMXSA: 8 + LSCMXSB: 8 + LSPA: 32 + LSPB: 32 + LSPMXSA: 288 + LSPMXSB: 128 + LVCA: 8 + LVCB: 8 + LVCMXSA: 1 + LVCMXSB: 2 + LVPA: 1 + LVPB: 1 + LVPMXSA: 32 + LVPMXSB: 32 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsBlockSizePerPadMXSA: 0 + LdsBlockSizePerPadMXSB: 0 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 147968 + LdsInitCVgprs: false + LdsNumBytes: 147968 + 
LdsNumElementsAlignedA: 36864 + LdsNumElementsAlignedB: 32768 + LdsNumElementsAlignedMXSA: 2304 + LdsNumElementsAlignedMXSB: 2048 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 73984 + LdsOffsetB: 41216 + LdsOffsetB_Blk: 115200 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMXSA: 36864 + LdsOffsetMXSA_Blk: 110848 + LdsOffsetMXSB: 39168 + LdsOffsetMXSB_Blk: 113152 + LdsOffsetMetadata: 41216 + LdsOffsetMetadata_Blk: 115200 + LdsPadA: 0 + LdsPadB: 0 + LdsPadMXSA: 0 + LdsPadMXSB: 0 + LdsPadMetadata: 0 + LocalReadVectorWidth: -1 + LocalReadVectorWidthA: 32 + LocalReadVectorWidthB: 32 + LocalReadVectorWidthMXSA: 1 + LocalReadVectorWidthMXSB: 1 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LocalWriteUseSgprMXSA: true + LocalWriteUseSgprMXSB: true + LoopIters: 2 + LoopUnroll: 256 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMXSA: 1 + MIInputPerThreadMXSB: 1 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [1, 4] + MIWaveTile: [18, 4] + MIWaveTileA: 18 + MIWaveTileB: 4 + MIWaveTileMXSA: 18 + MIWaveTileMXSB: 4 + MIWaveTileMetadata: 0 + MacroTile0: 288 + MacroTile1: 256 + MacroTileA: 288 + MacroTileB: 256 + MacroTileMXSA: 288 + MacroTileMXSB: 256 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + MinGRIncPerMfma: 3 + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonDTLTailLoopMXSA: true + NonDTLTailLoopMXSB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 4 + 
NonTemporalE: 0 + NonTemporalMXSA: 0 + NonTemporalMXSB: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 288 + NumGlobalWriteVectorsPerThread: 288 + NumLdsBlk: 2 + NumLoadsA: 9 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsCoalescedMXSA: 1 + NumLoadsCoalescedMXSB: 1 + NumLoadsMXSA: 1 + NumLoadsMXSB: 2 + NumLoadsPerpendicularA: 9 + NumLoadsPerpendicularB: 8 + NumLoadsPerpendicularMXSA: 1 + NumLoadsPerpendicularMXSB: 2 + NumThreads: 256 + NumTotalPackedLoadsA: -1 + NumTotalPackedLoadsB: -1 + NumTotalPackedLoadsMXSA: -1 + NumTotalPackedLoadsMXSB: -1 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGROverBarrier: 0 + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 48 + SolutionNameMin: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT288x256x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT18_4_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM16_WGMXCC2_WGMXCCGn1 + SourceSwap: false + SpaceFillingAlgo: [] + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: true + StoreSyncOpt: 0 + StoreVectorWidth: 4 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 4 + SubGroup1: 64 + SubGroupA: 4 + SubGroupB: 
64 + SubGroupMXSA: 4 + SubGroupMXSB: 64 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: 0 + TailloopInNll: false + ThreadTile: [1, 1] + ThreadTile0: 72 + ThreadTile1: 4 + ThreadTileA: 72 + ThreadTileB: 4 + ThreadTileMXSA: 72 + ThreadTileMXSB: 4 + TransposeLDS: 1 + TransposeLDSMetadata: -1 + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMXSA: true + UnrollMajorLDSMXSB: true + Use64bShadowLimit: false + Use64bShadowLimitMX: false + UseCustomMainLoopSchedule: 0 + UseDirect32XEmulation: false + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: false + UseGeneralizedNLCOneA: false + UseGeneralizedNLCOneB: false + UseGeneralizedNLCOneMXSA: false + UseGeneralizedNLCOneMXSB: false + UseGeneralizedNLCOneMetadata: false + UseInstOffsetForGRO: 0 + UseMFMAF32XEmulation: false + UsePLRPack: 0 + UseSgprForGRO: -1 + UseSubtileImpl: true + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 1 + VectorWidthMXSA: 1 + VectorWidthMXSB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMXSA: 0 + WaveSeparateGlobalReadMXSB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 16 + WorkGroupMappingXCC: 2 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 256 + _DepthUA: 256 + _DepthUB: 256 + _DepthUMXSA: 8 + _DepthUMXSB: 8 + _DepthUMetadata: 256 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + enableLDSTrMXSA: false + enableLDSTrMXSB: false + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + tailLoopOptMXSA: false + tailLoopOptMXSB: false + - 1LDSBuffer: 0 + 
ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AdaptiveGemm: 0 + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1DivByMT1LowbitGT1: 0 + AssertFree1ElementMultiple: 1 + AssertKRingShiftTailWrapOnly: 0 + AssertSummationElementMultiple: 256 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BAddrInterleave: false + BaseName: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT323cjBEjqTj-Mu9ZF1RsqJ73WqkJOeA3kpk9e2rgOyJEQ= + BufferLoad: true + BufferStore: 1 + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 256 + DirectToLds: 1 + DirectToLdsA: true + DirectToLdsB: true + DirectToLdsMXSA: true + DirectToLdsMXSB: true + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprMXSA: false + DirectToVgprMXSB: false + DirectToVgprSparseMetadata: false + DtlPlusLdsBuf: 0 + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: false + ExpertSchedulingMode: 0 + ExtraLatencyForLR: 0 + ExtraMiLatencyLeft: -1 + ForceDisableShadowInit: false + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 32 + GlobalReadVectorWidthB: 32 + GlobalReadVectorWidthMXSA: 1 + GlobalReadVectorWidthMXSB: 12 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + KRingShift: false + Kernel: true + KernelLanguage: Assembly + KernelNameMin: 
Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT32x384x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT2_6_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_16_1 + LDSTrInst: false + LSCA: 256 + LSCB: 256 + LSCMXSA: 8 + LSCMXSB: 8 + LSPA: 32 + LSPB: 32 + LSPMXSA: 32 + LSPMXSB: 384 + LVCA: 8 + LVCB: 8 + LVCMXSA: 8 + LVCMXSB: 1 + LVPA: 1 + LVPB: 1 + LVPMXSA: 32 + LVPMXSB: 32 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsBlockSizePerPadMXSA: 0 + LdsBlockSizePerPadMXSB: 0 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 122112 + LdsInitCVgprs: false + LdsNumBytes: 122112 + LdsNumElementsAlignedA: 4096 + LdsNumElementsAlignedB: 49152 + LdsNumElementsAlignedMXSA: 256 + LdsNumElementsAlignedMXSB: 3072 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 7424 + LdsOffsetB_Blk: 72960 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMXSA: 4096 + LdsOffsetMXSA_Blk: 69632 + LdsOffsetMXSB: 4352 + LdsOffsetMXSB_Blk: 69888 + LdsOffsetMetadata: 7424 + LdsOffsetMetadata_Blk: 72960 + LdsPadA: 0 + LdsPadB: 0 + LdsPadMXSA: 0 + LdsPadMXSB: 0 + LdsPadMetadata: 0 + LocalReadVectorWidth: -1 + LocalReadVectorWidthA: 32 + LocalReadVectorWidthB: 32 + LocalReadVectorWidthMXSA: 1 + LocalReadVectorWidthMXSB: 1 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LocalWriteUseSgprMXSA: true + LocalWriteUseSgprMXSB: true + LoopIters: 2 + LoopUnroll: 256 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + 
MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMXSA: 1 + MIInputPerThreadMXSB: 1 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [1, 4] + MIWaveTile: [2, 6] + MIWaveTileA: 2 + MIWaveTileB: 6 + MIWaveTileMXSA: 2 + MIWaveTileMXSB: 6 + MIWaveTileMetadata: 0 + MacroTile0: 32 + MacroTile1: 384 + MacroTileA: 32 + MacroTileB: 384 + MacroTileMXSA: 32 + MacroTileMXSB: 384 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + MinGRIncPerMfma: 3 + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonDTLTailLoopMXSA: true + NonDTLTailLoopMXSB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 4 + NonTemporalE: 0 + NonTemporalMXSA: 0 + NonTemporalMXSB: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 48 + NumGlobalWriteVectorsPerThread: 48 + NumLdsBlk: 2 + NumLoadsA: 1 + NumLoadsB: 12 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsCoalescedMXSA: 1 + NumLoadsCoalescedMXSB: 1 + NumLoadsMXSA: 1 + NumLoadsMXSB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 12 + NumLoadsPerpendicularMXSA: 1 + NumLoadsPerpendicularMXSB: 1 + NumThreads: 256 + NumTotalPackedLoadsA: -1 + NumTotalPackedLoadsB: -1 + NumTotalPackedLoadsMXSA: -1 + NumTotalPackedLoadsMXSB: -1 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGROverBarrier: 0 + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 49 + SolutionNameMin: 
Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT32x384x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT2_6_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM16_WGMXCC2_WGMXCCGn1 + SourceSwap: false + SpaceFillingAlgo: [] + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 4 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 4 + SubGroup1: 64 + SubGroupA: 4 + SubGroupB: 64 + SubGroupMXSA: 4 + SubGroupMXSB: 64 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: 0 + TailloopInNll: false + ThreadTile: [1, 1] + ThreadTile0: 8 + ThreadTile1: 6 + ThreadTileA: 8 + ThreadTileB: 6 + ThreadTileMXSA: 8 + ThreadTileMXSB: 6 + TransposeLDS: 1 + TransposeLDSMetadata: -1 + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMXSA: true + UnrollMajorLDSMXSB: true + Use64bShadowLimit: false + Use64bShadowLimitMX: false + UseCustomMainLoopSchedule: 0 + UseDirect32XEmulation: false + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: false + UseGeneralizedNLCOneA: false + UseGeneralizedNLCOneB: false + UseGeneralizedNLCOneMXSA: false + UseGeneralizedNLCOneMXSB: false + UseGeneralizedNLCOneMetadata: false + UseInstOffsetForGRO: 0 + UseMFMAF32XEmulation: false + UsePLRPack: 0 + UseSgprForGRO: -1 + UseSubtileImpl: true + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 1 + 
VectorWidthMXSA: 1 + VectorWidthMXSB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMXSA: 0 + WaveSeparateGlobalReadMXSB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 16 + WorkGroupMappingXCC: 2 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 256 + _DepthUA: 256 + _DepthUB: 256 + _DepthUMXSA: 8 + _DepthUMXSB: 8 + _DepthUMetadata: 256 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + enableLDSTrMXSA: false + enableLDSTrMXSB: false + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + tailLoopOptMXSA: false + tailLoopOptMXSB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AdaptiveGemm: 0 + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1DivByMT1LowbitGT1: 0 + AssertFree1ElementMultiple: 1 + AssertKRingShiftTailWrapOnly: 0 + AssertSummationElementMultiple: 256 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BAddrInterleave: false + BaseName: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT32k1L8EqrYflL4kvotLP49gssYPXLs5aDdnaskyCeVRw4= + BufferLoad: true + BufferStore: 1 + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 256 + DirectToLds: 1 + DirectToLdsA: true + DirectToLdsB: true + DirectToLdsMXSA: true + DirectToLdsMXSB: true + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprMXSA: false + DirectToVgprMXSB: false + DirectToVgprSparseMetadata: false + DtlPlusLdsBuf: 0 
+ EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: false + ExpertSchedulingMode: 0 + ExtraLatencyForLR: 0 + ExtraMiLatencyLeft: -1 + ForceDisableShadowInit: false + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 32 + GlobalReadVectorWidthB: 32 + GlobalReadVectorWidthMXSA: 1 + GlobalReadVectorWidthMXSB: 16 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + KRingShift: false + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT32x512x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT2_8_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_16_1 + LDSTrInst: false + LSCA: 256 + LSCB: 256 + LSCMXSA: 8 + LSCMXSB: 8 + LSPA: 32 + LSPB: 32 + LSPMXSA: 32 + LSPMXSB: 512 + LVCA: 8 + LVCB: 8 + LVCMXSA: 8 + LVCMXSB: 1 + LVPA: 1 + LVPB: 1 + LVPMXSA: 32 + LVPMXSB: 32 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsBlockSizePerPadMXSA: 0 + LdsBlockSizePerPadMXSB: 0 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 147968 + LdsInitCVgprs: false + LdsNumBytes: 147968 + LdsNumElementsAlignedA: 4096 + 
LdsNumElementsAlignedB: 65536 + LdsNumElementsAlignedMXSA: 256 + LdsNumElementsAlignedMXSB: 4096 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 73984 + LdsOffsetB: 8448 + LdsOffsetB_Blk: 82432 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMXSA: 4096 + LdsOffsetMXSA_Blk: 78080 + LdsOffsetMXSB: 4352 + LdsOffsetMXSB_Blk: 78336 + LdsOffsetMetadata: 8448 + LdsOffsetMetadata_Blk: 82432 + LdsPadA: 0 + LdsPadB: 0 + LdsPadMXSA: 0 + LdsPadMXSB: 0 + LdsPadMetadata: 0 + LocalReadVectorWidth: -1 + LocalReadVectorWidthA: 32 + LocalReadVectorWidthB: 32 + LocalReadVectorWidthMXSA: 1 + LocalReadVectorWidthMXSB: 1 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LocalWriteUseSgprMXSA: true + LocalWriteUseSgprMXSB: true + LoopIters: 2 + LoopUnroll: 256 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMXSA: 1 + MIInputPerThreadMXSB: 1 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [1, 4] + MIWaveTile: [2, 8] + MIWaveTileA: 2 + MIWaveTileB: 8 + MIWaveTileMXSA: 2 + MIWaveTileMXSB: 8 + MIWaveTileMetadata: 0 + MacroTile0: 32 + MacroTile1: 512 + MacroTileA: 32 + MacroTileB: 512 + MacroTileMXSA: 32 + MacroTileMXSB: 512 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + MinGRIncPerMfma: 3 + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonDTLTailLoopMXSA: true + NonDTLTailLoopMXSB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 4 + NonTemporalE: 0 + NonTemporalMXSA: 0 + 
NonTemporalMXSB: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 64 + NumLdsBlk: 2 + NumLoadsA: 1 + NumLoadsB: 16 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsCoalescedMXSA: 1 + NumLoadsCoalescedMXSB: 1 + NumLoadsMXSA: 1 + NumLoadsMXSB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 16 + NumLoadsPerpendicularMXSA: 1 + NumLoadsPerpendicularMXSB: 1 + NumThreads: 256 + NumTotalPackedLoadsA: -1 + NumTotalPackedLoadsB: -1 + NumTotalPackedLoadsMXSA: -1 + NumTotalPackedLoadsMXSB: -1 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGROverBarrier: 0 + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 50 + SolutionNameMin: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT32x512x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT2_8_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM16_WGMXCC2_WGMXCCGn1 + SourceSwap: false + SpaceFillingAlgo: [] + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: true + StoreSyncOpt: 0 + StoreVectorWidth: 4 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 4 + SubGroup1: 64 + SubGroupA: 4 + SubGroupB: 64 + SubGroupMXSA: 4 + SubGroupMXSB: 64 
+ SuppressNoLoadLoop: false + SwapGlobalReadOrder: 0 + TailloopInNll: false + ThreadTile: [1, 1] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + ThreadTileMXSA: 8 + ThreadTileMXSB: 8 + TransposeLDS: 1 + TransposeLDSMetadata: -1 + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMXSA: true + UnrollMajorLDSMXSB: true + Use64bShadowLimit: false + Use64bShadowLimitMX: false + UseCustomMainLoopSchedule: 0 + UseDirect32XEmulation: false + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: false + UseGeneralizedNLCOneA: false + UseGeneralizedNLCOneB: false + UseGeneralizedNLCOneMXSA: false + UseGeneralizedNLCOneMXSB: false + UseGeneralizedNLCOneMetadata: false + UseInstOffsetForGRO: 0 + UseMFMAF32XEmulation: false + UsePLRPack: 0 + UseSgprForGRO: -1 + UseSubtileImpl: true + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 1 + VectorWidthMXSA: 1 + VectorWidthMXSB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMXSA: 0 + WaveSeparateGlobalReadMXSB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 16 + WorkGroupMappingXCC: 2 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 256 + _DepthUA: 256 + _DepthUB: 256 + _DepthUMXSA: 8 + _DepthUMXSB: 8 + _DepthUMetadata: 256 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + enableLDSTrMXSA: false + enableLDSTrMXSB: false + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + tailLoopOptMXSA: false + tailLoopOptMXSB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: false 
+ ActivationFused: true + AdaptiveGemm: 0 + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1DivByMT1LowbitGT1: 0 + AssertFree1ElementMultiple: 1 + AssertKRingShiftTailWrapOnly: 0 + AssertSummationElementMultiple: 256 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BAddrInterleave: false + BaseName: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT9680-vsketBKOwATFxdutsK5i_NcnvLOonACpwOS25BUY= + BufferLoad: true + BufferStore: 1 + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 256 + DirectToLds: 1 + DirectToLdsA: true + DirectToLdsB: true + DirectToLdsMXSA: true + DirectToLdsMXSB: true + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprMXSA: false + DirectToVgprMXSB: false + DirectToVgprSparseMetadata: false + DtlPlusLdsBuf: 0 + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: false + ExpertSchedulingMode: 0 + ExtraLatencyForLR: 0 + ExtraMiLatencyLeft: -1 + ForceDisableShadowInit: false + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 32 + GlobalReadVectorWidthB: 32 + GlobalReadVectorWidthMXSA: 3 + GlobalReadVectorWidthMXSB: 12 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + KRingShift: false + Kernel: true + KernelLanguage: Assembly + KernelNameMin: 
Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT96x384x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT6_6_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_16_1 + LDSTrInst: false + LSCA: 256 + LSCB: 256 + LSCMXSA: 8 + LSCMXSB: 8 + LSPA: 32 + LSPB: 32 + LSPMXSA: 96 + LSPMXSB: 384 + LVCA: 8 + LVCB: 8 + LVCMXSA: 3 + LVCMXSB: 1 + LVPA: 1 + LVPB: 1 + LVPMXSA: 32 + LVPMXSB: 32 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsBlockSizePerPadMXSA: 0 + LdsBlockSizePerPadMXSB: 0 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 130816 + LdsInitCVgprs: false + LdsNumBytes: 130816 + LdsNumElementsAlignedA: 12288 + LdsNumElementsAlignedB: 49152 + LdsNumElementsAlignedMXSA: 768 + LdsNumElementsAlignedMXSB: 3072 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 16128 + LdsOffsetB_Blk: 81664 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMXSA: 12288 + LdsOffsetMXSA_Blk: 77824 + LdsOffsetMXSB: 13056 + LdsOffsetMXSB_Blk: 78592 + LdsOffsetMetadata: 16128 + LdsOffsetMetadata_Blk: 81664 + LdsPadA: 0 + LdsPadB: 0 + LdsPadMXSA: 0 + LdsPadMXSB: 0 + LdsPadMetadata: 0 + LocalReadVectorWidth: -1 + LocalReadVectorWidthA: 32 + LocalReadVectorWidthB: 32 + LocalReadVectorWidthMXSA: 1 + LocalReadVectorWidthMXSB: 1 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LocalWriteUseSgprMXSA: true + LocalWriteUseSgprMXSB: true + LoopIters: 2 + LoopUnroll: 256 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + 
MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMXSA: 1 + MIInputPerThreadMXSB: 1 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [1, 4] + MIWaveTile: [6, 6] + MIWaveTileA: 6 + MIWaveTileB: 6 + MIWaveTileMXSA: 6 + MIWaveTileMXSB: 6 + MIWaveTileMetadata: 0 + MacroTile0: 96 + MacroTile1: 384 + MacroTileA: 96 + MacroTileB: 384 + MacroTileMXSA: 96 + MacroTileMXSB: 384 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + MinGRIncPerMfma: 3 + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonDTLTailLoopMXSA: true + NonDTLTailLoopMXSB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 4 + NonTemporalE: 0 + NonTemporalMXSA: 0 + NonTemporalMXSB: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 144 + NumGlobalWriteVectorsPerThread: 144 + NumLdsBlk: 2 + NumLoadsA: 3 + NumLoadsB: 12 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsCoalescedMXSA: 1 + NumLoadsCoalescedMXSB: 1 + NumLoadsMXSA: 1 + NumLoadsMXSB: 1 + NumLoadsPerpendicularA: 3 + NumLoadsPerpendicularB: 12 + NumLoadsPerpendicularMXSA: 1 + NumLoadsPerpendicularMXSB: 1 + NumThreads: 256 + NumTotalPackedLoadsA: -1 + NumTotalPackedLoadsB: -1 + NumTotalPackedLoadsMXSA: -1 + NumTotalPackedLoadsMXSB: -1 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGROverBarrier: 0 + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 51 + 
SolutionNameMin: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT96x384x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT6_6_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM16_WGMXCC2_WGMXCCGn1 + SourceSwap: false + SpaceFillingAlgo: [] + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 4 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 4 + SubGroup1: 64 + SubGroupA: 4 + SubGroupB: 64 + SubGroupMXSA: 4 + SubGroupMXSB: 64 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: 0 + TailloopInNll: false + ThreadTile: [1, 1] + ThreadTile0: 24 + ThreadTile1: 6 + ThreadTileA: 24 + ThreadTileB: 6 + ThreadTileMXSA: 24 + ThreadTileMXSB: 6 + TransposeLDS: 1 + TransposeLDSMetadata: -1 + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMXSA: true + UnrollMajorLDSMXSB: true + Use64bShadowLimit: false + Use64bShadowLimitMX: false + UseCustomMainLoopSchedule: 0 + UseDirect32XEmulation: false + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: false + UseGeneralizedNLCOneA: false + UseGeneralizedNLCOneB: false + UseGeneralizedNLCOneMXSA: false + UseGeneralizedNLCOneMXSB: false + UseGeneralizedNLCOneMetadata: false + UseInstOffsetForGRO: 0 + UseMFMAF32XEmulation: false + UsePLRPack: 0 + UseSgprForGRO: -1 + UseSubtileImpl: true + Valid: true + VectorStore: -1 + VectorWidthA: 1 + 
VectorWidthB: 1 + VectorWidthMXSA: 1 + VectorWidthMXSB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMXSA: 0 + WaveSeparateGlobalReadMXSB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 16 + WorkGroupMappingXCC: 2 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 256 + _DepthUA: 256 + _DepthUB: 256 + _DepthUMXSA: 8 + _DepthUMXSB: 8 + _DepthUMetadata: 256 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + enableLDSTrMXSA: false + enableLDSTrMXSB: false + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + tailLoopOptMXSA: false + tailLoopOptMXSB: false +- null +- null +- null +- null +- DeviceEfficiency +- Prediction diff --git a/projects/hipblaslt/library/src/amd_detail/rocblaslt/src/Tensile/Logic/asm_full/gfx950/gfx950/Origami/gfx950_Cijk_Alik_Bljk_F4HS_MXA32_MXB32_BH_BiasSH_HAS_SAB_SAV_UserArgs.yaml b/projects/hipblaslt/library/src/amd_detail/rocblaslt/src/Tensile/Logic/asm_full/gfx950/gfx950/Origami/gfx950_Cijk_Alik_Bljk_F4HS_MXA32_MXB32_BH_BiasSH_HAS_SAB_SAV_UserArgs.yaml new file mode 100644 index 00000000000..7a91c02510b --- /dev/null +++ b/projects/hipblaslt/library/src/amd_detail/rocblaslt/src/Tensile/Logic/asm_full/gfx950/gfx950/Origami/gfx950_Cijk_Alik_Bljk_F4HS_MXA32_MXB32_BH_BiasSH_HAS_SAB_SAV_UserArgs.yaml @@ -0,0 +1,16950 @@ +- {MinimumRequiredVersion: 5.0.0} +- gfx950 +- gfx950 +- [Device 75a0] +- Activation: true + ActivationComputeDataType: 0 + ActivationNoGuard: false + ActivationType: hipblaslt_all + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + BetaOnlyUseBias: false + BiasDataTypeList: [0, 4] + 
BiasSrc: D + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 21 + DataTypeA: 21 + DataTypeAmaxD: 0 + DataTypeB: 21 + DataTypeE: 4 + DataTypeMXSA: 22 + DataTypeMXSB: 22 + DestDataType: 4 + F32XdlMathOp: 0 + Gradient: false + GroupedGemm: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index01MXSA: 0 + Index01MXSB: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexAssignmentsMXSA: [3, 0, 2] + IndexAssignmentsMXSB: [3, 1, 2] + IndexAssignmentsMetadata: [3, 0, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndexUnrollM: 0 + IndexUnrollMXSA: 0 + IndexUnrollMXSB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MXBlockA: 32 + MXBlockB: 32 + MacDataTypeA: 21 + MacDataTypeB: 21 + MirrorDimsA: [] + MirrorDimsB: [] + MirrorDimsMXSA: [] + MirrorDimsMXSB: [] + MirrorDimsMetadata: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + OutputAmaxD: false + SetConstStrideA: [] + SetConstStrideB: [] + SetConstStrideBias: [] + SilentHighPrecisionAccumulate: false + Sparse: 0 + StochasticRounding: false + StridedBatched: true + SupportUserArgs: true + SwizzleTensorA: false + SwizzleTensorB: false + TLUA: false + TLUB: false + TLUMXSA: false + TLUMXSB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: 1 + TransposeB: 0 + UseBeta: true + UseBias: 1 + UseE: false + UseInitialStridesAB: false + UseInitialStridesCD: false + UseScaleAB: Scalar + UseScaleAlphaVec: 1 + UseScaleCD: false +- - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AdaptiveGemm: 0 + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1DivByMT1LowbitGT1: 0 + AssertFree1ElementMultiple: 1 + AssertKRingShiftTailWrapOnly: 0 + 
AssertSummationElementMultiple: 256 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BAddrInterleave: false + BaseName: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT64hIsnhFT2L5uw_T8ryWf3mri6PtKpwJ1E97MC3rNtxpw= + BufferLoad: true + BufferStore: 1 + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 256 + DirectToLds: 1 + DirectToLdsA: true + DirectToLdsB: true + DirectToLdsMXSA: true + DirectToLdsMXSB: true + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprMXSA: false + DirectToVgprMXSB: false + DirectToVgprSparseMetadata: false + DtlPlusLdsBuf: 0 + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: false + ExpertSchedulingMode: 0 + ExtraLatencyForLR: 0 + ExtraMiLatencyLeft: -1 + ForceDisableShadowInit: false + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 32 + GlobalReadVectorWidthB: 32 + GlobalReadVectorWidthMXSA: 2 + GlobalReadVectorWidthMXSB: 2 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + KRingShift: false + Kernel: true + KernelLanguage: Assembly + KernelNameMin: 
Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT64x64x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT2_2_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 256 + LSCB: 256 + LSCMXSA: 8 + LSCMXSB: 8 + LSPA: 32 + LSPB: 32 + LSPMXSA: 64 + LSPMXSB: 64 + LVCA: 8 + LVCB: 8 + LVCMXSA: 4 + LVCMXSB: 4 + LVPA: 1 + LVPB: 1 + LVPMXSA: 32 + LVPMXSB: 32 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsBlockSizePerPadMXSA: 0 + LdsBlockSizePerPadMXSB: 0 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 50176 + LdsInitCVgprs: false + LdsNumBytes: 50176 + LdsNumElementsAlignedA: 8192 + LdsNumElementsAlignedB: 8192 + LdsNumElementsAlignedMXSA: 512 + LdsNumElementsAlignedMXSB: 512 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 32768 + LdsOffsetB: 9216 + LdsOffsetB_Blk: 41984 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMXSA: 8192 + LdsOffsetMXSA_Blk: 40960 + LdsOffsetMXSB: 8704 + LdsOffsetMXSB_Blk: 41472 + LdsOffsetMetadata: 9216 + LdsOffsetMetadata_Blk: 41984 + LdsPadA: 0 + LdsPadB: 0 + LdsPadMXSA: 0 + LdsPadMXSB: 0 + LdsPadMetadata: 0 + LocalReadVectorWidth: -1 + LocalReadVectorWidthA: 32 + LocalReadVectorWidthB: 32 + LocalReadVectorWidthMXSA: 1 + LocalReadVectorWidthMXSB: 1 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LocalWriteUseSgprMXSA: true + LocalWriteUseSgprMXSB: true + LoopIters: 2 + LoopUnroll: 256 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 32 + 
MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMXSA: 1 + MIInputPerThreadMXSB: 1 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [2, 2] + MIWaveTileA: 2 + MIWaveTileB: 2 + MIWaveTileMXSA: 2 + MIWaveTileMXSB: 2 + MIWaveTileMetadata: 0 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileMXSA: 64 + MacroTileMXSB: 64 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + MinGRIncPerMfma: 3 + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonDTLTailLoopMXSA: true + NonDTLTailLoopMXSB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 4 + NonTemporalE: 0 + NonTemporalMXSA: 0 + NonTemporalMXSB: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 16 + NumLdsBlk: 2 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsCoalescedMXSA: 1 + NumLoadsCoalescedMXSB: 1 + NumLoadsMXSA: 1 + NumLoadsMXSB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularMXSA: 1 + NumLoadsPerpendicularMXSB: 1 + NumThreads: 256 + NumTotalPackedLoadsA: -1 + NumTotalPackedLoadsB: -1 + NumTotalPackedLoadsMXSA: -1 + NumTotalPackedLoadsMXSB: -1 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGROverBarrier: 0 + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 0 + SolutionNameMin: 
Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT64x64x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT2_2_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM16_WGMXCC2_WGMXCCGn1 + SourceSwap: false + SpaceFillingAlgo: [] + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 4 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SubGroupMXSA: 8 + SubGroupMXSB: 32 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: 0 + TailloopInNll: false + ThreadTile: [1, 1] + ThreadTile0: 8 + ThreadTile1: 2 + ThreadTileA: 8 + ThreadTileB: 2 + ThreadTileMXSA: 8 + ThreadTileMXSB: 2 + TransposeLDS: 1 + TransposeLDSMetadata: -1 + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMXSA: true + UnrollMajorLDSMXSB: true + Use64bShadowLimit: false + Use64bShadowLimitMX: false + UseCustomMainLoopSchedule: 0 + UseDirect32XEmulation: false + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: false + UseGeneralizedNLCOneA: false + UseGeneralizedNLCOneB: false + UseGeneralizedNLCOneMXSA: false + UseGeneralizedNLCOneMXSB: false + UseGeneralizedNLCOneMetadata: false + UseInstOffsetForGRO: 0 + UseMFMAF32XEmulation: false + UsePLRPack: 0 + UseSgprForGRO: -1 + UseSubtileImpl: true + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 1 + 
VectorWidthMXSA: 1 + VectorWidthMXSB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMXSA: 0 + WaveSeparateGlobalReadMXSB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 16 + WorkGroupMappingXCC: 2 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 256 + _DepthUA: 256 + _DepthUB: 256 + _DepthUMXSA: 8 + _DepthUMXSB: 8 + _DepthUMetadata: 256 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + enableLDSTrMXSA: false + enableLDSTrMXSB: false + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + tailLoopOptMXSA: false + tailLoopOptMXSB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AdaptiveGemm: 0 + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1DivByMT1LowbitGT1: 0 + AssertFree1ElementMultiple: 1 + AssertKRingShiftTailWrapOnly: 0 + AssertSummationElementMultiple: 256 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BAddrInterleave: false + BaseName: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT128AMjoHHGnVSJ1fjvuIJeKPVtejro7lpRL6TixtWnDac= + BufferLoad: true + BufferStore: 1 + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 256 + DirectToLds: 1 + DirectToLdsA: true + DirectToLdsB: true + DirectToLdsMXSA: true + DirectToLdsMXSB: true + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprMXSA: false + DirectToVgprMXSB: false + DirectToVgprSparseMetadata: false + DtlPlusLdsBuf: 0 
+ EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: false + ExpertSchedulingMode: 0 + ExtraLatencyForLR: 0 + ExtraMiLatencyLeft: -1 + ForceDisableShadowInit: false + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 32 + GlobalReadVectorWidthB: 32 + GlobalReadVectorWidthMXSA: 4 + GlobalReadVectorWidthMXSB: 4 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + KRingShift: false + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT128x128x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT4_4_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 256 + LSCB: 256 + LSCMXSA: 8 + LSCMXSB: 8 + LSPA: 32 + LSPB: 32 + LSPMXSA: 128 + LSPMXSB: 128 + LVCA: 8 + LVCB: 8 + LVCMXSA: 2 + LVCMXSB: 2 + LVPA: 1 + LVPB: 1 + LVPMXSA: 32 + LVPMXSB: 32 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsBlockSizePerPadMXSA: 0 + LdsBlockSizePerPadMXSB: 0 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 100352 + LdsInitCVgprs: false + LdsNumBytes: 100352 + LdsNumElementsAlignedA: 16384 + 
LdsNumElementsAlignedB: 16384 + LdsNumElementsAlignedMXSA: 1024 + LdsNumElementsAlignedMXSB: 1024 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 18432 + LdsOffsetB_Blk: 83968 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMXSA: 16384 + LdsOffsetMXSA_Blk: 81920 + LdsOffsetMXSB: 17408 + LdsOffsetMXSB_Blk: 82944 + LdsOffsetMetadata: 18432 + LdsOffsetMetadata_Blk: 83968 + LdsPadA: 0 + LdsPadB: 0 + LdsPadMXSA: 0 + LdsPadMXSB: 0 + LdsPadMetadata: 0 + LocalReadVectorWidth: -1 + LocalReadVectorWidthA: 32 + LocalReadVectorWidthB: 32 + LocalReadVectorWidthMXSA: 1 + LocalReadVectorWidthMXSB: 1 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LocalWriteUseSgprMXSA: true + LocalWriteUseSgprMXSB: true + LoopIters: 2 + LoopUnroll: 256 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMXSA: 1 + MIInputPerThreadMXSB: 1 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MIWaveTileMXSA: 4 + MIWaveTileMXSB: 4 + MIWaveTileMetadata: 0 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileMXSA: 128 + MacroTileMXSB: 128 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + MinGRIncPerMfma: 3 + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonDTLTailLoopMXSA: true + NonDTLTailLoopMXSB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 4 + NonTemporalE: 0 + NonTemporalMXSA: 0 + 
NonTemporalMXSB: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 64 + NumLdsBlk: 2 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsCoalescedMXSA: 1 + NumLoadsCoalescedMXSB: 1 + NumLoadsMXSA: 1 + NumLoadsMXSB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumLoadsPerpendicularMXSA: 1 + NumLoadsPerpendicularMXSB: 1 + NumThreads: 256 + NumTotalPackedLoadsA: -1 + NumTotalPackedLoadsB: -1 + NumTotalPackedLoadsMXSA: -1 + NumTotalPackedLoadsMXSB: -1 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGROverBarrier: 0 + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 1 + SolutionNameMin: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT128x128x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT4_4_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM16_WGMXCC2_WGMXCCGn1 + SourceSwap: false + SpaceFillingAlgo: [] + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 4 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SubGroupMXSA: 8 + SubGroupMXSB: 32 + 
SuppressNoLoadLoop: false + SwapGlobalReadOrder: 0 + TailloopInNll: false + ThreadTile: [1, 1] + ThreadTile0: 16 + ThreadTile1: 4 + ThreadTileA: 16 + ThreadTileB: 4 + ThreadTileMXSA: 16 + ThreadTileMXSB: 4 + TransposeLDS: 1 + TransposeLDSMetadata: -1 + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMXSA: true + UnrollMajorLDSMXSB: true + Use64bShadowLimit: false + Use64bShadowLimitMX: false + UseCustomMainLoopSchedule: 0 + UseDirect32XEmulation: false + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: false + UseGeneralizedNLCOneA: false + UseGeneralizedNLCOneB: false + UseGeneralizedNLCOneMXSA: false + UseGeneralizedNLCOneMXSB: false + UseGeneralizedNLCOneMetadata: false + UseInstOffsetForGRO: 0 + UseMFMAF32XEmulation: false + UsePLRPack: 0 + UseSgprForGRO: -1 + UseSubtileImpl: true + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 1 + VectorWidthMXSA: 1 + VectorWidthMXSB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMXSA: 0 + WaveSeparateGlobalReadMXSB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 16 + WorkGroupMappingXCC: 2 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 256 + _DepthUA: 256 + _DepthUB: 256 + _DepthUMXSA: 8 + _DepthUMXSB: 8 + _DepthUMetadata: 256 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + enableLDSTrMXSA: false + enableLDSTrMXSB: false + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + tailLoopOptMXSA: false + tailLoopOptMXSB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: false 
+ ActivationFused: true + AdaptiveGemm: 0 + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1DivByMT1LowbitGT1: 0 + AssertFree1ElementMultiple: 1 + AssertKRingShiftTailWrapOnly: 0 + AssertSummationElementMultiple: 256 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BAddrInterleave: false + BaseName: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT19tVFE7Mochg2Q71HzQS4tIGctt0Td3p2SmmxHjjWSYTc= + BufferLoad: true + BufferStore: 1 + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 256 + DirectToLds: 1 + DirectToLdsA: true + DirectToLdsB: true + DirectToLdsMXSA: true + DirectToLdsMXSB: true + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprMXSA: false + DirectToVgprMXSB: false + DirectToVgprSparseMetadata: false + DtlPlusLdsBuf: 0 + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: false + ExpertSchedulingMode: 0 + ExtraLatencyForLR: 0 + ExtraMiLatencyLeft: -1 + ForceDisableShadowInit: false + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 32 + GlobalReadVectorWidthB: 32 + GlobalReadVectorWidthMXSA: 6 + GlobalReadVectorWidthMXSB: 6 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + KRingShift: false + Kernel: true + KernelLanguage: Assembly + KernelNameMin: 
Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT192x192x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT6_6_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 256 + LSCB: 256 + LSCMXSA: 8 + LSCMXSB: 8 + LSPA: 32 + LSPB: 32 + LSPMXSA: 192 + LSPMXSB: 192 + LVCA: 8 + LVCB: 8 + LVCMXSA: 2 + LVCMXSB: 2 + LVPA: 1 + LVPB: 1 + LVPMXSA: 32 + LVPMXSB: 32 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsBlockSizePerPadMXSA: 0 + LdsBlockSizePerPadMXSB: 0 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 117760 + LdsInitCVgprs: false + LdsNumBytes: 117760 + LdsNumElementsAlignedA: 24576 + LdsNumElementsAlignedB: 24576 + LdsNumElementsAlignedMXSA: 1536 + LdsNumElementsAlignedMXSB: 1536 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 27648 + LdsOffsetB_Blk: 93184 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMXSA: 24576 + LdsOffsetMXSA_Blk: 90112 + LdsOffsetMXSB: 26112 + LdsOffsetMXSB_Blk: 91648 + LdsOffsetMetadata: 27648 + LdsOffsetMetadata_Blk: 93184 + LdsPadA: 0 + LdsPadB: 0 + LdsPadMXSA: 0 + LdsPadMXSB: 0 + LdsPadMetadata: 0 + LocalReadVectorWidth: -1 + LocalReadVectorWidthA: 32 + LocalReadVectorWidthB: 32 + LocalReadVectorWidthMXSA: 1 + LocalReadVectorWidthMXSB: 1 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LocalWriteUseSgprMXSA: true + LocalWriteUseSgprMXSB: true + LoopIters: 2 + LoopUnroll: 256 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + 
MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMXSA: 1 + MIInputPerThreadMXSB: 1 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [6, 6] + MIWaveTileA: 6 + MIWaveTileB: 6 + MIWaveTileMXSA: 6 + MIWaveTileMXSB: 6 + MIWaveTileMetadata: 0 + MacroTile0: 192 + MacroTile1: 192 + MacroTileA: 192 + MacroTileB: 192 + MacroTileMXSA: 192 + MacroTileMXSB: 192 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + MinGRIncPerMfma: 3 + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonDTLTailLoopMXSA: true + NonDTLTailLoopMXSB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 4 + NonTemporalE: 0 + NonTemporalMXSA: 0 + NonTemporalMXSB: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 144 + NumGlobalWriteVectorsPerThread: 144 + NumLdsBlk: 2 + NumLoadsA: 6 + NumLoadsB: 6 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsCoalescedMXSA: 1 + NumLoadsCoalescedMXSB: 1 + NumLoadsMXSA: 1 + NumLoadsMXSB: 1 + NumLoadsPerpendicularA: 6 + NumLoadsPerpendicularB: 6 + NumLoadsPerpendicularMXSA: 1 + NumLoadsPerpendicularMXSB: 1 + NumThreads: 256 + NumTotalPackedLoadsA: -1 + NumTotalPackedLoadsB: -1 + NumTotalPackedLoadsMXSA: -1 + NumTotalPackedLoadsMXSB: -1 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGROverBarrier: 0 + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 2 + 
SolutionNameMin: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT192x192x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT6_6_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM16_WGMXCC2_WGMXCCGn1 + SourceSwap: false + SpaceFillingAlgo: [] + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 4 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SubGroupMXSA: 8 + SubGroupMXSB: 32 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: 0 + TailloopInNll: false + ThreadTile: [1, 1] + ThreadTile0: 24 + ThreadTile1: 6 + ThreadTileA: 24 + ThreadTileB: 6 + ThreadTileMXSA: 24 + ThreadTileMXSB: 6 + TransposeLDS: 1 + TransposeLDSMetadata: -1 + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMXSA: true + UnrollMajorLDSMXSB: true + Use64bShadowLimit: false + Use64bShadowLimitMX: false + UseCustomMainLoopSchedule: 0 + UseDirect32XEmulation: false + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: false + UseGeneralizedNLCOneA: false + UseGeneralizedNLCOneB: false + UseGeneralizedNLCOneMXSA: false + UseGeneralizedNLCOneMXSB: false + UseGeneralizedNLCOneMetadata: false + UseInstOffsetForGRO: 0 + UseMFMAF32XEmulation: false + UsePLRPack: 0 + UseSgprForGRO: -1 + UseSubtileImpl: true + Valid: true + VectorStore: -1 + VectorWidthA: 1 + 
VectorWidthB: 1 + VectorWidthMXSA: 1 + VectorWidthMXSB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMXSA: 0 + WaveSeparateGlobalReadMXSB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 16 + WorkGroupMappingXCC: 2 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 256 + _DepthUA: 256 + _DepthUB: 256 + _DepthUMXSA: 8 + _DepthUMXSB: 8 + _DepthUMetadata: 256 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + enableLDSTrMXSA: false + enableLDSTrMXSB: false + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + tailLoopOptMXSA: false + tailLoopOptMXSB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AdaptiveGemm: 0 + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1DivByMT1LowbitGT1: 0 + AssertFree1ElementMultiple: 1 + AssertKRingShiftTailWrapOnly: 0 + AssertSummationElementMultiple: 256 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BAddrInterleave: false + BaseName: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT25YgPR4XPaMfFjkFVdEJNxrCtJ2m9IQRE3ggDN5hyNKfg= + BufferLoad: true + BufferStore: 1 + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 256 + DirectToLds: 1 + DirectToLdsA: true + DirectToLdsB: true + DirectToLdsMXSA: true + DirectToLdsMXSB: true + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprMXSA: false + DirectToVgprMXSB: false + DirectToVgprSparseMetadata: false + 
DtlPlusLdsBuf: 0 + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: false + ExpertSchedulingMode: 0 + ExtraLatencyForLR: 0 + ExtraMiLatencyLeft: -1 + ForceDisableShadowInit: false + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 32 + GlobalReadVectorWidthB: 32 + GlobalReadVectorWidthMXSA: 4 + GlobalReadVectorWidthMXSB: 4 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + KRingShift: false + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT256x256x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT8_8_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 256 + LSCB: 256 + LSCMXSA: 8 + LSCMXSB: 8 + LSPA: 32 + LSPB: 32 + LSPMXSA: 128 + LSPMXSB: 128 + LVCA: 8 + LVCB: 8 + LVCMXSA: 2 + LVCMXSB: 2 + LVPA: 1 + LVPB: 1 + LVPMXSA: 32 + LVPMXSB: 32 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsBlockSizePerPadMXSA: 0 + LdsBlockSizePerPadMXSB: 0 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 139264 + LdsInitCVgprs: false + LdsNumBytes: 139264 + 
LdsNumElementsAlignedA: 32768 + LdsNumElementsAlignedB: 32768 + LdsNumElementsAlignedMXSA: 2048 + LdsNumElementsAlignedMXSB: 2048 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 69632 + LdsOffsetB: 36864 + LdsOffsetB_Blk: 106496 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMXSA: 32768 + LdsOffsetMXSA_Blk: 102400 + LdsOffsetMXSB: 34816 + LdsOffsetMXSB_Blk: 104448 + LdsOffsetMetadata: 36864 + LdsOffsetMetadata_Blk: 106496 + LdsPadA: 0 + LdsPadB: 0 + LdsPadMXSA: 0 + LdsPadMXSB: 0 + LdsPadMetadata: 0 + LocalReadVectorWidth: -1 + LocalReadVectorWidthA: 32 + LocalReadVectorWidthB: 32 + LocalReadVectorWidthMXSA: 1 + LocalReadVectorWidthMXSB: 1 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LocalWriteUseSgprMXSA: true + LocalWriteUseSgprMXSB: true + LoopIters: 2 + LoopUnroll: 256 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMXSA: 1 + MIInputPerThreadMXSB: 1 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [8, 8] + MIWaveTileA: 8 + MIWaveTileB: 8 + MIWaveTileMXSA: 8 + MIWaveTileMXSB: 8 + MIWaveTileMetadata: 0 + MacroTile0: 256 + MacroTile1: 256 + MacroTileA: 256 + MacroTileB: 256 + MacroTileMXSA: 256 + MacroTileMXSB: 256 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + MinGRIncPerMfma: 3 + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonDTLTailLoopMXSA: true + NonDTLTailLoopMXSB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 4 + 
NonTemporalE: 0 + NonTemporalMXSA: 0 + NonTemporalMXSB: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 256 + NumGlobalWriteVectorsPerThread: 256 + NumLdsBlk: 2 + NumLoadsA: 8 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsCoalescedMXSA: 1 + NumLoadsCoalescedMXSB: 1 + NumLoadsMXSA: 2 + NumLoadsMXSB: 2 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 8 + NumLoadsPerpendicularMXSA: 2 + NumLoadsPerpendicularMXSB: 2 + NumThreads: 256 + NumTotalPackedLoadsA: -1 + NumTotalPackedLoadsB: -1 + NumTotalPackedLoadsMXSA: -1 + NumTotalPackedLoadsMXSB: -1 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGROverBarrier: 0 + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 3 + SolutionNameMin: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT256x256x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT8_8_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM16_WGMXCC2_WGMXCCGn1 + SourceSwap: false + SpaceFillingAlgo: [] + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: true + StoreSyncOpt: 0 + StoreVectorWidth: 4 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 
+ SubGroupMXSA: 8 + SubGroupMXSB: 32 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: 0 + TailloopInNll: false + ThreadTile: [1, 1] + ThreadTile0: 32 + ThreadTile1: 8 + ThreadTileA: 32 + ThreadTileB: 8 + ThreadTileMXSA: 32 + ThreadTileMXSB: 8 + TransposeLDS: 1 + TransposeLDSMetadata: -1 + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMXSA: true + UnrollMajorLDSMXSB: true + Use64bShadowLimit: false + Use64bShadowLimitMX: false + UseCustomMainLoopSchedule: 0 + UseDirect32XEmulation: false + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: false + UseGeneralizedNLCOneA: false + UseGeneralizedNLCOneB: false + UseGeneralizedNLCOneMXSA: false + UseGeneralizedNLCOneMXSB: false + UseGeneralizedNLCOneMetadata: false + UseInstOffsetForGRO: 0 + UseMFMAF32XEmulation: false + UsePLRPack: 0 + UseSgprForGRO: -1 + UseSubtileImpl: true + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 1 + VectorWidthMXSA: 1 + VectorWidthMXSB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMXSA: 0 + WaveSeparateGlobalReadMXSB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 16 + WorkGroupMappingXCC: 2 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 256 + _DepthUA: 256 + _DepthUB: 256 + _DepthUMXSA: 8 + _DepthUMXSB: 8 + _DepthUMetadata: 256 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + enableLDSTrMXSA: false + enableLDSTrMXSB: false + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + tailLoopOptMXSA: false + tailLoopOptMXSB: false + - 1LDSBuffer: 0 + 
ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AdaptiveGemm: 0 + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1DivByMT1LowbitGT1: 0 + AssertFree1ElementMultiple: 1 + AssertKRingShiftTailWrapOnly: 0 + AssertSummationElementMultiple: 256 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BAddrInterleave: false + BaseName: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT64yR3-UwlktRfU59ACgTfmpcg_Yk6f2Lb1knA4QU-BsnY= + BufferLoad: true + BufferStore: 1 + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 256 + DirectToLds: 1 + DirectToLdsA: true + DirectToLdsB: true + DirectToLdsMXSA: true + DirectToLdsMXSB: true + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprMXSA: false + DirectToVgprMXSB: false + DirectToVgprSparseMetadata: false + DtlPlusLdsBuf: 0 + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: false + ExpertSchedulingMode: 0 + ExtraLatencyForLR: 0 + ExtraMiLatencyLeft: -1 + ForceDisableShadowInit: false + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 32 + GlobalReadVectorWidthB: 32 + GlobalReadVectorWidthMXSA: 2 + GlobalReadVectorWidthMXSB: 4 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + KRingShift: false + Kernel: true + KernelLanguage: Assembly + KernelNameMin: 
Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT64x128x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT2_4_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 256 + LSCB: 256 + LSCMXSA: 8 + LSCMXSB: 8 + LSPA: 32 + LSPB: 32 + LSPMXSA: 64 + LSPMXSB: 128 + LVCA: 8 + LVCB: 8 + LVCMXSA: 4 + LVCMXSB: 2 + LVPA: 1 + LVPB: 1 + LVPMXSA: 32 + LVPMXSB: 32 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsBlockSizePerPadMXSA: 0 + LdsBlockSizePerPadMXSB: 0 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 58880 + LdsInitCVgprs: false + LdsNumBytes: 58880 + LdsNumElementsAlignedA: 8192 + LdsNumElementsAlignedB: 16384 + LdsNumElementsAlignedMXSA: 512 + LdsNumElementsAlignedMXSB: 1024 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 32768 + LdsOffsetB: 9728 + LdsOffsetB_Blk: 42496 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMXSA: 8192 + LdsOffsetMXSA_Blk: 40960 + LdsOffsetMXSB: 8704 + LdsOffsetMXSB_Blk: 41472 + LdsOffsetMetadata: 9728 + LdsOffsetMetadata_Blk: 42496 + LdsPadA: 0 + LdsPadB: 0 + LdsPadMXSA: 0 + LdsPadMXSB: 0 + LdsPadMetadata: 0 + LocalReadVectorWidth: -1 + LocalReadVectorWidthA: 32 + LocalReadVectorWidthB: 32 + LocalReadVectorWidthMXSA: 1 + LocalReadVectorWidthMXSB: 1 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LocalWriteUseSgprMXSA: true + LocalWriteUseSgprMXSB: true + LoopIters: 2 + LoopUnroll: 256 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 
32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMXSA: 1 + MIInputPerThreadMXSB: 1 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [2, 4] + MIWaveTileA: 2 + MIWaveTileB: 4 + MIWaveTileMXSA: 2 + MIWaveTileMXSB: 4 + MIWaveTileMetadata: 0 + MacroTile0: 64 + MacroTile1: 128 + MacroTileA: 64 + MacroTileB: 128 + MacroTileMXSA: 64 + MacroTileMXSB: 128 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + MinGRIncPerMfma: 3 + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonDTLTailLoopMXSA: true + NonDTLTailLoopMXSB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 4 + NonTemporalE: 0 + NonTemporalMXSA: 0 + NonTemporalMXSB: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 32 + NumLdsBlk: 2 + NumLoadsA: 2 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsCoalescedMXSA: 1 + NumLoadsCoalescedMXSB: 1 + NumLoadsMXSA: 1 + NumLoadsMXSB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 4 + NumLoadsPerpendicularMXSA: 1 + NumLoadsPerpendicularMXSB: 1 + NumThreads: 256 + NumTotalPackedLoadsA: -1 + NumTotalPackedLoadsB: -1 + NumTotalPackedLoadsMXSA: -1 + NumTotalPackedLoadsMXSB: -1 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGROverBarrier: 0 + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 4 + SolutionNameMin: 
Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT64x128x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT2_4_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM16_WGMXCC2_WGMXCCGn1 + SourceSwap: false + SpaceFillingAlgo: [] + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 4 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SubGroupMXSA: 8 + SubGroupMXSB: 32 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: 0 + TailloopInNll: false + ThreadTile: [1, 1] + ThreadTile0: 8 + ThreadTile1: 4 + ThreadTileA: 8 + ThreadTileB: 4 + ThreadTileMXSA: 8 + ThreadTileMXSB: 4 + TransposeLDS: 1 + TransposeLDSMetadata: -1 + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMXSA: true + UnrollMajorLDSMXSB: true + Use64bShadowLimit: false + Use64bShadowLimitMX: false + UseCustomMainLoopSchedule: 0 + UseDirect32XEmulation: false + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: false + UseGeneralizedNLCOneA: false + UseGeneralizedNLCOneB: false + UseGeneralizedNLCOneMXSA: false + UseGeneralizedNLCOneMXSB: false + UseGeneralizedNLCOneMetadata: false + UseInstOffsetForGRO: 0 + UseMFMAF32XEmulation: false + UsePLRPack: 0 + UseSgprForGRO: -1 + UseSubtileImpl: true + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 1 + 
VectorWidthMXSA: 1 + VectorWidthMXSB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMXSA: 0 + WaveSeparateGlobalReadMXSB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 16 + WorkGroupMappingXCC: 2 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 256 + _DepthUA: 256 + _DepthUB: 256 + _DepthUMXSA: 8 + _DepthUMXSB: 8 + _DepthUMetadata: 256 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + enableLDSTrMXSA: false + enableLDSTrMXSB: false + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + tailLoopOptMXSA: false + tailLoopOptMXSB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AdaptiveGemm: 0 + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1DivByMT1LowbitGT1: 0 + AssertFree1ElementMultiple: 1 + AssertKRingShiftTailWrapOnly: 0 + AssertSummationElementMultiple: 256 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BAddrInterleave: false + BaseName: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT12ZclQEHdRRO1sBJL3TwidM33yIt-WYnDtDgY9P9JYRX4= + BufferLoad: true + BufferStore: 1 + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 256 + DirectToLds: 1 + DirectToLdsA: true + DirectToLdsB: true + DirectToLdsMXSA: true + DirectToLdsMXSB: true + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprMXSA: false + DirectToVgprMXSB: false + DirectToVgprSparseMetadata: false + DtlPlusLdsBuf: 0 
+ EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: false + ExpertSchedulingMode: 0 + ExtraLatencyForLR: 0 + ExtraMiLatencyLeft: -1 + ForceDisableShadowInit: false + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 32 + GlobalReadVectorWidthB: 32 + GlobalReadVectorWidthMXSA: 4 + GlobalReadVectorWidthMXSB: 2 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + KRingShift: false + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT128x64x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT4_2_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 256 + LSCB: 256 + LSCMXSA: 8 + LSCMXSB: 8 + LSPA: 32 + LSPB: 32 + LSPMXSA: 128 + LSPMXSB: 64 + LVCA: 8 + LVCB: 8 + LVCMXSA: 2 + LVCMXSB: 4 + LVPA: 1 + LVPB: 1 + LVPMXSA: 32 + LVPMXSB: 32 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsBlockSizePerPadMXSA: 0 + LdsBlockSizePerPadMXSB: 0 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 58880 + LdsInitCVgprs: false + LdsNumBytes: 58880 + LdsNumElementsAlignedA: 16384 + 
LdsNumElementsAlignedB: 8192 + LdsNumElementsAlignedMXSA: 1024 + LdsNumElementsAlignedMXSB: 512 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 32768 + LdsOffsetB: 17920 + LdsOffsetB_Blk: 50688 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMXSA: 16384 + LdsOffsetMXSA_Blk: 49152 + LdsOffsetMXSB: 17408 + LdsOffsetMXSB_Blk: 50176 + LdsOffsetMetadata: 17920 + LdsOffsetMetadata_Blk: 50688 + LdsPadA: 0 + LdsPadB: 0 + LdsPadMXSA: 0 + LdsPadMXSB: 0 + LdsPadMetadata: 0 + LocalReadVectorWidth: -1 + LocalReadVectorWidthA: 32 + LocalReadVectorWidthB: 32 + LocalReadVectorWidthMXSA: 1 + LocalReadVectorWidthMXSB: 1 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LocalWriteUseSgprMXSA: true + LocalWriteUseSgprMXSB: true + LoopIters: 2 + LoopUnroll: 256 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMXSA: 1 + MIInputPerThreadMXSB: 1 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 2] + MIWaveTileA: 4 + MIWaveTileB: 2 + MIWaveTileMXSA: 4 + MIWaveTileMXSB: 2 + MIWaveTileMetadata: 0 + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileMXSA: 128 + MacroTileMXSB: 64 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + MinGRIncPerMfma: 3 + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonDTLTailLoopMXSA: true + NonDTLTailLoopMXSB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 4 + NonTemporalE: 0 + NonTemporalMXSA: 0 + 
NonTemporalMXSB: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 32 + NumLdsBlk: 2 + NumLoadsA: 4 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsCoalescedMXSA: 1 + NumLoadsCoalescedMXSB: 1 + NumLoadsMXSA: 1 + NumLoadsMXSB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularMXSA: 1 + NumLoadsPerpendicularMXSB: 1 + NumThreads: 256 + NumTotalPackedLoadsA: -1 + NumTotalPackedLoadsB: -1 + NumTotalPackedLoadsMXSA: -1 + NumTotalPackedLoadsMXSB: -1 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGROverBarrier: 0 + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 5 + SolutionNameMin: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT128x64x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT4_2_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM16_WGMXCC2_WGMXCCGn1 + SourceSwap: false + SpaceFillingAlgo: [] + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 4 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SubGroupMXSA: 8 + SubGroupMXSB: 32 + 
SuppressNoLoadLoop: false + SwapGlobalReadOrder: 0 + TailloopInNll: false + ThreadTile: [1, 1] + ThreadTile0: 16 + ThreadTile1: 2 + ThreadTileA: 16 + ThreadTileB: 2 + ThreadTileMXSA: 16 + ThreadTileMXSB: 2 + TransposeLDS: 1 + TransposeLDSMetadata: -1 + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMXSA: true + UnrollMajorLDSMXSB: true + Use64bShadowLimit: false + Use64bShadowLimitMX: false + UseCustomMainLoopSchedule: 0 + UseDirect32XEmulation: false + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: false + UseGeneralizedNLCOneA: false + UseGeneralizedNLCOneB: false + UseGeneralizedNLCOneMXSA: false + UseGeneralizedNLCOneMXSB: false + UseGeneralizedNLCOneMetadata: false + UseInstOffsetForGRO: 0 + UseMFMAF32XEmulation: false + UsePLRPack: 0 + UseSgprForGRO: -1 + UseSubtileImpl: true + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 1 + VectorWidthMXSA: 1 + VectorWidthMXSB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMXSA: 0 + WaveSeparateGlobalReadMXSB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 16 + WorkGroupMappingXCC: 2 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 256 + _DepthUA: 256 + _DepthUB: 256 + _DepthUMXSA: 8 + _DepthUMXSB: 8 + _DepthUMetadata: 256 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + enableLDSTrMXSA: false + enableLDSTrMXSB: false + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + tailLoopOptMXSA: false + tailLoopOptMXSB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: false 
+ ActivationFused: true + AdaptiveGemm: 0 + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1DivByMT1LowbitGT1: 0 + AssertFree1ElementMultiple: 1 + AssertKRingShiftTailWrapOnly: 0 + AssertSummationElementMultiple: 256 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BAddrInterleave: false + BaseName: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT64hJRTC6QzyytWkHaKB_tyuSMxvHfrHLHmDLGhNVVcILE= + BufferLoad: true + BufferStore: 1 + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 256 + DirectToLds: 1 + DirectToLdsA: true + DirectToLdsB: true + DirectToLdsMXSA: true + DirectToLdsMXSB: true + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprMXSA: false + DirectToVgprMXSB: false + DirectToVgprSparseMetadata: false + DtlPlusLdsBuf: 0 + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: false + ExpertSchedulingMode: 0 + ExtraLatencyForLR: 0 + ExtraMiLatencyLeft: -1 + ForceDisableShadowInit: false + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 32 + GlobalReadVectorWidthB: 32 + GlobalReadVectorWidthMXSA: 2 + GlobalReadVectorWidthMXSB: 6 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + KRingShift: false + Kernel: true + KernelLanguage: Assembly + KernelNameMin: 
Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT64x192x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT2_6_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 256 + LSCB: 256 + LSCMXSA: 8 + LSCMXSB: 8 + LSPA: 32 + LSPB: 32 + LSPMXSA: 64 + LSPMXSB: 192 + LVCA: 8 + LVCB: 8 + LVCMXSA: 4 + LVCMXSB: 2 + LVPA: 1 + LVPB: 1 + LVPMXSA: 32 + LVPMXSB: 32 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsBlockSizePerPadMXSA: 0 + LdsBlockSizePerPadMXSB: 0 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 100352 + LdsInitCVgprs: false + LdsNumBytes: 100352 + LdsNumElementsAlignedA: 8192 + LdsNumElementsAlignedB: 24576 + LdsNumElementsAlignedMXSA: 512 + LdsNumElementsAlignedMXSB: 1536 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 10240 + LdsOffsetB_Blk: 75776 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMXSA: 8192 + LdsOffsetMXSA_Blk: 73728 + LdsOffsetMXSB: 8704 + LdsOffsetMXSB_Blk: 74240 + LdsOffsetMetadata: 10240 + LdsOffsetMetadata_Blk: 75776 + LdsPadA: 0 + LdsPadB: 0 + LdsPadMXSA: 0 + LdsPadMXSB: 0 + LdsPadMetadata: 0 + LocalReadVectorWidth: -1 + LocalReadVectorWidthA: 32 + LocalReadVectorWidthB: 32 + LocalReadVectorWidthMXSA: 1 + LocalReadVectorWidthMXSB: 1 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LocalWriteUseSgprMXSA: true + LocalWriteUseSgprMXSB: true + LoopIters: 2 + LoopUnroll: 256 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + 
MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMXSA: 1 + MIInputPerThreadMXSB: 1 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [2, 6] + MIWaveTileA: 2 + MIWaveTileB: 6 + MIWaveTileMXSA: 2 + MIWaveTileMXSB: 6 + MIWaveTileMetadata: 0 + MacroTile0: 64 + MacroTile1: 192 + MacroTileA: 64 + MacroTileB: 192 + MacroTileMXSA: 64 + MacroTileMXSB: 192 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + MinGRIncPerMfma: 3 + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonDTLTailLoopMXSA: true + NonDTLTailLoopMXSB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 4 + NonTemporalE: 0 + NonTemporalMXSA: 0 + NonTemporalMXSB: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 48 + NumGlobalWriteVectorsPerThread: 48 + NumLdsBlk: 2 + NumLoadsA: 2 + NumLoadsB: 6 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsCoalescedMXSA: 1 + NumLoadsCoalescedMXSB: 1 + NumLoadsMXSA: 1 + NumLoadsMXSB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 6 + NumLoadsPerpendicularMXSA: 1 + NumLoadsPerpendicularMXSB: 1 + NumThreads: 256 + NumTotalPackedLoadsA: -1 + NumTotalPackedLoadsB: -1 + NumTotalPackedLoadsMXSA: -1 + NumTotalPackedLoadsMXSB: -1 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGROverBarrier: 0 + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 6 + SolutionNameMin: 
Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT64x192x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT2_6_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM16_WGMXCC2_WGMXCCGn1 + SourceSwap: false + SpaceFillingAlgo: [] + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 4 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SubGroupMXSA: 8 + SubGroupMXSB: 32 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: 0 + TailloopInNll: false + ThreadTile: [1, 1] + ThreadTile0: 8 + ThreadTile1: 6 + ThreadTileA: 8 + ThreadTileB: 6 + ThreadTileMXSA: 8 + ThreadTileMXSB: 6 + TransposeLDS: 1 + TransposeLDSMetadata: -1 + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMXSA: true + UnrollMajorLDSMXSB: true + Use64bShadowLimit: false + Use64bShadowLimitMX: false + UseCustomMainLoopSchedule: 0 + UseDirect32XEmulation: false + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: false + UseGeneralizedNLCOneA: false + UseGeneralizedNLCOneB: false + UseGeneralizedNLCOneMXSA: false + UseGeneralizedNLCOneMXSB: false + UseGeneralizedNLCOneMetadata: false + UseInstOffsetForGRO: 0 + UseMFMAF32XEmulation: false + UsePLRPack: 0 + UseSgprForGRO: -1 + UseSubtileImpl: true + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 1 + 
VectorWidthMXSA: 1 + VectorWidthMXSB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMXSA: 0 + WaveSeparateGlobalReadMXSB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 16 + WorkGroupMappingXCC: 2 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 256 + _DepthUA: 256 + _DepthUB: 256 + _DepthUMXSA: 8 + _DepthUMXSB: 8 + _DepthUMetadata: 256 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + enableLDSTrMXSA: false + enableLDSTrMXSB: false + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + tailLoopOptMXSA: false + tailLoopOptMXSB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AdaptiveGemm: 0 + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1DivByMT1LowbitGT1: 0 + AssertFree1ElementMultiple: 1 + AssertKRingShiftTailWrapOnly: 0 + AssertSummationElementMultiple: 256 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BAddrInterleave: false + BaseName: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT19UCUD2vak2YLHvH_Ztg5zXPDJUlX3NzBJWRuGhvKVzu8= + BufferLoad: true + BufferStore: 1 + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 256 + DirectToLds: 1 + DirectToLdsA: true + DirectToLdsB: true + DirectToLdsMXSA: true + DirectToLdsMXSB: true + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprMXSA: false + DirectToVgprMXSB: false + DirectToVgprSparseMetadata: false + DtlPlusLdsBuf: 0 
+ EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: false + ExpertSchedulingMode: 0 + ExtraLatencyForLR: 0 + ExtraMiLatencyLeft: -1 + ForceDisableShadowInit: false + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 32 + GlobalReadVectorWidthB: 32 + GlobalReadVectorWidthMXSA: 6 + GlobalReadVectorWidthMXSB: 2 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + KRingShift: false + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT192x64x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT6_2_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 256 + LSCB: 256 + LSCMXSA: 8 + LSCMXSB: 8 + LSPA: 32 + LSPB: 32 + LSPMXSA: 192 + LSPMXSB: 64 + LVCA: 8 + LVCB: 8 + LVCMXSA: 2 + LVCMXSB: 4 + LVPA: 1 + LVPB: 1 + LVPMXSA: 32 + LVPMXSB: 32 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsBlockSizePerPadMXSA: 0 + LdsBlockSizePerPadMXSB: 0 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 100352 + LdsInitCVgprs: false + LdsNumBytes: 100352 + LdsNumElementsAlignedA: 24576 + 
LdsNumElementsAlignedB: 8192 + LdsNumElementsAlignedMXSA: 1536 + LdsNumElementsAlignedMXSB: 512 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 26624 + LdsOffsetB_Blk: 92160 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMXSA: 24576 + LdsOffsetMXSA_Blk: 90112 + LdsOffsetMXSB: 26112 + LdsOffsetMXSB_Blk: 91648 + LdsOffsetMetadata: 26624 + LdsOffsetMetadata_Blk: 92160 + LdsPadA: 0 + LdsPadB: 0 + LdsPadMXSA: 0 + LdsPadMXSB: 0 + LdsPadMetadata: 0 + LocalReadVectorWidth: -1 + LocalReadVectorWidthA: 32 + LocalReadVectorWidthB: 32 + LocalReadVectorWidthMXSA: 1 + LocalReadVectorWidthMXSB: 1 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LocalWriteUseSgprMXSA: true + LocalWriteUseSgprMXSB: true + LoopIters: 2 + LoopUnroll: 256 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMXSA: 1 + MIInputPerThreadMXSB: 1 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [6, 2] + MIWaveTileA: 6 + MIWaveTileB: 2 + MIWaveTileMXSA: 6 + MIWaveTileMXSB: 2 + MIWaveTileMetadata: 0 + MacroTile0: 192 + MacroTile1: 64 + MacroTileA: 192 + MacroTileB: 64 + MacroTileMXSA: 192 + MacroTileMXSB: 64 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + MinGRIncPerMfma: 3 + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonDTLTailLoopMXSA: true + NonDTLTailLoopMXSB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 4 + NonTemporalE: 0 + NonTemporalMXSA: 0 + 
NonTemporalMXSB: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 48 + NumGlobalWriteVectorsPerThread: 48 + NumLdsBlk: 2 + NumLoadsA: 6 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsCoalescedMXSA: 1 + NumLoadsCoalescedMXSB: 1 + NumLoadsMXSA: 1 + NumLoadsMXSB: 1 + NumLoadsPerpendicularA: 6 + NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularMXSA: 1 + NumLoadsPerpendicularMXSB: 1 + NumThreads: 256 + NumTotalPackedLoadsA: -1 + NumTotalPackedLoadsB: -1 + NumTotalPackedLoadsMXSA: -1 + NumTotalPackedLoadsMXSB: -1 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGROverBarrier: 0 + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 7 + SolutionNameMin: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT192x64x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT6_2_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM16_WGMXCC2_WGMXCCGn1 + SourceSwap: false + SpaceFillingAlgo: [] + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 4 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SubGroupMXSA: 8 + SubGroupMXSB: 32 + 
SuppressNoLoadLoop: false + SwapGlobalReadOrder: 0 + TailloopInNll: false + ThreadTile: [1, 1] + ThreadTile0: 24 + ThreadTile1: 2 + ThreadTileA: 24 + ThreadTileB: 2 + ThreadTileMXSA: 24 + ThreadTileMXSB: 2 + TransposeLDS: 1 + TransposeLDSMetadata: -1 + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMXSA: true + UnrollMajorLDSMXSB: true + Use64bShadowLimit: false + Use64bShadowLimitMX: false + UseCustomMainLoopSchedule: 0 + UseDirect32XEmulation: false + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: false + UseGeneralizedNLCOneA: false + UseGeneralizedNLCOneB: false + UseGeneralizedNLCOneMXSA: false + UseGeneralizedNLCOneMXSB: false + UseGeneralizedNLCOneMetadata: false + UseInstOffsetForGRO: 0 + UseMFMAF32XEmulation: false + UsePLRPack: 0 + UseSgprForGRO: -1 + UseSubtileImpl: true + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 1 + VectorWidthMXSA: 1 + VectorWidthMXSB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMXSA: 0 + WaveSeparateGlobalReadMXSB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 16 + WorkGroupMappingXCC: 2 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 256 + _DepthUA: 256 + _DepthUB: 256 + _DepthUMXSA: 8 + _DepthUMXSB: 8 + _DepthUMetadata: 256 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + enableLDSTrMXSA: false + enableLDSTrMXSB: false + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + tailLoopOptMXSA: false + tailLoopOptMXSB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: false 
+ ActivationFused: true + AdaptiveGemm: 0 + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1DivByMT1LowbitGT1: 0 + AssertFree1ElementMultiple: 1 + AssertKRingShiftTailWrapOnly: 0 + AssertSummationElementMultiple: 256 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BAddrInterleave: false + BaseName: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT64sCK37FdA-O3yvoTtVtg3QFjEpjbAv0RNmVuI_Ry968A= + BufferLoad: true + BufferStore: 1 + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 256 + DirectToLds: 1 + DirectToLdsA: true + DirectToLdsB: true + DirectToLdsMXSA: true + DirectToLdsMXSB: true + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprMXSA: false + DirectToVgprMXSB: false + DirectToVgprSparseMetadata: false + DtlPlusLdsBuf: 0 + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: false + ExpertSchedulingMode: 0 + ExtraLatencyForLR: 0 + ExtraMiLatencyLeft: -1 + ForceDisableShadowInit: false + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 32 + GlobalReadVectorWidthB: 32 + GlobalReadVectorWidthMXSA: 2 + GlobalReadVectorWidthMXSB: 4 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + KRingShift: false + Kernel: true + KernelLanguage: Assembly + KernelNameMin: 
Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT64x256x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT2_8_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 256 + LSCB: 256 + LSCMXSA: 8 + LSCMXSB: 8 + LSPA: 32 + LSPB: 32 + LSPMXSA: 64 + LSPMXSB: 128 + LVCA: 8 + LVCB: 8 + LVCMXSA: 4 + LVCMXSB: 2 + LVPA: 1 + LVPB: 1 + LVPMXSA: 32 + LVPMXSB: 32 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsBlockSizePerPadMXSA: 0 + LdsBlockSizePerPadMXSB: 0 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 109056 + LdsInitCVgprs: false + LdsNumBytes: 109056 + LdsNumElementsAlignedA: 8192 + LdsNumElementsAlignedB: 32768 + LdsNumElementsAlignedMXSA: 512 + LdsNumElementsAlignedMXSB: 2048 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 10752 + LdsOffsetB_Blk: 76288 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMXSA: 8192 + LdsOffsetMXSA_Blk: 73728 + LdsOffsetMXSB: 8704 + LdsOffsetMXSB_Blk: 74240 + LdsOffsetMetadata: 10752 + LdsOffsetMetadata_Blk: 76288 + LdsPadA: 0 + LdsPadB: 0 + LdsPadMXSA: 0 + LdsPadMXSB: 0 + LdsPadMetadata: 0 + LocalReadVectorWidth: -1 + LocalReadVectorWidthA: 32 + LocalReadVectorWidthB: 32 + LocalReadVectorWidthMXSA: 1 + LocalReadVectorWidthMXSB: 1 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LocalWriteUseSgprMXSA: true + LocalWriteUseSgprMXSB: true + LoopIters: 2 + LoopUnroll: 256 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + 
MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMXSA: 1 + MIInputPerThreadMXSB: 1 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [2, 8] + MIWaveTileA: 2 + MIWaveTileB: 8 + MIWaveTileMXSA: 2 + MIWaveTileMXSB: 8 + MIWaveTileMetadata: 0 + MacroTile0: 64 + MacroTile1: 256 + MacroTileA: 64 + MacroTileB: 256 + MacroTileMXSA: 64 + MacroTileMXSB: 256 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + MinGRIncPerMfma: 3 + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonDTLTailLoopMXSA: true + NonDTLTailLoopMXSB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 4 + NonTemporalE: 0 + NonTemporalMXSA: 0 + NonTemporalMXSB: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 64 + NumLdsBlk: 2 + NumLoadsA: 2 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsCoalescedMXSA: 1 + NumLoadsCoalescedMXSB: 1 + NumLoadsMXSA: 1 + NumLoadsMXSB: 2 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 8 + NumLoadsPerpendicularMXSA: 1 + NumLoadsPerpendicularMXSB: 2 + NumThreads: 256 + NumTotalPackedLoadsA: -1 + NumTotalPackedLoadsB: -1 + NumTotalPackedLoadsMXSA: -1 + NumTotalPackedLoadsMXSB: -1 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGROverBarrier: 0 + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 8 + SolutionNameMin: 
Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT64x256x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT2_8_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM16_WGMXCC2_WGMXCCGn1 + SourceSwap: false + SpaceFillingAlgo: [] + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 4 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SubGroupMXSA: 8 + SubGroupMXSB: 32 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: 0 + TailloopInNll: false + ThreadTile: [1, 1] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + ThreadTileMXSA: 8 + ThreadTileMXSB: 8 + TransposeLDS: 1 + TransposeLDSMetadata: -1 + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMXSA: true + UnrollMajorLDSMXSB: true + Use64bShadowLimit: false + Use64bShadowLimitMX: false + UseCustomMainLoopSchedule: 0 + UseDirect32XEmulation: false + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: false + UseGeneralizedNLCOneA: false + UseGeneralizedNLCOneB: false + UseGeneralizedNLCOneMXSA: false + UseGeneralizedNLCOneMXSB: false + UseGeneralizedNLCOneMetadata: false + UseInstOffsetForGRO: 0 + UseMFMAF32XEmulation: false + UsePLRPack: 0 + UseSgprForGRO: -1 + UseSubtileImpl: true + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 1 + 
VectorWidthMXSA: 1 + VectorWidthMXSB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMXSA: 0 + WaveSeparateGlobalReadMXSB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 16 + WorkGroupMappingXCC: 2 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 256 + _DepthUA: 256 + _DepthUB: 256 + _DepthUMXSA: 8 + _DepthUMXSB: 8 + _DepthUMetadata: 256 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + enableLDSTrMXSA: false + enableLDSTrMXSB: false + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + tailLoopOptMXSA: false + tailLoopOptMXSB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AdaptiveGemm: 0 + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1DivByMT1LowbitGT1: 0 + AssertFree1ElementMultiple: 1 + AssertKRingShiftTailWrapOnly: 0 + AssertSummationElementMultiple: 256 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BAddrInterleave: false + BaseName: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT25Tmym63Cp1OVwmuklAuBYLzHtSt0oUezVbK-tmV8hqYs= + BufferLoad: true + BufferStore: 1 + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 256 + DirectToLds: 1 + DirectToLdsA: true + DirectToLdsB: true + DirectToLdsMXSA: true + DirectToLdsMXSB: true + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprMXSA: false + DirectToVgprMXSB: false + DirectToVgprSparseMetadata: false + DtlPlusLdsBuf: 0 
+ EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: false + ExpertSchedulingMode: 0 + ExtraLatencyForLR: 0 + ExtraMiLatencyLeft: -1 + ForceDisableShadowInit: false + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 32 + GlobalReadVectorWidthB: 32 + GlobalReadVectorWidthMXSA: 4 + GlobalReadVectorWidthMXSB: 2 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + KRingShift: false + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT256x64x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT8_2_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 256 + LSCB: 256 + LSCMXSA: 8 + LSCMXSB: 8 + LSPA: 32 + LSPB: 32 + LSPMXSA: 128 + LSPMXSB: 64 + LVCA: 8 + LVCB: 8 + LVCMXSA: 2 + LVCMXSB: 4 + LVPA: 1 + LVPB: 1 + LVPMXSA: 32 + LVPMXSB: 32 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsBlockSizePerPadMXSA: 0 + LdsBlockSizePerPadMXSB: 0 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 109056 + LdsInitCVgprs: false + LdsNumBytes: 109056 + LdsNumElementsAlignedA: 32768 + 
LdsNumElementsAlignedB: 8192 + LdsNumElementsAlignedMXSA: 2048 + LdsNumElementsAlignedMXSB: 512 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 35328 + LdsOffsetB_Blk: 100864 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMXSA: 32768 + LdsOffsetMXSA_Blk: 98304 + LdsOffsetMXSB: 34816 + LdsOffsetMXSB_Blk: 100352 + LdsOffsetMetadata: 35328 + LdsOffsetMetadata_Blk: 100864 + LdsPadA: 0 + LdsPadB: 0 + LdsPadMXSA: 0 + LdsPadMXSB: 0 + LdsPadMetadata: 0 + LocalReadVectorWidth: -1 + LocalReadVectorWidthA: 32 + LocalReadVectorWidthB: 32 + LocalReadVectorWidthMXSA: 1 + LocalReadVectorWidthMXSB: 1 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LocalWriteUseSgprMXSA: true + LocalWriteUseSgprMXSB: true + LoopIters: 2 + LoopUnroll: 256 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMXSA: 1 + MIInputPerThreadMXSB: 1 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [8, 2] + MIWaveTileA: 8 + MIWaveTileB: 2 + MIWaveTileMXSA: 8 + MIWaveTileMXSB: 2 + MIWaveTileMetadata: 0 + MacroTile0: 256 + MacroTile1: 64 + MacroTileA: 256 + MacroTileB: 64 + MacroTileMXSA: 256 + MacroTileMXSB: 64 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + MinGRIncPerMfma: 3 + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonDTLTailLoopMXSA: true + NonDTLTailLoopMXSB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 4 + NonTemporalE: 0 + NonTemporalMXSA: 0 + 
NonTemporalMXSB: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 64 + NumLdsBlk: 2 + NumLoadsA: 8 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsCoalescedMXSA: 1 + NumLoadsCoalescedMXSB: 1 + NumLoadsMXSA: 2 + NumLoadsMXSB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularMXSA: 2 + NumLoadsPerpendicularMXSB: 1 + NumThreads: 256 + NumTotalPackedLoadsA: -1 + NumTotalPackedLoadsB: -1 + NumTotalPackedLoadsMXSA: -1 + NumTotalPackedLoadsMXSB: -1 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGROverBarrier: 0 + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 9 + SolutionNameMin: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT256x64x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT8_2_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM16_WGMXCC2_WGMXCCGn1 + SourceSwap: false + SpaceFillingAlgo: [] + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 4 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SubGroupMXSA: 8 + SubGroupMXSB: 32 + 
SuppressNoLoadLoop: false + SwapGlobalReadOrder: 0 + TailloopInNll: false + ThreadTile: [1, 1] + ThreadTile0: 32 + ThreadTile1: 2 + ThreadTileA: 32 + ThreadTileB: 2 + ThreadTileMXSA: 32 + ThreadTileMXSB: 2 + TransposeLDS: 1 + TransposeLDSMetadata: -1 + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMXSA: true + UnrollMajorLDSMXSB: true + Use64bShadowLimit: false + Use64bShadowLimitMX: false + UseCustomMainLoopSchedule: 0 + UseDirect32XEmulation: false + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: false + UseGeneralizedNLCOneA: false + UseGeneralizedNLCOneB: false + UseGeneralizedNLCOneMXSA: false + UseGeneralizedNLCOneMXSB: false + UseGeneralizedNLCOneMetadata: false + UseInstOffsetForGRO: 0 + UseMFMAF32XEmulation: false + UsePLRPack: 0 + UseSgprForGRO: -1 + UseSubtileImpl: true + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 1 + VectorWidthMXSA: 1 + VectorWidthMXSB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMXSA: 0 + WaveSeparateGlobalReadMXSB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 16 + WorkGroupMappingXCC: 2 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 256 + _DepthUA: 256 + _DepthUB: 256 + _DepthUMXSA: 8 + _DepthUMXSB: 8 + _DepthUMetadata: 256 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + enableLDSTrMXSA: false + enableLDSTrMXSB: false + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + tailLoopOptMXSA: false + tailLoopOptMXSB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: false 
+ ActivationFused: true + AdaptiveGemm: 0 + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1DivByMT1LowbitGT1: 0 + AssertFree1ElementMultiple: 1 + AssertKRingShiftTailWrapOnly: 0 + AssertSummationElementMultiple: 256 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BAddrInterleave: false + BaseName: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT12Qs31lTSyFaUbDqXQHGpUViUgkCHLEWWfNifVlN3xPcA= + BufferLoad: true + BufferStore: 1 + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 256 + DirectToLds: 1 + DirectToLdsA: true + DirectToLdsB: true + DirectToLdsMXSA: true + DirectToLdsMXSB: true + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprMXSA: false + DirectToVgprMXSB: false + DirectToVgprSparseMetadata: false + DtlPlusLdsBuf: 0 + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: false + ExpertSchedulingMode: 0 + ExtraLatencyForLR: 0 + ExtraMiLatencyLeft: -1 + ForceDisableShadowInit: false + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 32 + GlobalReadVectorWidthB: 32 + GlobalReadVectorWidthMXSA: 4 + GlobalReadVectorWidthMXSB: 6 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + KRingShift: false + Kernel: true + KernelLanguage: Assembly + KernelNameMin: 
Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT128x192x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT4_6_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 256 + LSCB: 256 + LSCMXSA: 8 + LSCMXSB: 8 + LSPA: 32 + LSPB: 32 + LSPMXSA: 128 + LSPMXSB: 192 + LVCA: 8 + LVCB: 8 + LVCMXSA: 2 + LVCMXSB: 2 + LVPA: 1 + LVPB: 1 + LVPMXSA: 32 + LVPMXSB: 32 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsBlockSizePerPadMXSA: 0 + LdsBlockSizePerPadMXSB: 0 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 109056 + LdsInitCVgprs: false + LdsNumBytes: 109056 + LdsNumElementsAlignedA: 16384 + LdsNumElementsAlignedB: 24576 + LdsNumElementsAlignedMXSA: 1024 + LdsNumElementsAlignedMXSB: 1536 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 18944 + LdsOffsetB_Blk: 84480 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMXSA: 16384 + LdsOffsetMXSA_Blk: 81920 + LdsOffsetMXSB: 17408 + LdsOffsetMXSB_Blk: 82944 + LdsOffsetMetadata: 18944 + LdsOffsetMetadata_Blk: 84480 + LdsPadA: 0 + LdsPadB: 0 + LdsPadMXSA: 0 + LdsPadMXSB: 0 + LdsPadMetadata: 0 + LocalReadVectorWidth: -1 + LocalReadVectorWidthA: 32 + LocalReadVectorWidthB: 32 + LocalReadVectorWidthMXSA: 1 + LocalReadVectorWidthMXSB: 1 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LocalWriteUseSgprMXSA: true + LocalWriteUseSgprMXSB: true + LoopIters: 2 + LoopUnroll: 256 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + 
MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMXSA: 1 + MIInputPerThreadMXSB: 1 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 6] + MIWaveTileA: 4 + MIWaveTileB: 6 + MIWaveTileMXSA: 4 + MIWaveTileMXSB: 6 + MIWaveTileMetadata: 0 + MacroTile0: 128 + MacroTile1: 192 + MacroTileA: 128 + MacroTileB: 192 + MacroTileMXSA: 128 + MacroTileMXSB: 192 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + MinGRIncPerMfma: 3 + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonDTLTailLoopMXSA: true + NonDTLTailLoopMXSB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 4 + NonTemporalE: 0 + NonTemporalMXSA: 0 + NonTemporalMXSB: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 96 + NumGlobalWriteVectorsPerThread: 96 + NumLdsBlk: 2 + NumLoadsA: 4 + NumLoadsB: 6 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsCoalescedMXSA: 1 + NumLoadsCoalescedMXSB: 1 + NumLoadsMXSA: 1 + NumLoadsMXSB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 6 + NumLoadsPerpendicularMXSA: 1 + NumLoadsPerpendicularMXSB: 1 + NumThreads: 256 + NumTotalPackedLoadsA: -1 + NumTotalPackedLoadsB: -1 + NumTotalPackedLoadsMXSA: -1 + NumTotalPackedLoadsMXSB: -1 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGROverBarrier: 0 + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 10 + 
SolutionNameMin: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT128x192x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT4_6_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM16_WGMXCC2_WGMXCCGn1 + SourceSwap: false + SpaceFillingAlgo: [] + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 4 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SubGroupMXSA: 8 + SubGroupMXSB: 32 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: 0 + TailloopInNll: false + ThreadTile: [1, 1] + ThreadTile0: 16 + ThreadTile1: 6 + ThreadTileA: 16 + ThreadTileB: 6 + ThreadTileMXSA: 16 + ThreadTileMXSB: 6 + TransposeLDS: 1 + TransposeLDSMetadata: -1 + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMXSA: true + UnrollMajorLDSMXSB: true + Use64bShadowLimit: false + Use64bShadowLimitMX: false + UseCustomMainLoopSchedule: 0 + UseDirect32XEmulation: false + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: false + UseGeneralizedNLCOneA: false + UseGeneralizedNLCOneB: false + UseGeneralizedNLCOneMXSA: false + UseGeneralizedNLCOneMXSB: false + UseGeneralizedNLCOneMetadata: false + UseInstOffsetForGRO: 0 + UseMFMAF32XEmulation: false + UsePLRPack: 0 + UseSgprForGRO: -1 + UseSubtileImpl: true + Valid: true + VectorStore: -1 + VectorWidthA: 1 + 
VectorWidthB: 1 + VectorWidthMXSA: 1 + VectorWidthMXSB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMXSA: 0 + WaveSeparateGlobalReadMXSB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 16 + WorkGroupMappingXCC: 2 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 256 + _DepthUA: 256 + _DepthUB: 256 + _DepthUMXSA: 8 + _DepthUMXSB: 8 + _DepthUMetadata: 256 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + enableLDSTrMXSA: false + enableLDSTrMXSB: false + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + tailLoopOptMXSA: false + tailLoopOptMXSB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AdaptiveGemm: 0 + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1DivByMT1LowbitGT1: 0 + AssertFree1ElementMultiple: 1 + AssertKRingShiftTailWrapOnly: 0 + AssertSummationElementMultiple: 256 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BAddrInterleave: false + BaseName: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT191Q3CWl85SU-4agRfSbxDDRYJaTIDmYF_adG8G4wdfNo= + BufferLoad: true + BufferStore: 1 + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 256 + DirectToLds: 1 + DirectToLdsA: true + DirectToLdsB: true + DirectToLdsMXSA: true + DirectToLdsMXSB: true + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprMXSA: false + DirectToVgprMXSB: false + DirectToVgprSparseMetadata: false + 
DtlPlusLdsBuf: 0 + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: false + ExpertSchedulingMode: 0 + ExtraLatencyForLR: 0 + ExtraMiLatencyLeft: -1 + ForceDisableShadowInit: false + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 32 + GlobalReadVectorWidthB: 32 + GlobalReadVectorWidthMXSA: 6 + GlobalReadVectorWidthMXSB: 4 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + KRingShift: false + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT192x128x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT6_4_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 256 + LSCB: 256 + LSCMXSA: 8 + LSCMXSB: 8 + LSPA: 32 + LSPB: 32 + LSPMXSA: 192 + LSPMXSB: 128 + LVCA: 8 + LVCB: 8 + LVCMXSA: 2 + LVCMXSB: 2 + LVPA: 1 + LVPB: 1 + LVPMXSA: 32 + LVPMXSB: 32 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsBlockSizePerPadMXSA: 0 + LdsBlockSizePerPadMXSB: 0 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 109056 + LdsInitCVgprs: false + LdsNumBytes: 109056 + 
LdsNumElementsAlignedA: 24576 + LdsNumElementsAlignedB: 16384 + LdsNumElementsAlignedMXSA: 1536 + LdsNumElementsAlignedMXSB: 1024 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 27136 + LdsOffsetB_Blk: 92672 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMXSA: 24576 + LdsOffsetMXSA_Blk: 90112 + LdsOffsetMXSB: 26112 + LdsOffsetMXSB_Blk: 91648 + LdsOffsetMetadata: 27136 + LdsOffsetMetadata_Blk: 92672 + LdsPadA: 0 + LdsPadB: 0 + LdsPadMXSA: 0 + LdsPadMXSB: 0 + LdsPadMetadata: 0 + LocalReadVectorWidth: -1 + LocalReadVectorWidthA: 32 + LocalReadVectorWidthB: 32 + LocalReadVectorWidthMXSA: 1 + LocalReadVectorWidthMXSB: 1 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LocalWriteUseSgprMXSA: true + LocalWriteUseSgprMXSB: true + LoopIters: 2 + LoopUnroll: 256 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMXSA: 1 + MIInputPerThreadMXSB: 1 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [6, 4] + MIWaveTileA: 6 + MIWaveTileB: 4 + MIWaveTileMXSA: 6 + MIWaveTileMXSB: 4 + MIWaveTileMetadata: 0 + MacroTile0: 192 + MacroTile1: 128 + MacroTileA: 192 + MacroTileB: 128 + MacroTileMXSA: 192 + MacroTileMXSB: 128 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + MinGRIncPerMfma: 3 + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonDTLTailLoopMXSA: true + NonDTLTailLoopMXSB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 4 + NonTemporalE: 
0 + NonTemporalMXSA: 0 + NonTemporalMXSB: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 96 + NumGlobalWriteVectorsPerThread: 96 + NumLdsBlk: 2 + NumLoadsA: 6 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsCoalescedMXSA: 1 + NumLoadsCoalescedMXSB: 1 + NumLoadsMXSA: 1 + NumLoadsMXSB: 1 + NumLoadsPerpendicularA: 6 + NumLoadsPerpendicularB: 4 + NumLoadsPerpendicularMXSA: 1 + NumLoadsPerpendicularMXSB: 1 + NumThreads: 256 + NumTotalPackedLoadsA: -1 + NumTotalPackedLoadsB: -1 + NumTotalPackedLoadsMXSA: -1 + NumTotalPackedLoadsMXSB: -1 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGROverBarrier: 0 + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 11 + SolutionNameMin: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT192x128x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT6_4_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM16_WGMXCC2_WGMXCCGn1 + SourceSwap: false + SpaceFillingAlgo: [] + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 4 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + 
SubGroupMXSA: 8 + SubGroupMXSB: 32 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: 0 + TailloopInNll: false + ThreadTile: [1, 1] + ThreadTile0: 24 + ThreadTile1: 4 + ThreadTileA: 24 + ThreadTileB: 4 + ThreadTileMXSA: 24 + ThreadTileMXSB: 4 + TransposeLDS: 1 + TransposeLDSMetadata: -1 + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMXSA: true + UnrollMajorLDSMXSB: true + Use64bShadowLimit: false + Use64bShadowLimitMX: false + UseCustomMainLoopSchedule: 0 + UseDirect32XEmulation: false + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: false + UseGeneralizedNLCOneA: false + UseGeneralizedNLCOneB: false + UseGeneralizedNLCOneMXSA: false + UseGeneralizedNLCOneMXSB: false + UseGeneralizedNLCOneMetadata: false + UseInstOffsetForGRO: 0 + UseMFMAF32XEmulation: false + UsePLRPack: 0 + UseSgprForGRO: -1 + UseSubtileImpl: true + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 1 + VectorWidthMXSA: 1 + VectorWidthMXSB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMXSA: 0 + WaveSeparateGlobalReadMXSB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 16 + WorkGroupMappingXCC: 2 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 256 + _DepthUA: 256 + _DepthUB: 256 + _DepthUMXSA: 8 + _DepthUMXSB: 8 + _DepthUMetadata: 256 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + enableLDSTrMXSA: false + enableLDSTrMXSB: false + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + tailLoopOptMXSA: false + tailLoopOptMXSB: false + - 1LDSBuffer: 0 + 
ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AdaptiveGemm: 0 + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1DivByMT1LowbitGT1: 0 + AssertFree1ElementMultiple: 1 + AssertKRingShiftTailWrapOnly: 0 + AssertSummationElementMultiple: 256 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BAddrInterleave: false + BaseName: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT12kwdCHpnp9d41ufLMEwnWbNNXbmlKzUrCttuc0YSlgSY= + BufferLoad: true + BufferStore: 1 + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 256 + DirectToLds: 1 + DirectToLdsA: true + DirectToLdsB: true + DirectToLdsMXSA: true + DirectToLdsMXSB: true + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprMXSA: false + DirectToVgprMXSB: false + DirectToVgprSparseMetadata: false + DtlPlusLdsBuf: 0 + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: false + ExpertSchedulingMode: 0 + ExtraLatencyForLR: 0 + ExtraMiLatencyLeft: -1 + ForceDisableShadowInit: false + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 32 + GlobalReadVectorWidthB: 32 + GlobalReadVectorWidthMXSA: 4 + GlobalReadVectorWidthMXSB: 4 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + KRingShift: false + Kernel: true + KernelLanguage: Assembly + KernelNameMin: 
Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT128x256x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT4_8_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 256 + LSCB: 256 + LSCMXSA: 8 + LSCMXSB: 8 + LSPA: 32 + LSPB: 32 + LSPMXSA: 128 + LSPMXSB: 128 + LVCA: 8 + LVCB: 8 + LVCMXSA: 2 + LVCMXSB: 2 + LVPA: 1 + LVPB: 1 + LVPMXSA: 32 + LVPMXSB: 32 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsBlockSizePerPadMXSA: 0 + LdsBlockSizePerPadMXSB: 0 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 117760 + LdsInitCVgprs: false + LdsNumBytes: 117760 + LdsNumElementsAlignedA: 16384 + LdsNumElementsAlignedB: 32768 + LdsNumElementsAlignedMXSA: 1024 + LdsNumElementsAlignedMXSB: 2048 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 19456 + LdsOffsetB_Blk: 84992 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMXSA: 16384 + LdsOffsetMXSA_Blk: 81920 + LdsOffsetMXSB: 17408 + LdsOffsetMXSB_Blk: 82944 + LdsOffsetMetadata: 19456 + LdsOffsetMetadata_Blk: 84992 + LdsPadA: 0 + LdsPadB: 0 + LdsPadMXSA: 0 + LdsPadMXSB: 0 + LdsPadMetadata: 0 + LocalReadVectorWidth: -1 + LocalReadVectorWidthA: 32 + LocalReadVectorWidthB: 32 + LocalReadVectorWidthMXSA: 1 + LocalReadVectorWidthMXSB: 1 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LocalWriteUseSgprMXSA: true + LocalWriteUseSgprMXSB: true + LoopIters: 2 + LoopUnroll: 256 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + 
MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMXSA: 1 + MIInputPerThreadMXSB: 1 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 8] + MIWaveTileA: 4 + MIWaveTileB: 8 + MIWaveTileMXSA: 4 + MIWaveTileMXSB: 8 + MIWaveTileMetadata: 0 + MacroTile0: 128 + MacroTile1: 256 + MacroTileA: 128 + MacroTileB: 256 + MacroTileMXSA: 128 + MacroTileMXSB: 256 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + MinGRIncPerMfma: 3 + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonDTLTailLoopMXSA: true + NonDTLTailLoopMXSB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 4 + NonTemporalE: 0 + NonTemporalMXSA: 0 + NonTemporalMXSB: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 128 + NumLdsBlk: 2 + NumLoadsA: 4 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsCoalescedMXSA: 1 + NumLoadsCoalescedMXSB: 1 + NumLoadsMXSA: 1 + NumLoadsMXSB: 2 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 8 + NumLoadsPerpendicularMXSA: 1 + NumLoadsPerpendicularMXSB: 2 + NumThreads: 256 + NumTotalPackedLoadsA: -1 + NumTotalPackedLoadsB: -1 + NumTotalPackedLoadsMXSA: -1 + NumTotalPackedLoadsMXSB: -1 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGROverBarrier: 0 + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 12 + 
SolutionNameMin: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT128x256x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT4_8_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM16_WGMXCC2_WGMXCCGn1 + SourceSwap: false + SpaceFillingAlgo: [] + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 4 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SubGroupMXSA: 8 + SubGroupMXSB: 32 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: 0 + TailloopInNll: false + ThreadTile: [1, 1] + ThreadTile0: 16 + ThreadTile1: 8 + ThreadTileA: 16 + ThreadTileB: 8 + ThreadTileMXSA: 16 + ThreadTileMXSB: 8 + TransposeLDS: 1 + TransposeLDSMetadata: -1 + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMXSA: true + UnrollMajorLDSMXSB: true + Use64bShadowLimit: false + Use64bShadowLimitMX: false + UseCustomMainLoopSchedule: 0 + UseDirect32XEmulation: false + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: false + UseGeneralizedNLCOneA: false + UseGeneralizedNLCOneB: false + UseGeneralizedNLCOneMXSA: false + UseGeneralizedNLCOneMXSB: false + UseGeneralizedNLCOneMetadata: false + UseInstOffsetForGRO: 0 + UseMFMAF32XEmulation: false + UsePLRPack: 0 + UseSgprForGRO: -1 + UseSubtileImpl: true + Valid: true + VectorStore: -1 + VectorWidthA: 1 + 
VectorWidthB: 1 + VectorWidthMXSA: 1 + VectorWidthMXSB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMXSA: 0 + WaveSeparateGlobalReadMXSB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 16 + WorkGroupMappingXCC: 2 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 256 + _DepthUA: 256 + _DepthUB: 256 + _DepthUMXSA: 8 + _DepthUMXSB: 8 + _DepthUMetadata: 256 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + enableLDSTrMXSA: false + enableLDSTrMXSB: false + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + tailLoopOptMXSA: false + tailLoopOptMXSB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AdaptiveGemm: 0 + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1DivByMT1LowbitGT1: 0 + AssertFree1ElementMultiple: 1 + AssertKRingShiftTailWrapOnly: 0 + AssertSummationElementMultiple: 256 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BAddrInterleave: false + BaseName: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT25s2SqPqjqQTrutnl4Fjsck3utFxYLnrt2GxRGoMabbq0= + BufferLoad: true + BufferStore: 1 + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 256 + DirectToLds: 1 + DirectToLdsA: true + DirectToLdsB: true + DirectToLdsMXSA: true + DirectToLdsMXSB: true + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprMXSA: false + DirectToVgprMXSB: false + DirectToVgprSparseMetadata: false + 
DtlPlusLdsBuf: 0 + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: false + ExpertSchedulingMode: 0 + ExtraLatencyForLR: 0 + ExtraMiLatencyLeft: -1 + ForceDisableShadowInit: false + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 32 + GlobalReadVectorWidthB: 32 + GlobalReadVectorWidthMXSA: 4 + GlobalReadVectorWidthMXSB: 4 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + KRingShift: false + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT256x128x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT8_4_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 256 + LSCB: 256 + LSCMXSA: 8 + LSCMXSB: 8 + LSPA: 32 + LSPB: 32 + LSPMXSA: 128 + LSPMXSB: 128 + LVCA: 8 + LVCB: 8 + LVCMXSA: 2 + LVCMXSB: 2 + LVPA: 1 + LVPB: 1 + LVPMXSA: 32 + LVPMXSB: 32 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsBlockSizePerPadMXSA: 0 + LdsBlockSizePerPadMXSB: 0 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 117760 + LdsInitCVgprs: false + LdsNumBytes: 117760 + 
LdsNumElementsAlignedA: 32768 + LdsNumElementsAlignedB: 16384 + LdsNumElementsAlignedMXSA: 2048 + LdsNumElementsAlignedMXSB: 1024 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 35840 + LdsOffsetB_Blk: 101376 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMXSA: 32768 + LdsOffsetMXSA_Blk: 98304 + LdsOffsetMXSB: 34816 + LdsOffsetMXSB_Blk: 100352 + LdsOffsetMetadata: 35840 + LdsOffsetMetadata_Blk: 101376 + LdsPadA: 0 + LdsPadB: 0 + LdsPadMXSA: 0 + LdsPadMXSB: 0 + LdsPadMetadata: 0 + LocalReadVectorWidth: -1 + LocalReadVectorWidthA: 32 + LocalReadVectorWidthB: 32 + LocalReadVectorWidthMXSA: 1 + LocalReadVectorWidthMXSB: 1 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LocalWriteUseSgprMXSA: true + LocalWriteUseSgprMXSB: true + LoopIters: 2 + LoopUnroll: 256 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMXSA: 1 + MIInputPerThreadMXSB: 1 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [8, 4] + MIWaveTileA: 8 + MIWaveTileB: 4 + MIWaveTileMXSA: 8 + MIWaveTileMXSB: 4 + MIWaveTileMetadata: 0 + MacroTile0: 256 + MacroTile1: 128 + MacroTileA: 256 + MacroTileB: 128 + MacroTileMXSA: 256 + MacroTileMXSB: 128 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + MinGRIncPerMfma: 3 + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonDTLTailLoopMXSA: true + NonDTLTailLoopMXSB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 4 + 
NonTemporalE: 0 + NonTemporalMXSA: 0 + NonTemporalMXSB: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 128 + NumLdsBlk: 2 + NumLoadsA: 8 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsCoalescedMXSA: 1 + NumLoadsCoalescedMXSB: 1 + NumLoadsMXSA: 2 + NumLoadsMXSB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 4 + NumLoadsPerpendicularMXSA: 2 + NumLoadsPerpendicularMXSB: 1 + NumThreads: 256 + NumTotalPackedLoadsA: -1 + NumTotalPackedLoadsB: -1 + NumTotalPackedLoadsMXSA: -1 + NumTotalPackedLoadsMXSB: -1 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGROverBarrier: 0 + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 13 + SolutionNameMin: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT256x128x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT8_4_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM16_WGMXCC2_WGMXCCGn1 + SourceSwap: false + SpaceFillingAlgo: [] + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 4 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 
32 + SubGroupMXSA: 8 + SubGroupMXSB: 32 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: 0 + TailloopInNll: false + ThreadTile: [1, 1] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + ThreadTileMXSA: 32 + ThreadTileMXSB: 4 + TransposeLDS: 1 + TransposeLDSMetadata: -1 + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMXSA: true + UnrollMajorLDSMXSB: true + Use64bShadowLimit: false + Use64bShadowLimitMX: false + UseCustomMainLoopSchedule: 0 + UseDirect32XEmulation: false + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: false + UseGeneralizedNLCOneA: false + UseGeneralizedNLCOneB: false + UseGeneralizedNLCOneMXSA: false + UseGeneralizedNLCOneMXSB: false + UseGeneralizedNLCOneMetadata: false + UseInstOffsetForGRO: 0 + UseMFMAF32XEmulation: false + UsePLRPack: 0 + UseSgprForGRO: -1 + UseSubtileImpl: true + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 1 + VectorWidthMXSA: 1 + VectorWidthMXSB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMXSA: 0 + WaveSeparateGlobalReadMXSB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 16 + WorkGroupMappingXCC: 2 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 256 + _DepthUA: 256 + _DepthUB: 256 + _DepthUMXSA: 8 + _DepthUMXSB: 8 + _DepthUMetadata: 256 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + enableLDSTrMXSA: false + enableLDSTrMXSB: false + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + tailLoopOptMXSA: false + tailLoopOptMXSB: false + - 1LDSBuffer: 0 + 
ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AdaptiveGemm: 0 + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1DivByMT1LowbitGT1: 0 + AssertFree1ElementMultiple: 1 + AssertKRingShiftTailWrapOnly: 0 + AssertSummationElementMultiple: 256 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BAddrInterleave: false + BaseName: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT19--ULatdGkaEyMxugPoO0m7KB92W8g0H0S_IE47lq-mE= + BufferLoad: true + BufferStore: 1 + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 256 + DirectToLds: 1 + DirectToLdsA: true + DirectToLdsB: true + DirectToLdsMXSA: true + DirectToLdsMXSB: true + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprMXSA: false + DirectToVgprMXSB: false + DirectToVgprSparseMetadata: false + DtlPlusLdsBuf: 0 + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: false + ExpertSchedulingMode: 0 + ExtraLatencyForLR: 0 + ExtraMiLatencyLeft: -1 + ForceDisableShadowInit: false + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 32 + GlobalReadVectorWidthB: 32 + GlobalReadVectorWidthMXSA: 6 + GlobalReadVectorWidthMXSB: 4 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + KRingShift: false + Kernel: true + KernelLanguage: Assembly + KernelNameMin: 
Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT192x256x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT6_8_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 256 + LSCB: 256 + LSCMXSA: 8 + LSCMXSB: 8 + LSPA: 32 + LSPB: 32 + LSPMXSA: 192 + LSPMXSB: 128 + LVCA: 8 + LVCB: 8 + LVCMXSA: 2 + LVCMXSB: 2 + LVPA: 1 + LVPB: 1 + LVPMXSA: 32 + LVPMXSB: 32 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsBlockSizePerPadMXSA: 0 + LdsBlockSizePerPadMXSB: 0 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 126464 + LdsInitCVgprs: false + LdsNumBytes: 126464 + LdsNumElementsAlignedA: 24576 + LdsNumElementsAlignedB: 32768 + LdsNumElementsAlignedMXSA: 1536 + LdsNumElementsAlignedMXSB: 2048 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 28160 + LdsOffsetB_Blk: 93696 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMXSA: 24576 + LdsOffsetMXSA_Blk: 90112 + LdsOffsetMXSB: 26112 + LdsOffsetMXSB_Blk: 91648 + LdsOffsetMetadata: 28160 + LdsOffsetMetadata_Blk: 93696 + LdsPadA: 0 + LdsPadB: 0 + LdsPadMXSA: 0 + LdsPadMXSB: 0 + LdsPadMetadata: 0 + LocalReadVectorWidth: -1 + LocalReadVectorWidthA: 32 + LocalReadVectorWidthB: 32 + LocalReadVectorWidthMXSA: 1 + LocalReadVectorWidthMXSB: 1 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LocalWriteUseSgprMXSA: true + LocalWriteUseSgprMXSB: true + LoopIters: 2 + LoopUnroll: 256 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + 
MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMXSA: 1 + MIInputPerThreadMXSB: 1 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [6, 8] + MIWaveTileA: 6 + MIWaveTileB: 8 + MIWaveTileMXSA: 6 + MIWaveTileMXSB: 8 + MIWaveTileMetadata: 0 + MacroTile0: 192 + MacroTile1: 256 + MacroTileA: 192 + MacroTileB: 256 + MacroTileMXSA: 192 + MacroTileMXSB: 256 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + MinGRIncPerMfma: 3 + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonDTLTailLoopMXSA: true + NonDTLTailLoopMXSB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 4 + NonTemporalE: 0 + NonTemporalMXSA: 0 + NonTemporalMXSB: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 192 + NumGlobalWriteVectorsPerThread: 192 + NumLdsBlk: 2 + NumLoadsA: 6 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsCoalescedMXSA: 1 + NumLoadsCoalescedMXSB: 1 + NumLoadsMXSA: 1 + NumLoadsMXSB: 2 + NumLoadsPerpendicularA: 6 + NumLoadsPerpendicularB: 8 + NumLoadsPerpendicularMXSA: 1 + NumLoadsPerpendicularMXSB: 2 + NumThreads: 256 + NumTotalPackedLoadsA: -1 + NumTotalPackedLoadsB: -1 + NumTotalPackedLoadsMXSA: -1 + NumTotalPackedLoadsMXSB: -1 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGROverBarrier: 0 + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 14 + 
SolutionNameMin: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT192x256x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT6_8_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM16_WGMXCC2_WGMXCCGn1 + SourceSwap: false + SpaceFillingAlgo: [] + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 4 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SubGroupMXSA: 8 + SubGroupMXSB: 32 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: 0 + TailloopInNll: false + ThreadTile: [1, 1] + ThreadTile0: 24 + ThreadTile1: 8 + ThreadTileA: 24 + ThreadTileB: 8 + ThreadTileMXSA: 24 + ThreadTileMXSB: 8 + TransposeLDS: 1 + TransposeLDSMetadata: -1 + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMXSA: true + UnrollMajorLDSMXSB: true + Use64bShadowLimit: false + Use64bShadowLimitMX: false + UseCustomMainLoopSchedule: 0 + UseDirect32XEmulation: false + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: false + UseGeneralizedNLCOneA: false + UseGeneralizedNLCOneB: false + UseGeneralizedNLCOneMXSA: false + UseGeneralizedNLCOneMXSB: false + UseGeneralizedNLCOneMetadata: false + UseInstOffsetForGRO: 0 + UseMFMAF32XEmulation: false + UsePLRPack: 0 + UseSgprForGRO: -1 + UseSubtileImpl: true + Valid: true + VectorStore: -1 + VectorWidthA: 1 + 
VectorWidthB: 1 + VectorWidthMXSA: 1 + VectorWidthMXSB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMXSA: 0 + WaveSeparateGlobalReadMXSB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 16 + WorkGroupMappingXCC: 2 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 256 + _DepthUA: 256 + _DepthUB: 256 + _DepthUMXSA: 8 + _DepthUMXSB: 8 + _DepthUMetadata: 256 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + enableLDSTrMXSA: false + enableLDSTrMXSB: false + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + tailLoopOptMXSA: false + tailLoopOptMXSB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AdaptiveGemm: 0 + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1DivByMT1LowbitGT1: 0 + AssertFree1ElementMultiple: 1 + AssertKRingShiftTailWrapOnly: 0 + AssertSummationElementMultiple: 256 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BAddrInterleave: false + BaseName: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT25H-6yQcCGxSKwn5gmGJAHA93oK-ekIRJm5JqDOya5ASk= + BufferLoad: true + BufferStore: 1 + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 256 + DirectToLds: 1 + DirectToLdsA: true + DirectToLdsB: true + DirectToLdsMXSA: true + DirectToLdsMXSB: true + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprMXSA: false + DirectToVgprMXSB: false + DirectToVgprSparseMetadata: false + 
DtlPlusLdsBuf: 0 + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: false + ExpertSchedulingMode: 0 + ExtraLatencyForLR: 0 + ExtraMiLatencyLeft: -1 + ForceDisableShadowInit: false + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 32 + GlobalReadVectorWidthB: 32 + GlobalReadVectorWidthMXSA: 4 + GlobalReadVectorWidthMXSB: 6 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + KRingShift: false + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT256x192x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT8_6_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 256 + LSCB: 256 + LSCMXSA: 8 + LSCMXSB: 8 + LSPA: 32 + LSPB: 32 + LSPMXSA: 128 + LSPMXSB: 192 + LVCA: 8 + LVCB: 8 + LVCMXSA: 2 + LVCMXSB: 2 + LVPA: 1 + LVPB: 1 + LVPMXSA: 32 + LVPMXSB: 32 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsBlockSizePerPadMXSA: 0 + LdsBlockSizePerPadMXSB: 0 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 126464 + LdsInitCVgprs: false + LdsNumBytes: 126464 + 
LdsNumElementsAlignedA: 32768 + LdsNumElementsAlignedB: 24576 + LdsNumElementsAlignedMXSA: 2048 + LdsNumElementsAlignedMXSB: 1536 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 36352 + LdsOffsetB_Blk: 101888 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMXSA: 32768 + LdsOffsetMXSA_Blk: 98304 + LdsOffsetMXSB: 34816 + LdsOffsetMXSB_Blk: 100352 + LdsOffsetMetadata: 36352 + LdsOffsetMetadata_Blk: 101888 + LdsPadA: 0 + LdsPadB: 0 + LdsPadMXSA: 0 + LdsPadMXSB: 0 + LdsPadMetadata: 0 + LocalReadVectorWidth: -1 + LocalReadVectorWidthA: 32 + LocalReadVectorWidthB: 32 + LocalReadVectorWidthMXSA: 1 + LocalReadVectorWidthMXSB: 1 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LocalWriteUseSgprMXSA: true + LocalWriteUseSgprMXSB: true + LoopIters: 2 + LoopUnroll: 256 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMXSA: 1 + MIInputPerThreadMXSB: 1 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [8, 6] + MIWaveTileA: 8 + MIWaveTileB: 6 + MIWaveTileMXSA: 8 + MIWaveTileMXSB: 6 + MIWaveTileMetadata: 0 + MacroTile0: 256 + MacroTile1: 192 + MacroTileA: 256 + MacroTileB: 192 + MacroTileMXSA: 256 + MacroTileMXSB: 192 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + MinGRIncPerMfma: 3 + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonDTLTailLoopMXSA: true + NonDTLTailLoopMXSB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 4 + 
NonTemporalE: 0 + NonTemporalMXSA: 0 + NonTemporalMXSB: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 192 + NumGlobalWriteVectorsPerThread: 192 + NumLdsBlk: 2 + NumLoadsA: 8 + NumLoadsB: 6 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsCoalescedMXSA: 1 + NumLoadsCoalescedMXSB: 1 + NumLoadsMXSA: 2 + NumLoadsMXSB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 6 + NumLoadsPerpendicularMXSA: 2 + NumLoadsPerpendicularMXSB: 1 + NumThreads: 256 + NumTotalPackedLoadsA: -1 + NumTotalPackedLoadsB: -1 + NumTotalPackedLoadsMXSA: -1 + NumTotalPackedLoadsMXSB: -1 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGROverBarrier: 0 + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 15 + SolutionNameMin: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT256x192x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT8_6_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM16_WGMXCC2_WGMXCCGn1 + SourceSwap: false + SpaceFillingAlgo: [] + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 4 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 
32 + SubGroupMXSA: 8 + SubGroupMXSB: 32 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: 0 + TailloopInNll: false + ThreadTile: [1, 1] + ThreadTile0: 32 + ThreadTile1: 6 + ThreadTileA: 32 + ThreadTileB: 6 + ThreadTileMXSA: 32 + ThreadTileMXSB: 6 + TransposeLDS: 1 + TransposeLDSMetadata: -1 + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMXSA: true + UnrollMajorLDSMXSB: true + Use64bShadowLimit: false + Use64bShadowLimitMX: false + UseCustomMainLoopSchedule: 0 + UseDirect32XEmulation: false + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: false + UseGeneralizedNLCOneA: false + UseGeneralizedNLCOneB: false + UseGeneralizedNLCOneMXSA: false + UseGeneralizedNLCOneMXSB: false + UseGeneralizedNLCOneMetadata: false + UseInstOffsetForGRO: 0 + UseMFMAF32XEmulation: false + UsePLRPack: 0 + UseSgprForGRO: -1 + UseSubtileImpl: true + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 1 + VectorWidthMXSA: 1 + VectorWidthMXSB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMXSA: 0 + WaveSeparateGlobalReadMXSB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 16 + WorkGroupMappingXCC: 2 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 256 + _DepthUA: 256 + _DepthUB: 256 + _DepthUMXSA: 8 + _DepthUMXSB: 8 + _DepthUMetadata: 256 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + enableLDSTrMXSA: false + enableLDSTrMXSB: false + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + tailLoopOptMXSA: false + tailLoopOptMXSB: false + - 1LDSBuffer: 0 + 
ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AdaptiveGemm: 0 + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1DivByMT1LowbitGT1: 0 + AssertFree1ElementMultiple: 1 + AssertKRingShiftTailWrapOnly: 0 + AssertSummationElementMultiple: 256 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BAddrInterleave: false + BaseName: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT649Xx58paO0aNamHRyhpgRWxjDG0kBNJ5sAnNiMy2USyI= + BufferLoad: true + BufferStore: 1 + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 256 + DirectToLds: 1 + DirectToLdsA: true + DirectToLdsB: true + DirectToLdsMXSA: true + DirectToLdsMXSB: true + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprMXSA: false + DirectToVgprMXSB: false + DirectToVgprSparseMetadata: false + DtlPlusLdsBuf: 0 + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: false + ExpertSchedulingMode: 0 + ExtraLatencyForLR: 0 + ExtraMiLatencyLeft: -1 + ForceDisableShadowInit: false + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 32 + GlobalReadVectorWidthB: 32 + GlobalReadVectorWidthMXSA: 2 + GlobalReadVectorWidthMXSB: 10 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + KRingShift: false + Kernel: true + KernelLanguage: Assembly + KernelNameMin: 
Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT64x320x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT2_10_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 256 + LSCB: 256 + LSCMXSA: 8 + LSCMXSB: 8 + LSPA: 32 + LSPB: 32 + LSPMXSA: 64 + LSPMXSB: 320 + LVCA: 8 + LVCB: 8 + LVCMXSA: 4 + LVCMXSB: 1 + LVPA: 1 + LVPB: 1 + LVPMXSA: 32 + LVPMXSB: 32 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsBlockSizePerPadMXSA: 0 + LdsBlockSizePerPadMXSB: 0 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 117760 + LdsInitCVgprs: false + LdsNumBytes: 117760 + LdsNumElementsAlignedA: 8192 + LdsNumElementsAlignedB: 40960 + LdsNumElementsAlignedMXSA: 512 + LdsNumElementsAlignedMXSB: 2560 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 11264 + LdsOffsetB_Blk: 76800 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMXSA: 8192 + LdsOffsetMXSA_Blk: 73728 + LdsOffsetMXSB: 8704 + LdsOffsetMXSB_Blk: 74240 + LdsOffsetMetadata: 11264 + LdsOffsetMetadata_Blk: 76800 + LdsPadA: 0 + LdsPadB: 0 + LdsPadMXSA: 0 + LdsPadMXSB: 0 + LdsPadMetadata: 0 + LocalReadVectorWidth: -1 + LocalReadVectorWidthA: 32 + LocalReadVectorWidthB: 32 + LocalReadVectorWidthMXSA: 1 + LocalReadVectorWidthMXSB: 1 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LocalWriteUseSgprMXSA: true + LocalWriteUseSgprMXSB: true + LoopIters: 2 + LoopUnroll: 256 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + 
MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMXSA: 1 + MIInputPerThreadMXSB: 1 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [2, 10] + MIWaveTileA: 2 + MIWaveTileB: 10 + MIWaveTileMXSA: 2 + MIWaveTileMXSB: 10 + MIWaveTileMetadata: 0 + MacroTile0: 64 + MacroTile1: 320 + MacroTileA: 64 + MacroTileB: 320 + MacroTileMXSA: 64 + MacroTileMXSB: 320 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + MinGRIncPerMfma: 3 + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonDTLTailLoopMXSA: true + NonDTLTailLoopMXSB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 4 + NonTemporalE: 0 + NonTemporalMXSA: 0 + NonTemporalMXSB: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 80 + NumGlobalWriteVectorsPerThread: 80 + NumLdsBlk: 2 + NumLoadsA: 2 + NumLoadsB: 10 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsCoalescedMXSA: 1 + NumLoadsCoalescedMXSB: 1 + NumLoadsMXSA: 1 + NumLoadsMXSB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 10 + NumLoadsPerpendicularMXSA: 1 + NumLoadsPerpendicularMXSB: 1 + NumThreads: 256 + NumTotalPackedLoadsA: -1 + NumTotalPackedLoadsB: -1 + NumTotalPackedLoadsMXSA: -1 + NumTotalPackedLoadsMXSB: -1 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGROverBarrier: 0 + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 16 + 
SolutionNameMin: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT64x320x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT2_10_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM16_WGMXCC2_WGMXCCGn1 + SourceSwap: false + SpaceFillingAlgo: [] + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 4 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SubGroupMXSA: 8 + SubGroupMXSB: 32 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: 0 + TailloopInNll: false + ThreadTile: [1, 1] + ThreadTile0: 8 + ThreadTile1: 10 + ThreadTileA: 8 + ThreadTileB: 10 + ThreadTileMXSA: 8 + ThreadTileMXSB: 10 + TransposeLDS: 1 + TransposeLDSMetadata: -1 + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMXSA: true + UnrollMajorLDSMXSB: true + Use64bShadowLimit: false + Use64bShadowLimitMX: false + UseCustomMainLoopSchedule: 0 + UseDirect32XEmulation: false + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: false + UseGeneralizedNLCOneA: false + UseGeneralizedNLCOneB: false + UseGeneralizedNLCOneMXSA: false + UseGeneralizedNLCOneMXSB: false + UseGeneralizedNLCOneMetadata: false + UseInstOffsetForGRO: 0 + UseMFMAF32XEmulation: false + UsePLRPack: 0 + UseSgprForGRO: -1 + UseSubtileImpl: true + Valid: true + VectorStore: -1 + VectorWidthA: 1 + 
VectorWidthB: 1 + VectorWidthMXSA: 1 + VectorWidthMXSB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMXSA: 0 + WaveSeparateGlobalReadMXSB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 16 + WorkGroupMappingXCC: 2 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 256 + _DepthUA: 256 + _DepthUB: 256 + _DepthUMXSA: 8 + _DepthUMXSB: 8 + _DepthUMetadata: 256 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + enableLDSTrMXSA: false + enableLDSTrMXSB: false + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + tailLoopOptMXSA: false + tailLoopOptMXSB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AdaptiveGemm: 0 + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1DivByMT1LowbitGT1: 0 + AssertFree1ElementMultiple: 1 + AssertKRingShiftTailWrapOnly: 0 + AssertSummationElementMultiple: 256 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BAddrInterleave: false + BaseName: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT32yn2lSZhgGG39-O0tN9XB-c6Nr5kcbdLKPNrKxqSEhVE= + BufferLoad: true + BufferStore: 1 + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 256 + DirectToLds: 1 + DirectToLdsA: true + DirectToLdsB: true + DirectToLdsMXSA: true + DirectToLdsMXSB: true + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprMXSA: false + DirectToVgprMXSB: false + DirectToVgprSparseMetadata: false + 
DtlPlusLdsBuf: 0 + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: false + ExpertSchedulingMode: 0 + ExtraLatencyForLR: 0 + ExtraMiLatencyLeft: -1 + ForceDisableShadowInit: false + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 32 + GlobalReadVectorWidthB: 32 + GlobalReadVectorWidthMXSA: 10 + GlobalReadVectorWidthMXSB: 2 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + KRingShift: false + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT320x64x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT10_2_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 256 + LSCB: 256 + LSCMXSA: 8 + LSCMXSB: 8 + LSPA: 32 + LSPB: 32 + LSPMXSA: 320 + LSPMXSB: 64 + LVCA: 8 + LVCB: 8 + LVCMXSA: 1 + LVCMXSB: 4 + LVPA: 1 + LVPB: 1 + LVPMXSA: 32 + LVPMXSB: 32 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsBlockSizePerPadMXSA: 0 + LdsBlockSizePerPadMXSB: 0 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 117760 + LdsInitCVgprs: false + LdsNumBytes: 117760 + 
LdsNumElementsAlignedA: 40960 + LdsNumElementsAlignedB: 8192 + LdsNumElementsAlignedMXSA: 2560 + LdsNumElementsAlignedMXSB: 512 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 44032 + LdsOffsetB_Blk: 109568 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMXSA: 40960 + LdsOffsetMXSA_Blk: 106496 + LdsOffsetMXSB: 43520 + LdsOffsetMXSB_Blk: 109056 + LdsOffsetMetadata: 44032 + LdsOffsetMetadata_Blk: 109568 + LdsPadA: 0 + LdsPadB: 0 + LdsPadMXSA: 0 + LdsPadMXSB: 0 + LdsPadMetadata: 0 + LocalReadVectorWidth: -1 + LocalReadVectorWidthA: 32 + LocalReadVectorWidthB: 32 + LocalReadVectorWidthMXSA: 1 + LocalReadVectorWidthMXSB: 1 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LocalWriteUseSgprMXSA: true + LocalWriteUseSgprMXSB: true + LoopIters: 2 + LoopUnroll: 256 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMXSA: 1 + MIInputPerThreadMXSB: 1 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [10, 2] + MIWaveTileA: 10 + MIWaveTileB: 2 + MIWaveTileMXSA: 10 + MIWaveTileMXSB: 2 + MIWaveTileMetadata: 0 + MacroTile0: 320 + MacroTile1: 64 + MacroTileA: 320 + MacroTileB: 64 + MacroTileMXSA: 320 + MacroTileMXSB: 64 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + MinGRIncPerMfma: 3 + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonDTLTailLoopMXSA: true + NonDTLTailLoopMXSB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 4 + 
NonTemporalE: 0 + NonTemporalMXSA: 0 + NonTemporalMXSB: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 80 + NumGlobalWriteVectorsPerThread: 80 + NumLdsBlk: 2 + NumLoadsA: 10 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsCoalescedMXSA: 1 + NumLoadsCoalescedMXSB: 1 + NumLoadsMXSA: 1 + NumLoadsMXSB: 1 + NumLoadsPerpendicularA: 10 + NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularMXSA: 1 + NumLoadsPerpendicularMXSB: 1 + NumThreads: 256 + NumTotalPackedLoadsA: -1 + NumTotalPackedLoadsB: -1 + NumTotalPackedLoadsMXSA: -1 + NumTotalPackedLoadsMXSB: -1 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGROverBarrier: 0 + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 17 + SolutionNameMin: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT320x64x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT10_2_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM16_WGMXCC2_WGMXCCGn1 + SourceSwap: false + SpaceFillingAlgo: [] + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 4 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 
32 + SubGroupMXSA: 8 + SubGroupMXSB: 32 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: 0 + TailloopInNll: false + ThreadTile: [1, 1] + ThreadTile0: 40 + ThreadTile1: 2 + ThreadTileA: 40 + ThreadTileB: 2 + ThreadTileMXSA: 40 + ThreadTileMXSB: 2 + TransposeLDS: 1 + TransposeLDSMetadata: -1 + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMXSA: true + UnrollMajorLDSMXSB: true + Use64bShadowLimit: false + Use64bShadowLimitMX: false + UseCustomMainLoopSchedule: 0 + UseDirect32XEmulation: false + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: false + UseGeneralizedNLCOneA: false + UseGeneralizedNLCOneB: false + UseGeneralizedNLCOneMXSA: false + UseGeneralizedNLCOneMXSB: false + UseGeneralizedNLCOneMetadata: false + UseInstOffsetForGRO: 0 + UseMFMAF32XEmulation: false + UsePLRPack: 0 + UseSgprForGRO: -1 + UseSubtileImpl: true + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 1 + VectorWidthMXSA: 1 + VectorWidthMXSB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMXSA: 0 + WaveSeparateGlobalReadMXSB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 16 + WorkGroupMappingXCC: 2 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 256 + _DepthUA: 256 + _DepthUB: 256 + _DepthUMXSA: 8 + _DepthUMXSB: 8 + _DepthUMetadata: 256 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + enableLDSTrMXSA: false + enableLDSTrMXSB: false + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + tailLoopOptMXSA: false + tailLoopOptMXSB: false + - 1LDSBuffer: 0 + 
ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AdaptiveGemm: 0 + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1DivByMT1LowbitGT1: 0 + AssertFree1ElementMultiple: 1 + AssertKRingShiftTailWrapOnly: 0 + AssertSummationElementMultiple: 256 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BAddrInterleave: false + BaseName: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT64cx8jpYzIAVvV24Gqg4hoX8QdixxbIDmUlvaleHLuajs= + BufferLoad: true + BufferStore: 1 + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 256 + DirectToLds: 1 + DirectToLdsA: true + DirectToLdsB: true + DirectToLdsMXSA: true + DirectToLdsMXSB: true + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprMXSA: false + DirectToVgprMXSB: false + DirectToVgprSparseMetadata: false + DtlPlusLdsBuf: 0 + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: false + ExpertSchedulingMode: 0 + ExtraLatencyForLR: 0 + ExtraMiLatencyLeft: -1 + ForceDisableShadowInit: false + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 32 + GlobalReadVectorWidthB: 32 + GlobalReadVectorWidthMXSA: 2 + GlobalReadVectorWidthMXSB: 12 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + KRingShift: false + Kernel: true + KernelLanguage: Assembly + KernelNameMin: 
Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT64x384x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT2_12_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 256 + LSCB: 256 + LSCMXSA: 8 + LSCMXSB: 8 + LSPA: 32 + LSPB: 32 + LSPMXSA: 64 + LSPMXSB: 384 + LVCA: 8 + LVCB: 8 + LVCMXSA: 4 + LVCMXSB: 1 + LVPA: 1 + LVPB: 1 + LVPMXSA: 32 + LVPMXSB: 32 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsBlockSizePerPadMXSA: 0 + LdsBlockSizePerPadMXSB: 0 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 126464 + LdsInitCVgprs: false + LdsNumBytes: 126464 + LdsNumElementsAlignedA: 8192 + LdsNumElementsAlignedB: 49152 + LdsNumElementsAlignedMXSA: 512 + LdsNumElementsAlignedMXSB: 3072 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 11776 + LdsOffsetB_Blk: 77312 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMXSA: 8192 + LdsOffsetMXSA_Blk: 73728 + LdsOffsetMXSB: 8704 + LdsOffsetMXSB_Blk: 74240 + LdsOffsetMetadata: 11776 + LdsOffsetMetadata_Blk: 77312 + LdsPadA: 0 + LdsPadB: 0 + LdsPadMXSA: 0 + LdsPadMXSB: 0 + LdsPadMetadata: 0 + LocalReadVectorWidth: -1 + LocalReadVectorWidthA: 32 + LocalReadVectorWidthB: 32 + LocalReadVectorWidthMXSA: 1 + LocalReadVectorWidthMXSB: 1 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LocalWriteUseSgprMXSA: true + LocalWriteUseSgprMXSB: true + LoopIters: 2 + LoopUnroll: 256 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + 
MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMXSA: 1 + MIInputPerThreadMXSB: 1 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [2, 12] + MIWaveTileA: 2 + MIWaveTileB: 12 + MIWaveTileMXSA: 2 + MIWaveTileMXSB: 12 + MIWaveTileMetadata: 0 + MacroTile0: 64 + MacroTile1: 384 + MacroTileA: 64 + MacroTileB: 384 + MacroTileMXSA: 64 + MacroTileMXSB: 384 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + MinGRIncPerMfma: 3 + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonDTLTailLoopMXSA: true + NonDTLTailLoopMXSB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 4 + NonTemporalE: 0 + NonTemporalMXSA: 0 + NonTemporalMXSB: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 96 + NumGlobalWriteVectorsPerThread: 96 + NumLdsBlk: 2 + NumLoadsA: 2 + NumLoadsB: 12 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsCoalescedMXSA: 1 + NumLoadsCoalescedMXSB: 1 + NumLoadsMXSA: 1 + NumLoadsMXSB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 12 + NumLoadsPerpendicularMXSA: 1 + NumLoadsPerpendicularMXSB: 1 + NumThreads: 256 + NumTotalPackedLoadsA: -1 + NumTotalPackedLoadsB: -1 + NumTotalPackedLoadsMXSA: -1 + NumTotalPackedLoadsMXSB: -1 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGROverBarrier: 0 + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 18 + 
SolutionNameMin: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT64x384x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT2_12_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM16_WGMXCC2_WGMXCCGn1 + SourceSwap: false + SpaceFillingAlgo: [] + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 4 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SubGroupMXSA: 8 + SubGroupMXSB: 32 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: 0 + TailloopInNll: false + ThreadTile: [1, 1] + ThreadTile0: 8 + ThreadTile1: 12 + ThreadTileA: 8 + ThreadTileB: 12 + ThreadTileMXSA: 8 + ThreadTileMXSB: 12 + TransposeLDS: 1 + TransposeLDSMetadata: -1 + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMXSA: true + UnrollMajorLDSMXSB: true + Use64bShadowLimit: false + Use64bShadowLimitMX: false + UseCustomMainLoopSchedule: 0 + UseDirect32XEmulation: false + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: false + UseGeneralizedNLCOneA: false + UseGeneralizedNLCOneB: false + UseGeneralizedNLCOneMXSA: false + UseGeneralizedNLCOneMXSB: false + UseGeneralizedNLCOneMetadata: false + UseInstOffsetForGRO: 0 + UseMFMAF32XEmulation: false + UsePLRPack: 0 + UseSgprForGRO: -1 + UseSubtileImpl: true + Valid: true + VectorStore: -1 + VectorWidthA: 1 + 
VectorWidthB: 1 + VectorWidthMXSA: 1 + VectorWidthMXSB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMXSA: 0 + WaveSeparateGlobalReadMXSB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 16 + WorkGroupMappingXCC: 2 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 256 + _DepthUA: 256 + _DepthUB: 256 + _DepthUMXSA: 8 + _DepthUMXSB: 8 + _DepthUMetadata: 256 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + enableLDSTrMXSA: false + enableLDSTrMXSB: false + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + tailLoopOptMXSA: false + tailLoopOptMXSB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AdaptiveGemm: 0 + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1DivByMT1LowbitGT1: 0 + AssertFree1ElementMultiple: 1 + AssertKRingShiftTailWrapOnly: 0 + AssertSummationElementMultiple: 256 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BAddrInterleave: false + BaseName: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT38m0Bm4hH2y_fb6-eo4XZRKb5dkkBhjv6q0Rd-CAzTq-Q= + BufferLoad: true + BufferStore: 1 + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 256 + DirectToLds: 1 + DirectToLdsA: true + DirectToLdsB: true + DirectToLdsMXSA: true + DirectToLdsMXSB: true + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprMXSA: false + DirectToVgprMXSB: false + DirectToVgprSparseMetadata: false + 
DtlPlusLdsBuf: 0 + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: false + ExpertSchedulingMode: 0 + ExtraLatencyForLR: 0 + ExtraMiLatencyLeft: -1 + ForceDisableShadowInit: false + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 32 + GlobalReadVectorWidthB: 32 + GlobalReadVectorWidthMXSA: 12 + GlobalReadVectorWidthMXSB: 2 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + KRingShift: false + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT384x64x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT12_2_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 256 + LSCB: 256 + LSCMXSA: 8 + LSCMXSB: 8 + LSPA: 32 + LSPB: 32 + LSPMXSA: 384 + LSPMXSB: 64 + LVCA: 8 + LVCB: 8 + LVCMXSA: 1 + LVCMXSB: 4 + LVPA: 1 + LVPB: 1 + LVPMXSA: 32 + LVPMXSB: 32 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsBlockSizePerPadMXSA: 0 + LdsBlockSizePerPadMXSB: 0 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 126464 + LdsInitCVgprs: false + LdsNumBytes: 126464 + 
LdsNumElementsAlignedA: 49152 + LdsNumElementsAlignedB: 8192 + LdsNumElementsAlignedMXSA: 3072 + LdsNumElementsAlignedMXSB: 512 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 52736 + LdsOffsetB_Blk: 118272 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMXSA: 49152 + LdsOffsetMXSA_Blk: 114688 + LdsOffsetMXSB: 52224 + LdsOffsetMXSB_Blk: 117760 + LdsOffsetMetadata: 52736 + LdsOffsetMetadata_Blk: 118272 + LdsPadA: 0 + LdsPadB: 0 + LdsPadMXSA: 0 + LdsPadMXSB: 0 + LdsPadMetadata: 0 + LocalReadVectorWidth: -1 + LocalReadVectorWidthA: 32 + LocalReadVectorWidthB: 32 + LocalReadVectorWidthMXSA: 1 + LocalReadVectorWidthMXSB: 1 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LocalWriteUseSgprMXSA: true + LocalWriteUseSgprMXSB: true + LoopIters: 2 + LoopUnroll: 256 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMXSA: 1 + MIInputPerThreadMXSB: 1 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [12, 2] + MIWaveTileA: 12 + MIWaveTileB: 2 + MIWaveTileMXSA: 12 + MIWaveTileMXSB: 2 + MIWaveTileMetadata: 0 + MacroTile0: 384 + MacroTile1: 64 + MacroTileA: 384 + MacroTileB: 64 + MacroTileMXSA: 384 + MacroTileMXSB: 64 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + MinGRIncPerMfma: 3 + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonDTLTailLoopMXSA: true + NonDTLTailLoopMXSB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 4 + 
NonTemporalE: 0 + NonTemporalMXSA: 0 + NonTemporalMXSB: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 96 + NumGlobalWriteVectorsPerThread: 96 + NumLdsBlk: 2 + NumLoadsA: 12 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsCoalescedMXSA: 1 + NumLoadsCoalescedMXSB: 1 + NumLoadsMXSA: 1 + NumLoadsMXSB: 1 + NumLoadsPerpendicularA: 12 + NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularMXSA: 1 + NumLoadsPerpendicularMXSB: 1 + NumThreads: 256 + NumTotalPackedLoadsA: -1 + NumTotalPackedLoadsB: -1 + NumTotalPackedLoadsMXSA: -1 + NumTotalPackedLoadsMXSB: -1 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGROverBarrier: 0 + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 19 + SolutionNameMin: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT384x64x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT12_2_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM16_WGMXCC2_WGMXCCGn1 + SourceSwap: false + SpaceFillingAlgo: [] + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 4 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 
32 + SubGroupMXSA: 8 + SubGroupMXSB: 32 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: 0 + TailloopInNll: false + ThreadTile: [1, 1] + ThreadTile0: 48 + ThreadTile1: 2 + ThreadTileA: 48 + ThreadTileB: 2 + ThreadTileMXSA: 48 + ThreadTileMXSB: 2 + TransposeLDS: 1 + TransposeLDSMetadata: -1 + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMXSA: true + UnrollMajorLDSMXSB: true + Use64bShadowLimit: false + Use64bShadowLimitMX: false + UseCustomMainLoopSchedule: 0 + UseDirect32XEmulation: false + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: false + UseGeneralizedNLCOneA: false + UseGeneralizedNLCOneB: false + UseGeneralizedNLCOneMXSA: false + UseGeneralizedNLCOneMXSB: false + UseGeneralizedNLCOneMetadata: false + UseInstOffsetForGRO: 0 + UseMFMAF32XEmulation: false + UsePLRPack: 0 + UseSgprForGRO: -1 + UseSubtileImpl: true + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 1 + VectorWidthMXSA: 1 + VectorWidthMXSB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMXSA: 0 + WaveSeparateGlobalReadMXSB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 16 + WorkGroupMappingXCC: 2 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 256 + _DepthUA: 256 + _DepthUB: 256 + _DepthUMXSA: 8 + _DepthUMXSB: 8 + _DepthUMetadata: 256 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + enableLDSTrMXSA: false + enableLDSTrMXSB: false + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + tailLoopOptMXSA: false + tailLoopOptMXSB: false + - 1LDSBuffer: 0 + 
ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AdaptiveGemm: 0 + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1DivByMT1LowbitGT1: 0 + AssertFree1ElementMultiple: 1 + AssertKRingShiftTailWrapOnly: 0 + AssertSummationElementMultiple: 256 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BAddrInterleave: false + BaseName: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT64ghBzLJyXgx9oTwgzYUUty2EUDAM9Q1qjBKNRjSDH1Qg= + BufferLoad: true + BufferStore: 1 + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 256 + DirectToLds: 1 + DirectToLdsA: true + DirectToLdsB: true + DirectToLdsMXSA: true + DirectToLdsMXSB: true + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprMXSA: false + DirectToVgprMXSB: false + DirectToVgprSparseMetadata: false + DtlPlusLdsBuf: 0 + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: false + ExpertSchedulingMode: 0 + ExtraLatencyForLR: 0 + ExtraMiLatencyLeft: -1 + ForceDisableShadowInit: false + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 32 + GlobalReadVectorWidthB: 32 + GlobalReadVectorWidthMXSA: 2 + GlobalReadVectorWidthMXSB: 16 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + KRingShift: false + Kernel: true + KernelLanguage: Assembly + KernelNameMin: 
Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT64x512x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT2_16_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 256 + LSCB: 256 + LSCMXSA: 8 + LSCMXSB: 8 + LSPA: 32 + LSPB: 32 + LSPMXSA: 64 + LSPMXSB: 512 + LVCA: 8 + LVCB: 8 + LVCMXSA: 4 + LVCMXSB: 1 + LVPA: 1 + LVPB: 1 + LVPMXSA: 32 + LVPMXSB: 32 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsBlockSizePerPadMXSA: 0 + LdsBlockSizePerPadMXSB: 0 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 156672 + LdsInitCVgprs: false + LdsNumBytes: 156672 + LdsNumElementsAlignedA: 8192 + LdsNumElementsAlignedB: 65536 + LdsNumElementsAlignedMXSA: 512 + LdsNumElementsAlignedMXSB: 4096 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 78336 + LdsOffsetB: 12800 + LdsOffsetB_Blk: 91136 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMXSA: 8192 + LdsOffsetMXSA_Blk: 86528 + LdsOffsetMXSB: 8704 + LdsOffsetMXSB_Blk: 87040 + LdsOffsetMetadata: 12800 + LdsOffsetMetadata_Blk: 91136 + LdsPadA: 0 + LdsPadB: 0 + LdsPadMXSA: 0 + LdsPadMXSB: 0 + LdsPadMetadata: 0 + LocalReadVectorWidth: -1 + LocalReadVectorWidthA: 32 + LocalReadVectorWidthB: 32 + LocalReadVectorWidthMXSA: 1 + LocalReadVectorWidthMXSB: 1 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LocalWriteUseSgprMXSA: true + LocalWriteUseSgprMXSB: true + LoopIters: 2 + LoopUnroll: 256 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + 
MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMXSA: 1 + MIInputPerThreadMXSB: 1 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [2, 16] + MIWaveTileA: 2 + MIWaveTileB: 16 + MIWaveTileMXSA: 2 + MIWaveTileMXSB: 16 + MIWaveTileMetadata: 0 + MacroTile0: 64 + MacroTile1: 512 + MacroTileA: 64 + MacroTileB: 512 + MacroTileMXSA: 64 + MacroTileMXSB: 512 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + MinGRIncPerMfma: 3 + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonDTLTailLoopMXSA: true + NonDTLTailLoopMXSB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 4 + NonTemporalE: 0 + NonTemporalMXSA: 0 + NonTemporalMXSB: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 128 + NumLdsBlk: 2 + NumLoadsA: 2 + NumLoadsB: 16 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsCoalescedMXSA: 1 + NumLoadsCoalescedMXSB: 1 + NumLoadsMXSA: 1 + NumLoadsMXSB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 16 + NumLoadsPerpendicularMXSA: 1 + NumLoadsPerpendicularMXSB: 1 + NumThreads: 256 + NumTotalPackedLoadsA: -1 + NumTotalPackedLoadsB: -1 + NumTotalPackedLoadsMXSA: -1 + NumTotalPackedLoadsMXSB: -1 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGROverBarrier: 0 + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 20 + 
SolutionNameMin: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT64x512x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT2_16_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM16_WGMXCC2_WGMXCCGn1 + SourceSwap: false + SpaceFillingAlgo: [] + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: true + StoreSyncOpt: 0 + StoreVectorWidth: 4 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SubGroupMXSA: 8 + SubGroupMXSB: 32 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: 0 + TailloopInNll: false + ThreadTile: [1, 1] + ThreadTile0: 8 + ThreadTile1: 16 + ThreadTileA: 8 + ThreadTileB: 16 + ThreadTileMXSA: 8 + ThreadTileMXSB: 16 + TransposeLDS: 1 + TransposeLDSMetadata: -1 + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMXSA: true + UnrollMajorLDSMXSB: true + Use64bShadowLimit: false + Use64bShadowLimitMX: false + UseCustomMainLoopSchedule: 0 + UseDirect32XEmulation: false + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: false + UseGeneralizedNLCOneA: false + UseGeneralizedNLCOneB: false + UseGeneralizedNLCOneMXSA: false + UseGeneralizedNLCOneMXSB: false + UseGeneralizedNLCOneMetadata: false + UseInstOffsetForGRO: 0 + UseMFMAF32XEmulation: false + UsePLRPack: 0 + UseSgprForGRO: -1 + UseSubtileImpl: true + Valid: true + VectorStore: -1 + VectorWidthA: 1 + 
VectorWidthB: 1 + VectorWidthMXSA: 1 + VectorWidthMXSB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMXSA: 0 + WaveSeparateGlobalReadMXSB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 16 + WorkGroupMappingXCC: 2 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 256 + _DepthUA: 256 + _DepthUB: 256 + _DepthUMXSA: 8 + _DepthUMXSB: 8 + _DepthUMetadata: 256 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + enableLDSTrMXSA: false + enableLDSTrMXSB: false + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + tailLoopOptMXSA: false + tailLoopOptMXSB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AdaptiveGemm: 0 + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1DivByMT1LowbitGT1: 0 + AssertFree1ElementMultiple: 1 + AssertKRingShiftTailWrapOnly: 0 + AssertSummationElementMultiple: 256 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BAddrInterleave: false + BaseName: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT51N7xP8nenZXqX1PoWvPtuxa4rTpP69DwZBgRtHN8axrQ= + BufferLoad: true + BufferStore: 1 + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 256 + DirectToLds: 1 + DirectToLdsA: true + DirectToLdsB: true + DirectToLdsMXSA: true + DirectToLdsMXSB: true + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprMXSA: false + DirectToVgprMXSB: false + DirectToVgprSparseMetadata: false + 
DtlPlusLdsBuf: 0 + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: false + ExpertSchedulingMode: 0 + ExtraLatencyForLR: 0 + ExtraMiLatencyLeft: -1 + ForceDisableShadowInit: false + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 32 + GlobalReadVectorWidthB: 32 + GlobalReadVectorWidthMXSA: 16 + GlobalReadVectorWidthMXSB: 2 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + KRingShift: false + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT512x64x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT16_2_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 256 + LSCB: 256 + LSCMXSA: 8 + LSCMXSB: 8 + LSPA: 32 + LSPB: 32 + LSPMXSA: 512 + LSPMXSB: 64 + LVCA: 8 + LVCB: 8 + LVCMXSA: 1 + LVCMXSB: 4 + LVPA: 1 + LVPB: 1 + LVPMXSA: 32 + LVPMXSB: 32 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsBlockSizePerPadMXSA: 0 + LdsBlockSizePerPadMXSB: 0 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 156672 + LdsInitCVgprs: false + LdsNumBytes: 156672 + 
LdsNumElementsAlignedA: 65536 + LdsNumElementsAlignedB: 8192 + LdsNumElementsAlignedMXSA: 4096 + LdsNumElementsAlignedMXSB: 512 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 78336 + LdsOffsetB: 70144 + LdsOffsetB_Blk: 148480 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMXSA: 65536 + LdsOffsetMXSA_Blk: 143872 + LdsOffsetMXSB: 69632 + LdsOffsetMXSB_Blk: 147968 + LdsOffsetMetadata: 70144 + LdsOffsetMetadata_Blk: 148480 + LdsPadA: 0 + LdsPadB: 0 + LdsPadMXSA: 0 + LdsPadMXSB: 0 + LdsPadMetadata: 0 + LocalReadVectorWidth: -1 + LocalReadVectorWidthA: 32 + LocalReadVectorWidthB: 32 + LocalReadVectorWidthMXSA: 1 + LocalReadVectorWidthMXSB: 1 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LocalWriteUseSgprMXSA: true + LocalWriteUseSgprMXSB: true + LoopIters: 2 + LoopUnroll: 256 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMXSA: 1 + MIInputPerThreadMXSB: 1 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [16, 2] + MIWaveTileA: 16 + MIWaveTileB: 2 + MIWaveTileMXSA: 16 + MIWaveTileMXSB: 2 + MIWaveTileMetadata: 0 + MacroTile0: 512 + MacroTile1: 64 + MacroTileA: 512 + MacroTileB: 64 + MacroTileMXSA: 512 + MacroTileMXSB: 64 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + MinGRIncPerMfma: 3 + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonDTLTailLoopMXSA: true + NonDTLTailLoopMXSB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 4 + 
NonTemporalE: 0 + NonTemporalMXSA: 0 + NonTemporalMXSB: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 128 + NumLdsBlk: 2 + NumLoadsA: 16 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsCoalescedMXSA: 1 + NumLoadsCoalescedMXSB: 1 + NumLoadsMXSA: 1 + NumLoadsMXSB: 1 + NumLoadsPerpendicularA: 16 + NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularMXSA: 1 + NumLoadsPerpendicularMXSB: 1 + NumThreads: 256 + NumTotalPackedLoadsA: -1 + NumTotalPackedLoadsB: -1 + NumTotalPackedLoadsMXSA: -1 + NumTotalPackedLoadsMXSB: -1 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGROverBarrier: 0 + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 21 + SolutionNameMin: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT512x64x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT16_2_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM16_WGMXCC2_WGMXCCGn1 + SourceSwap: false + SpaceFillingAlgo: [] + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: true + StoreSyncOpt: 0 + StoreVectorWidth: 4 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 
32 + SubGroupMXSA: 8 + SubGroupMXSB: 32 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: 0 + TailloopInNll: false + ThreadTile: [1, 1] + ThreadTile0: 64 + ThreadTile1: 2 + ThreadTileA: 64 + ThreadTileB: 2 + ThreadTileMXSA: 64 + ThreadTileMXSB: 2 + TransposeLDS: 1 + TransposeLDSMetadata: -1 + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMXSA: true + UnrollMajorLDSMXSB: true + Use64bShadowLimit: false + Use64bShadowLimitMX: false + UseCustomMainLoopSchedule: 0 + UseDirect32XEmulation: false + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: false + UseGeneralizedNLCOneA: false + UseGeneralizedNLCOneB: false + UseGeneralizedNLCOneMXSA: false + UseGeneralizedNLCOneMXSB: false + UseGeneralizedNLCOneMetadata: false + UseInstOffsetForGRO: 0 + UseMFMAF32XEmulation: false + UsePLRPack: 0 + UseSgprForGRO: -1 + UseSubtileImpl: true + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 1 + VectorWidthMXSA: 1 + VectorWidthMXSB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMXSA: 0 + WaveSeparateGlobalReadMXSB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 16 + WorkGroupMappingXCC: 2 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 256 + _DepthUA: 256 + _DepthUB: 256 + _DepthUMXSA: 8 + _DepthUMXSB: 8 + _DepthUMetadata: 256 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + enableLDSTrMXSA: false + enableLDSTrMXSB: false + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + tailLoopOptMXSA: false + tailLoopOptMXSB: false + - 1LDSBuffer: 0 + 
ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AdaptiveGemm: 0 + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1DivByMT1LowbitGT1: 0 + AssertFree1ElementMultiple: 1 + AssertKRingShiftTailWrapOnly: 0 + AssertSummationElementMultiple: 256 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BAddrInterleave: false + BaseName: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT12Tzx41lFmgkHO5_GDlXlBjYPnuHfWnU6iLujoAD27WHo= + BufferLoad: true + BufferStore: 1 + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 256 + DirectToLds: 1 + DirectToLdsA: true + DirectToLdsB: true + DirectToLdsMXSA: true + DirectToLdsMXSB: true + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprMXSA: false + DirectToVgprMXSB: false + DirectToVgprSparseMetadata: false + DtlPlusLdsBuf: 0 + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: false + ExpertSchedulingMode: 0 + ExtraLatencyForLR: 0 + ExtraMiLatencyLeft: -1 + ForceDisableShadowInit: false + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 32 + GlobalReadVectorWidthB: 32 + GlobalReadVectorWidthMXSA: 4 + GlobalReadVectorWidthMXSB: 10 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + KRingShift: false + Kernel: true + KernelLanguage: Assembly + KernelNameMin: 
Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT128x320x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT4_10_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 256 + LSCB: 256 + LSCMXSA: 8 + LSCMXSB: 8 + LSPA: 32 + LSPB: 32 + LSPMXSA: 128 + LSPMXSB: 320 + LVCA: 8 + LVCB: 8 + LVCMXSA: 2 + LVCMXSB: 1 + LVPA: 1 + LVPB: 1 + LVPMXSA: 32 + LVPMXSB: 32 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsBlockSizePerPadMXSA: 0 + LdsBlockSizePerPadMXSB: 0 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 126464 + LdsInitCVgprs: false + LdsNumBytes: 126464 + LdsNumElementsAlignedA: 16384 + LdsNumElementsAlignedB: 40960 + LdsNumElementsAlignedMXSA: 1024 + LdsNumElementsAlignedMXSB: 2560 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 19968 + LdsOffsetB_Blk: 85504 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMXSA: 16384 + LdsOffsetMXSA_Blk: 81920 + LdsOffsetMXSB: 17408 + LdsOffsetMXSB_Blk: 82944 + LdsOffsetMetadata: 19968 + LdsOffsetMetadata_Blk: 85504 + LdsPadA: 0 + LdsPadB: 0 + LdsPadMXSA: 0 + LdsPadMXSB: 0 + LdsPadMetadata: 0 + LocalReadVectorWidth: -1 + LocalReadVectorWidthA: 32 + LocalReadVectorWidthB: 32 + LocalReadVectorWidthMXSA: 1 + LocalReadVectorWidthMXSB: 1 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LocalWriteUseSgprMXSA: true + LocalWriteUseSgprMXSB: true + LoopIters: 2 + LoopUnroll: 256 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + 
MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMXSA: 1 + MIInputPerThreadMXSB: 1 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 10] + MIWaveTileA: 4 + MIWaveTileB: 10 + MIWaveTileMXSA: 4 + MIWaveTileMXSB: 10 + MIWaveTileMetadata: 0 + MacroTile0: 128 + MacroTile1: 320 + MacroTileA: 128 + MacroTileB: 320 + MacroTileMXSA: 128 + MacroTileMXSB: 320 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + MinGRIncPerMfma: 3 + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonDTLTailLoopMXSA: true + NonDTLTailLoopMXSB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 4 + NonTemporalE: 0 + NonTemporalMXSA: 0 + NonTemporalMXSB: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 160 + NumGlobalWriteVectorsPerThread: 160 + NumLdsBlk: 2 + NumLoadsA: 4 + NumLoadsB: 10 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsCoalescedMXSA: 1 + NumLoadsCoalescedMXSB: 1 + NumLoadsMXSA: 1 + NumLoadsMXSB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 10 + NumLoadsPerpendicularMXSA: 1 + NumLoadsPerpendicularMXSB: 1 + NumThreads: 256 + NumTotalPackedLoadsA: -1 + NumTotalPackedLoadsB: -1 + NumTotalPackedLoadsMXSA: -1 + NumTotalPackedLoadsMXSB: -1 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGROverBarrier: 0 + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 22 + 
SolutionNameMin: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT128x320x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT4_10_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM16_WGMXCC2_WGMXCCGn1 + SourceSwap: false + SpaceFillingAlgo: [] + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 4 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SubGroupMXSA: 8 + SubGroupMXSB: 32 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: 0 + TailloopInNll: false + ThreadTile: [1, 1] + ThreadTile0: 16 + ThreadTile1: 10 + ThreadTileA: 16 + ThreadTileB: 10 + ThreadTileMXSA: 16 + ThreadTileMXSB: 10 + TransposeLDS: 1 + TransposeLDSMetadata: -1 + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMXSA: true + UnrollMajorLDSMXSB: true + Use64bShadowLimit: false + Use64bShadowLimitMX: false + UseCustomMainLoopSchedule: 0 + UseDirect32XEmulation: false + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: false + UseGeneralizedNLCOneA: false + UseGeneralizedNLCOneB: false + UseGeneralizedNLCOneMXSA: false + UseGeneralizedNLCOneMXSB: false + UseGeneralizedNLCOneMetadata: false + UseInstOffsetForGRO: 0 + UseMFMAF32XEmulation: false + UsePLRPack: 0 + UseSgprForGRO: -1 + UseSubtileImpl: true + Valid: true + VectorStore: -1 + VectorWidthA: 
1 + VectorWidthB: 1 + VectorWidthMXSA: 1 + VectorWidthMXSB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMXSA: 0 + WaveSeparateGlobalReadMXSB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 16 + WorkGroupMappingXCC: 2 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 256 + _DepthUA: 256 + _DepthUB: 256 + _DepthUMXSA: 8 + _DepthUMXSB: 8 + _DepthUMetadata: 256 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + enableLDSTrMXSA: false + enableLDSTrMXSB: false + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + tailLoopOptMXSA: false + tailLoopOptMXSB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AdaptiveGemm: 0 + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1DivByMT1LowbitGT1: 0 + AssertFree1ElementMultiple: 1 + AssertKRingShiftTailWrapOnly: 0 + AssertSummationElementMultiple: 256 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BAddrInterleave: false + BaseName: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT329TK1O9xqAG2cUdkJ_6Jkg-kbMKNtFWKGnhC0gVHbf3c= + BufferLoad: true + BufferStore: 1 + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 256 + DirectToLds: 1 + DirectToLdsA: true + DirectToLdsB: true + DirectToLdsMXSA: true + DirectToLdsMXSB: true + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprMXSA: false + DirectToVgprMXSB: false + DirectToVgprSparseMetadata: 
false + DtlPlusLdsBuf: 0 + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: false + ExpertSchedulingMode: 0 + ExtraLatencyForLR: 0 + ExtraMiLatencyLeft: -1 + ForceDisableShadowInit: false + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 32 + GlobalReadVectorWidthB: 32 + GlobalReadVectorWidthMXSA: 10 + GlobalReadVectorWidthMXSB: 4 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + KRingShift: false + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT320x128x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT10_4_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 256 + LSCB: 256 + LSCMXSA: 8 + LSCMXSB: 8 + LSPA: 32 + LSPB: 32 + LSPMXSA: 320 + LSPMXSB: 128 + LVCA: 8 + LVCB: 8 + LVCMXSA: 1 + LVCMXSB: 2 + LVPA: 1 + LVPB: 1 + LVPMXSA: 32 + LVPMXSB: 32 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsBlockSizePerPadMXSA: 0 + LdsBlockSizePerPadMXSB: 0 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 126464 + LdsInitCVgprs: false + LdsNumBytes: 126464 + 
LdsNumElementsAlignedA: 40960 + LdsNumElementsAlignedB: 16384 + LdsNumElementsAlignedMXSA: 2560 + LdsNumElementsAlignedMXSB: 1024 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 44544 + LdsOffsetB_Blk: 110080 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMXSA: 40960 + LdsOffsetMXSA_Blk: 106496 + LdsOffsetMXSB: 43520 + LdsOffsetMXSB_Blk: 109056 + LdsOffsetMetadata: 44544 + LdsOffsetMetadata_Blk: 110080 + LdsPadA: 0 + LdsPadB: 0 + LdsPadMXSA: 0 + LdsPadMXSB: 0 + LdsPadMetadata: 0 + LocalReadVectorWidth: -1 + LocalReadVectorWidthA: 32 + LocalReadVectorWidthB: 32 + LocalReadVectorWidthMXSA: 1 + LocalReadVectorWidthMXSB: 1 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LocalWriteUseSgprMXSA: true + LocalWriteUseSgprMXSB: true + LoopIters: 2 + LoopUnroll: 256 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMXSA: 1 + MIInputPerThreadMXSB: 1 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [10, 4] + MIWaveTileA: 10 + MIWaveTileB: 4 + MIWaveTileMXSA: 10 + MIWaveTileMXSB: 4 + MIWaveTileMetadata: 0 + MacroTile0: 320 + MacroTile1: 128 + MacroTileA: 320 + MacroTileB: 128 + MacroTileMXSA: 320 + MacroTileMXSB: 128 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + MinGRIncPerMfma: 3 + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonDTLTailLoopMXSA: true + NonDTLTailLoopMXSB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 4 + 
NonTemporalE: 0 + NonTemporalMXSA: 0 + NonTemporalMXSB: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 160 + NumGlobalWriteVectorsPerThread: 160 + NumLdsBlk: 2 + NumLoadsA: 10 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsCoalescedMXSA: 1 + NumLoadsCoalescedMXSB: 1 + NumLoadsMXSA: 1 + NumLoadsMXSB: 1 + NumLoadsPerpendicularA: 10 + NumLoadsPerpendicularB: 4 + NumLoadsPerpendicularMXSA: 1 + NumLoadsPerpendicularMXSB: 1 + NumThreads: 256 + NumTotalPackedLoadsA: -1 + NumTotalPackedLoadsB: -1 + NumTotalPackedLoadsMXSA: -1 + NumTotalPackedLoadsMXSB: -1 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGROverBarrier: 0 + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 23 + SolutionNameMin: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT320x128x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT10_4_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM16_WGMXCC2_WGMXCCGn1 + SourceSwap: false + SpaceFillingAlgo: [] + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 4 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + 
SubGroupB: 32 + SubGroupMXSA: 8 + SubGroupMXSB: 32 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: 0 + TailloopInNll: false + ThreadTile: [1, 1] + ThreadTile0: 40 + ThreadTile1: 4 + ThreadTileA: 40 + ThreadTileB: 4 + ThreadTileMXSA: 40 + ThreadTileMXSB: 4 + TransposeLDS: 1 + TransposeLDSMetadata: -1 + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMXSA: true + UnrollMajorLDSMXSB: true + Use64bShadowLimit: false + Use64bShadowLimitMX: false + UseCustomMainLoopSchedule: 0 + UseDirect32XEmulation: false + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: false + UseGeneralizedNLCOneA: false + UseGeneralizedNLCOneB: false + UseGeneralizedNLCOneMXSA: false + UseGeneralizedNLCOneMXSB: false + UseGeneralizedNLCOneMetadata: false + UseInstOffsetForGRO: 0 + UseMFMAF32XEmulation: false + UsePLRPack: 0 + UseSgprForGRO: -1 + UseSubtileImpl: true + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 1 + VectorWidthMXSA: 1 + VectorWidthMXSB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMXSA: 0 + WaveSeparateGlobalReadMXSB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 16 + WorkGroupMappingXCC: 2 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 256 + _DepthUA: 256 + _DepthUB: 256 + _DepthUMXSA: 8 + _DepthUMXSB: 8 + _DepthUMetadata: 256 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + enableLDSTrMXSA: false + enableLDSTrMXSB: false + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + tailLoopOptMXSA: false + tailLoopOptMXSB: false + - 1LDSBuffer: 
0 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AdaptiveGemm: 0 + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1DivByMT1LowbitGT1: 0 + AssertFree1ElementMultiple: 1 + AssertKRingShiftTailWrapOnly: 0 + AssertSummationElementMultiple: 256 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BAddrInterleave: false + BaseName: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT12Ko14x9Bm17skxIK2J-9eSMhxZf1WQTws5W6lZ4ij_RU= + BufferLoad: true + BufferStore: 1 + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 256 + DirectToLds: 1 + DirectToLdsA: true + DirectToLdsB: true + DirectToLdsMXSA: true + DirectToLdsMXSB: true + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprMXSA: false + DirectToVgprMXSB: false + DirectToVgprSparseMetadata: false + DtlPlusLdsBuf: 0 + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: false + ExpertSchedulingMode: 0 + ExtraLatencyForLR: 0 + ExtraMiLatencyLeft: -1 + ForceDisableShadowInit: false + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 32 + GlobalReadVectorWidthB: 32 + GlobalReadVectorWidthMXSA: 4 + GlobalReadVectorWidthMXSB: 12 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + KRingShift: false + Kernel: true + KernelLanguage: Assembly + KernelNameMin: 
Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT128x384x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT4_12_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 256 + LSCB: 256 + LSCMXSA: 8 + LSCMXSB: 8 + LSPA: 32 + LSPB: 32 + LSPMXSA: 128 + LSPMXSB: 384 + LVCA: 8 + LVCB: 8 + LVCMXSA: 2 + LVCMXSB: 1 + LVPA: 1 + LVPB: 1 + LVPMXSA: 32 + LVPMXSB: 32 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsBlockSizePerPadMXSA: 0 + LdsBlockSizePerPadMXSB: 0 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 139264 + LdsInitCVgprs: false + LdsNumBytes: 139264 + LdsNumElementsAlignedA: 16384 + LdsNumElementsAlignedB: 49152 + LdsNumElementsAlignedMXSA: 1024 + LdsNumElementsAlignedMXSB: 3072 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 69632 + LdsOffsetB: 20480 + LdsOffsetB_Blk: 90112 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMXSA: 16384 + LdsOffsetMXSA_Blk: 86016 + LdsOffsetMXSB: 17408 + LdsOffsetMXSB_Blk: 87040 + LdsOffsetMetadata: 20480 + LdsOffsetMetadata_Blk: 90112 + LdsPadA: 0 + LdsPadB: 0 + LdsPadMXSA: 0 + LdsPadMXSB: 0 + LdsPadMetadata: 0 + LocalReadVectorWidth: -1 + LocalReadVectorWidthA: 32 + LocalReadVectorWidthB: 32 + LocalReadVectorWidthMXSA: 1 + LocalReadVectorWidthMXSB: 1 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LocalWriteUseSgprMXSA: true + LocalWriteUseSgprMXSB: true + LoopIters: 2 + LoopUnroll: 256 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + 
MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMXSA: 1 + MIInputPerThreadMXSB: 1 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 12] + MIWaveTileA: 4 + MIWaveTileB: 12 + MIWaveTileMXSA: 4 + MIWaveTileMXSB: 12 + MIWaveTileMetadata: 0 + MacroTile0: 128 + MacroTile1: 384 + MacroTileA: 128 + MacroTileB: 384 + MacroTileMXSA: 128 + MacroTileMXSB: 384 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + MinGRIncPerMfma: 3 + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonDTLTailLoopMXSA: true + NonDTLTailLoopMXSB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 4 + NonTemporalE: 0 + NonTemporalMXSA: 0 + NonTemporalMXSB: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 192 + NumGlobalWriteVectorsPerThread: 192 + NumLdsBlk: 2 + NumLoadsA: 4 + NumLoadsB: 12 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsCoalescedMXSA: 1 + NumLoadsCoalescedMXSB: 1 + NumLoadsMXSA: 1 + NumLoadsMXSB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 12 + NumLoadsPerpendicularMXSA: 1 + NumLoadsPerpendicularMXSB: 1 + NumThreads: 256 + NumTotalPackedLoadsA: -1 + NumTotalPackedLoadsB: -1 + NumTotalPackedLoadsMXSA: -1 + NumTotalPackedLoadsMXSB: -1 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGROverBarrier: 0 + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 24 + 
SolutionNameMin: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT128x384x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT4_12_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM16_WGMXCC2_WGMXCCGn1 + SourceSwap: false + SpaceFillingAlgo: [] + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: true + StoreSyncOpt: 0 + StoreVectorWidth: 4 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SubGroupMXSA: 8 + SubGroupMXSB: 32 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: 0 + TailloopInNll: false + ThreadTile: [1, 1] + ThreadTile0: 16 + ThreadTile1: 12 + ThreadTileA: 16 + ThreadTileB: 12 + ThreadTileMXSA: 16 + ThreadTileMXSB: 12 + TransposeLDS: 1 + TransposeLDSMetadata: -1 + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMXSA: true + UnrollMajorLDSMXSB: true + Use64bShadowLimit: false + Use64bShadowLimitMX: false + UseCustomMainLoopSchedule: 0 + UseDirect32XEmulation: false + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: false + UseGeneralizedNLCOneA: false + UseGeneralizedNLCOneB: false + UseGeneralizedNLCOneMXSA: false + UseGeneralizedNLCOneMXSB: false + UseGeneralizedNLCOneMetadata: false + UseInstOffsetForGRO: 0 + UseMFMAF32XEmulation: false + UsePLRPack: 0 + UseSgprForGRO: -1 + UseSubtileImpl: true + Valid: true + VectorStore: -1 + VectorWidthA: 1 
+ VectorWidthB: 1 + VectorWidthMXSA: 1 + VectorWidthMXSB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMXSA: 0 + WaveSeparateGlobalReadMXSB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 16 + WorkGroupMappingXCC: 2 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 256 + _DepthUA: 256 + _DepthUB: 256 + _DepthUMXSA: 8 + _DepthUMXSB: 8 + _DepthUMetadata: 256 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + enableLDSTrMXSA: false + enableLDSTrMXSB: false + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + tailLoopOptMXSA: false + tailLoopOptMXSB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AdaptiveGemm: 0 + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1DivByMT1LowbitGT1: 0 + AssertFree1ElementMultiple: 1 + AssertKRingShiftTailWrapOnly: 0 + AssertSummationElementMultiple: 256 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BAddrInterleave: false + BaseName: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT38PfCBP4cNoa67MDHKbRKQKP0baWWAor2QL1_1-ioSnxM= + BufferLoad: true + BufferStore: 1 + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 256 + DirectToLds: 1 + DirectToLdsA: true + DirectToLdsB: true + DirectToLdsMXSA: true + DirectToLdsMXSB: true + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprMXSA: false + DirectToVgprMXSB: false + DirectToVgprSparseMetadata: false 
+ DtlPlusLdsBuf: 0 + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: false + ExpertSchedulingMode: 0 + ExtraLatencyForLR: 0 + ExtraMiLatencyLeft: -1 + ForceDisableShadowInit: false + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 32 + GlobalReadVectorWidthB: 32 + GlobalReadVectorWidthMXSA: 12 + GlobalReadVectorWidthMXSB: 4 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + KRingShift: false + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT384x128x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT12_4_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 256 + LSCB: 256 + LSCMXSA: 8 + LSCMXSB: 8 + LSPA: 32 + LSPB: 32 + LSPMXSA: 384 + LSPMXSB: 128 + LVCA: 8 + LVCB: 8 + LVCMXSA: 1 + LVCMXSB: 2 + LVPA: 1 + LVPB: 1 + LVPMXSA: 32 + LVPMXSB: 32 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsBlockSizePerPadMXSA: 0 + LdsBlockSizePerPadMXSB: 0 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 139264 + LdsInitCVgprs: false + LdsNumBytes: 139264 + 
LdsNumElementsAlignedA: 49152 + LdsNumElementsAlignedB: 16384 + LdsNumElementsAlignedMXSA: 3072 + LdsNumElementsAlignedMXSB: 1024 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 69632 + LdsOffsetB: 53248 + LdsOffsetB_Blk: 122880 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMXSA: 49152 + LdsOffsetMXSA_Blk: 118784 + LdsOffsetMXSB: 52224 + LdsOffsetMXSB_Blk: 121856 + LdsOffsetMetadata: 53248 + LdsOffsetMetadata_Blk: 122880 + LdsPadA: 0 + LdsPadB: 0 + LdsPadMXSA: 0 + LdsPadMXSB: 0 + LdsPadMetadata: 0 + LocalReadVectorWidth: -1 + LocalReadVectorWidthA: 32 + LocalReadVectorWidthB: 32 + LocalReadVectorWidthMXSA: 1 + LocalReadVectorWidthMXSB: 1 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LocalWriteUseSgprMXSA: true + LocalWriteUseSgprMXSB: true + LoopIters: 2 + LoopUnroll: 256 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMXSA: 1 + MIInputPerThreadMXSB: 1 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [12, 4] + MIWaveTileA: 12 + MIWaveTileB: 4 + MIWaveTileMXSA: 12 + MIWaveTileMXSB: 4 + MIWaveTileMetadata: 0 + MacroTile0: 384 + MacroTile1: 128 + MacroTileA: 384 + MacroTileB: 128 + MacroTileMXSA: 384 + MacroTileMXSB: 128 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + MinGRIncPerMfma: 3 + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonDTLTailLoopMXSA: true + NonDTLTailLoopMXSB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 4 + 
NonTemporalE: 0 + NonTemporalMXSA: 0 + NonTemporalMXSB: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 192 + NumGlobalWriteVectorsPerThread: 192 + NumLdsBlk: 2 + NumLoadsA: 12 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsCoalescedMXSA: 1 + NumLoadsCoalescedMXSB: 1 + NumLoadsMXSA: 1 + NumLoadsMXSB: 1 + NumLoadsPerpendicularA: 12 + NumLoadsPerpendicularB: 4 + NumLoadsPerpendicularMXSA: 1 + NumLoadsPerpendicularMXSB: 1 + NumThreads: 256 + NumTotalPackedLoadsA: -1 + NumTotalPackedLoadsB: -1 + NumTotalPackedLoadsMXSA: -1 + NumTotalPackedLoadsMXSB: -1 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGROverBarrier: 0 + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 25 + SolutionNameMin: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT384x128x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT12_4_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM16_WGMXCC2_WGMXCCGn1 + SourceSwap: false + SpaceFillingAlgo: [] + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: true + StoreSyncOpt: 0 + StoreVectorWidth: 4 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + 
SubGroupB: 32 + SubGroupMXSA: 8 + SubGroupMXSB: 32 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: 0 + TailloopInNll: false + ThreadTile: [1, 1] + ThreadTile0: 48 + ThreadTile1: 4 + ThreadTileA: 48 + ThreadTileB: 4 + ThreadTileMXSA: 48 + ThreadTileMXSB: 4 + TransposeLDS: 1 + TransposeLDSMetadata: -1 + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMXSA: true + UnrollMajorLDSMXSB: true + Use64bShadowLimit: false + Use64bShadowLimitMX: false + UseCustomMainLoopSchedule: 0 + UseDirect32XEmulation: false + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: false + UseGeneralizedNLCOneA: false + UseGeneralizedNLCOneB: false + UseGeneralizedNLCOneMXSA: false + UseGeneralizedNLCOneMXSB: false + UseGeneralizedNLCOneMetadata: false + UseInstOffsetForGRO: 0 + UseMFMAF32XEmulation: false + UsePLRPack: 0 + UseSgprForGRO: -1 + UseSubtileImpl: true + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 1 + VectorWidthMXSA: 1 + VectorWidthMXSB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMXSA: 0 + WaveSeparateGlobalReadMXSB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 16 + WorkGroupMappingXCC: 2 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 256 + _DepthUA: 256 + _DepthUB: 256 + _DepthUMXSA: 8 + _DepthUMXSB: 8 + _DepthUMetadata: 256 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + enableLDSTrMXSA: false + enableLDSTrMXSB: false + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + tailLoopOptMXSA: false + tailLoopOptMXSB: false + - 1LDSBuffer: 
0 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AdaptiveGemm: 0 + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1DivByMT1LowbitGT1: 0 + AssertFree1ElementMultiple: 1 + AssertKRingShiftTailWrapOnly: 0 + AssertSummationElementMultiple: 256 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BAddrInterleave: false + BaseName: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT32GHMtx7cclUEZq8HXp3EV83TsDigTCOy4HpQzcwTdNNQ= + BufferLoad: true + BufferStore: 1 + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 256 + DirectToLds: 1 + DirectToLdsA: true + DirectToLdsB: true + DirectToLdsMXSA: true + DirectToLdsMXSB: true + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprMXSA: false + DirectToVgprMXSB: false + DirectToVgprSparseMetadata: false + DtlPlusLdsBuf: 0 + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: false + ExpertSchedulingMode: 0 + ExtraLatencyForLR: 0 + ExtraMiLatencyLeft: -1 + ForceDisableShadowInit: false + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 32 + GlobalReadVectorWidthB: 32 + GlobalReadVectorWidthMXSA: 10 + GlobalReadVectorWidthMXSB: 4 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + KRingShift: false + Kernel: true + KernelLanguage: Assembly + KernelNameMin: 
Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT320x256x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT10_8_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 256 + LSCB: 256 + LSCMXSA: 8 + LSCMXSB: 8 + LSPA: 32 + LSPB: 32 + LSPMXSA: 320 + LSPMXSB: 128 + LVCA: 8 + LVCB: 8 + LVCMXSA: 1 + LVCMXSB: 2 + LVPA: 1 + LVPB: 1 + LVPMXSA: 32 + LVPMXSB: 32 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsBlockSizePerPadMXSA: 0 + LdsBlockSizePerPadMXSB: 0 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 156672 + LdsInitCVgprs: false + LdsNumBytes: 156672 + LdsNumElementsAlignedA: 40960 + LdsNumElementsAlignedB: 32768 + LdsNumElementsAlignedMXSA: 2560 + LdsNumElementsAlignedMXSB: 2048 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 78336 + LdsOffsetB: 45568 + LdsOffsetB_Blk: 123904 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMXSA: 40960 + LdsOffsetMXSA_Blk: 119296 + LdsOffsetMXSB: 43520 + LdsOffsetMXSB_Blk: 121856 + LdsOffsetMetadata: 45568 + LdsOffsetMetadata_Blk: 123904 + LdsPadA: 0 + LdsPadB: 0 + LdsPadMXSA: 0 + LdsPadMXSB: 0 + LdsPadMetadata: 0 + LocalReadVectorWidth: -1 + LocalReadVectorWidthA: 32 + LocalReadVectorWidthB: 32 + LocalReadVectorWidthMXSA: 1 + LocalReadVectorWidthMXSB: 1 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LocalWriteUseSgprMXSA: true + LocalWriteUseSgprMXSB: true + LoopIters: 2 + LoopUnroll: 256 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + 
MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMXSA: 1 + MIInputPerThreadMXSB: 1 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [10, 8] + MIWaveTileA: 10 + MIWaveTileB: 8 + MIWaveTileMXSA: 10 + MIWaveTileMXSB: 8 + MIWaveTileMetadata: 0 + MacroTile0: 320 + MacroTile1: 256 + MacroTileA: 320 + MacroTileB: 256 + MacroTileMXSA: 320 + MacroTileMXSB: 256 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + MinGRIncPerMfma: 3 + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonDTLTailLoopMXSA: true + NonDTLTailLoopMXSB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 4 + NonTemporalE: 0 + NonTemporalMXSA: 0 + NonTemporalMXSB: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 320 + NumGlobalWriteVectorsPerThread: 320 + NumLdsBlk: 2 + NumLoadsA: 10 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsCoalescedMXSA: 1 + NumLoadsCoalescedMXSB: 1 + NumLoadsMXSA: 1 + NumLoadsMXSB: 2 + NumLoadsPerpendicularA: 10 + NumLoadsPerpendicularB: 8 + NumLoadsPerpendicularMXSA: 1 + NumLoadsPerpendicularMXSB: 2 + NumThreads: 256 + NumTotalPackedLoadsA: -1 + NumTotalPackedLoadsB: -1 + NumTotalPackedLoadsMXSA: -1 + NumTotalPackedLoadsMXSB: -1 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGROverBarrier: 0 + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 26 + 
SolutionNameMin: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT320x256x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT10_8_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM16_WGMXCC2_WGMXCCGn1 + SourceSwap: false + SpaceFillingAlgo: [] + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: true + StoreSyncOpt: 0 + StoreVectorWidth: 4 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SubGroupMXSA: 8 + SubGroupMXSB: 32 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: 0 + TailloopInNll: false + ThreadTile: [1, 1] + ThreadTile0: 40 + ThreadTile1: 8 + ThreadTileA: 40 + ThreadTileB: 8 + ThreadTileMXSA: 40 + ThreadTileMXSB: 8 + TransposeLDS: 1 + TransposeLDSMetadata: -1 + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMXSA: true + UnrollMajorLDSMXSB: true + Use64bShadowLimit: false + Use64bShadowLimitMX: false + UseCustomMainLoopSchedule: 0 + UseDirect32XEmulation: false + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: false + UseGeneralizedNLCOneA: false + UseGeneralizedNLCOneB: false + UseGeneralizedNLCOneMXSA: false + UseGeneralizedNLCOneMXSB: false + UseGeneralizedNLCOneMetadata: false + UseInstOffsetForGRO: 0 + UseMFMAF32XEmulation: false + UsePLRPack: 0 + UseSgprForGRO: -1 + UseSubtileImpl: true + Valid: true + VectorStore: -1 + VectorWidthA: 1 + 
VectorWidthB: 1 + VectorWidthMXSA: 1 + VectorWidthMXSB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMXSA: 0 + WaveSeparateGlobalReadMXSB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 16 + WorkGroupMappingXCC: 2 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 256 + _DepthUA: 256 + _DepthUB: 256 + _DepthUMXSA: 8 + _DepthUMXSB: 8 + _DepthUMetadata: 256 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + enableLDSTrMXSA: false + enableLDSTrMXSB: false + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + tailLoopOptMXSA: false + tailLoopOptMXSB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AdaptiveGemm: 0 + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1DivByMT1LowbitGT1: 0 + AssertFree1ElementMultiple: 1 + AssertKRingShiftTailWrapOnly: 0 + AssertSummationElementMultiple: 256 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BAddrInterleave: false + BaseName: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT25yENsNJL4B9Vb5GlLD7ofBX9eYUJtt3JL_VzKdaPOs5g= + BufferLoad: true + BufferStore: 1 + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 256 + DirectToLds: 1 + DirectToLdsA: true + DirectToLdsB: true + DirectToLdsMXSA: true + DirectToLdsMXSB: true + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprMXSA: false + DirectToVgprMXSB: false + DirectToVgprSparseMetadata: false + 
DtlPlusLdsBuf: 0 + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: false + ExpertSchedulingMode: 0 + ExtraLatencyForLR: 0 + ExtraMiLatencyLeft: -1 + ForceDisableShadowInit: false + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 32 + GlobalReadVectorWidthB: 32 + GlobalReadVectorWidthMXSA: 4 + GlobalReadVectorWidthMXSB: 10 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + KRingShift: false + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT256x320x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT8_10_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 256 + LSCB: 256 + LSCMXSA: 8 + LSCMXSB: 8 + LSPA: 32 + LSPB: 32 + LSPMXSA: 128 + LSPMXSB: 320 + LVCA: 8 + LVCB: 8 + LVCMXSA: 2 + LVCMXSB: 1 + LVPA: 1 + LVPB: 1 + LVPMXSA: 32 + LVPMXSB: 32 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsBlockSizePerPadMXSA: 0 + LdsBlockSizePerPadMXSB: 0 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 156672 + LdsInitCVgprs: false + LdsNumBytes: 156672 + 
LdsNumElementsAlignedA: 32768 + LdsNumElementsAlignedB: 40960 + LdsNumElementsAlignedMXSA: 2048 + LdsNumElementsAlignedMXSB: 2560 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 78336 + LdsOffsetB: 37376 + LdsOffsetB_Blk: 115712 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMXSA: 32768 + LdsOffsetMXSA_Blk: 111104 + LdsOffsetMXSB: 34816 + LdsOffsetMXSB_Blk: 113152 + LdsOffsetMetadata: 37376 + LdsOffsetMetadata_Blk: 115712 + LdsPadA: 0 + LdsPadB: 0 + LdsPadMXSA: 0 + LdsPadMXSB: 0 + LdsPadMetadata: 0 + LocalReadVectorWidth: -1 + LocalReadVectorWidthA: 32 + LocalReadVectorWidthB: 32 + LocalReadVectorWidthMXSA: 1 + LocalReadVectorWidthMXSB: 1 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LocalWriteUseSgprMXSA: true + LocalWriteUseSgprMXSB: true + LoopIters: 2 + LoopUnroll: 256 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMXSA: 1 + MIInputPerThreadMXSB: 1 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [8, 10] + MIWaveTileA: 8 + MIWaveTileB: 10 + MIWaveTileMXSA: 8 + MIWaveTileMXSB: 10 + MIWaveTileMetadata: 0 + MacroTile0: 256 + MacroTile1: 320 + MacroTileA: 256 + MacroTileB: 320 + MacroTileMXSA: 256 + MacroTileMXSB: 320 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + MinGRIncPerMfma: 3 + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonDTLTailLoopMXSA: true + NonDTLTailLoopMXSB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 4 + 
NonTemporalE: 0 + NonTemporalMXSA: 0 + NonTemporalMXSB: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 320 + NumGlobalWriteVectorsPerThread: 320 + NumLdsBlk: 2 + NumLoadsA: 8 + NumLoadsB: 10 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsCoalescedMXSA: 1 + NumLoadsCoalescedMXSB: 1 + NumLoadsMXSA: 2 + NumLoadsMXSB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 10 + NumLoadsPerpendicularMXSA: 2 + NumLoadsPerpendicularMXSB: 1 + NumThreads: 256 + NumTotalPackedLoadsA: -1 + NumTotalPackedLoadsB: -1 + NumTotalPackedLoadsMXSA: -1 + NumTotalPackedLoadsMXSB: -1 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGROverBarrier: 0 + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 27 + SolutionNameMin: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT256x320x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT8_10_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM16_WGMXCC2_WGMXCCGn1 + SourceSwap: false + SpaceFillingAlgo: [] + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: true + StoreSyncOpt: 0 + StoreVectorWidth: 4 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + 
SubGroupB: 32 + SubGroupMXSA: 8 + SubGroupMXSB: 32 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: 0 + TailloopInNll: false + ThreadTile: [1, 1] + ThreadTile0: 32 + ThreadTile1: 10 + ThreadTileA: 32 + ThreadTileB: 10 + ThreadTileMXSA: 32 + ThreadTileMXSB: 10 + TransposeLDS: 1 + TransposeLDSMetadata: -1 + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMXSA: true + UnrollMajorLDSMXSB: true + Use64bShadowLimit: false + Use64bShadowLimitMX: false + UseCustomMainLoopSchedule: 0 + UseDirect32XEmulation: false + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: false + UseGeneralizedNLCOneA: false + UseGeneralizedNLCOneB: false + UseGeneralizedNLCOneMXSA: false + UseGeneralizedNLCOneMXSB: false + UseGeneralizedNLCOneMetadata: false + UseInstOffsetForGRO: 0 + UseMFMAF32XEmulation: false + UsePLRPack: 0 + UseSgprForGRO: -1 + UseSubtileImpl: true + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 1 + VectorWidthMXSA: 1 + VectorWidthMXSB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMXSA: 0 + WaveSeparateGlobalReadMXSB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 16 + WorkGroupMappingXCC: 2 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 256 + _DepthUA: 256 + _DepthUB: 256 + _DepthUMXSA: 8 + _DepthUMXSB: 8 + _DepthUMetadata: 256 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + enableLDSTrMXSA: false + enableLDSTrMXSB: false + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + tailLoopOptMXSA: false + tailLoopOptMXSB: false + - 
1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AdaptiveGemm: 0 + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1DivByMT1LowbitGT1: 0 + AssertFree1ElementMultiple: 1 + AssertKRingShiftTailWrapOnly: 0 + AssertSummationElementMultiple: 256 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BAddrInterleave: false + BaseName: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT64IqbWxZkPcZWy3vA8g9HsDHfQ7n72vJptvmQ79eUQ7kk= + BufferLoad: true + BufferStore: 1 + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 256 + DirectToLds: 1 + DirectToLdsA: true + DirectToLdsB: true + DirectToLdsMXSA: true + DirectToLdsMXSB: true + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprMXSA: false + DirectToVgprMXSB: false + DirectToVgprSparseMetadata: false + DtlPlusLdsBuf: 0 + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: false + ExpertSchedulingMode: 0 + ExtraLatencyForLR: 0 + ExtraMiLatencyLeft: -1 + ForceDisableShadowInit: false + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 32 + GlobalReadVectorWidthB: 32 + GlobalReadVectorWidthMXSA: 2 + GlobalReadVectorWidthMXSB: 14 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + KRingShift: false + Kernel: true + KernelLanguage: Assembly + KernelNameMin: 
Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT64x448x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT2_14_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 256 + LSCB: 256 + LSCMXSA: 8 + LSCMXSB: 8 + LSPA: 32 + LSPB: 32 + LSPMXSA: 64 + LSPMXSB: 448 + LVCA: 8 + LVCB: 8 + LVCMXSA: 4 + LVCMXSB: 1 + LVPA: 1 + LVPB: 1 + LVPMXSA: 32 + LVPMXSB: 32 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsBlockSizePerPadMXSA: 0 + LdsBlockSizePerPadMXSB: 0 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 139264 + LdsInitCVgprs: false + LdsNumBytes: 139264 + LdsNumElementsAlignedA: 8192 + LdsNumElementsAlignedB: 57344 + LdsNumElementsAlignedMXSA: 512 + LdsNumElementsAlignedMXSB: 3584 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 69632 + LdsOffsetB: 12288 + LdsOffsetB_Blk: 81920 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMXSA: 8192 + LdsOffsetMXSA_Blk: 77824 + LdsOffsetMXSB: 8704 + LdsOffsetMXSB_Blk: 78336 + LdsOffsetMetadata: 12288 + LdsOffsetMetadata_Blk: 81920 + LdsPadA: 0 + LdsPadB: 0 + LdsPadMXSA: 0 + LdsPadMXSB: 0 + LdsPadMetadata: 0 + LocalReadVectorWidth: -1 + LocalReadVectorWidthA: 32 + LocalReadVectorWidthB: 32 + LocalReadVectorWidthMXSA: 1 + LocalReadVectorWidthMXSB: 1 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LocalWriteUseSgprMXSA: true + LocalWriteUseSgprMXSB: true + LoopIters: 2 + LoopUnroll: 256 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + 
MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMXSA: 1 + MIInputPerThreadMXSB: 1 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [2, 14] + MIWaveTileA: 2 + MIWaveTileB: 14 + MIWaveTileMXSA: 2 + MIWaveTileMXSB: 14 + MIWaveTileMetadata: 0 + MacroTile0: 64 + MacroTile1: 448 + MacroTileA: 64 + MacroTileB: 448 + MacroTileMXSA: 64 + MacroTileMXSB: 448 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + MinGRIncPerMfma: 3 + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonDTLTailLoopMXSA: true + NonDTLTailLoopMXSB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 4 + NonTemporalE: 0 + NonTemporalMXSA: 0 + NonTemporalMXSB: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 112 + NumGlobalWriteVectorsPerThread: 112 + NumLdsBlk: 2 + NumLoadsA: 2 + NumLoadsB: 14 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsCoalescedMXSA: 1 + NumLoadsCoalescedMXSB: 1 + NumLoadsMXSA: 1 + NumLoadsMXSB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 14 + NumLoadsPerpendicularMXSA: 1 + NumLoadsPerpendicularMXSB: 1 + NumThreads: 256 + NumTotalPackedLoadsA: -1 + NumTotalPackedLoadsB: -1 + NumTotalPackedLoadsMXSA: -1 + NumTotalPackedLoadsMXSB: -1 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGROverBarrier: 0 + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 28 + 
SolutionNameMin: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT64x448x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT2_14_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM16_WGMXCC2_WGMXCCGn1 + SourceSwap: false + SpaceFillingAlgo: [] + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: true + StoreSyncOpt: 0 + StoreVectorWidth: 4 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SubGroupMXSA: 8 + SubGroupMXSB: 32 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: 0 + TailloopInNll: false + ThreadTile: [1, 1] + ThreadTile0: 8 + ThreadTile1: 14 + ThreadTileA: 8 + ThreadTileB: 14 + ThreadTileMXSA: 8 + ThreadTileMXSB: 14 + TransposeLDS: 1 + TransposeLDSMetadata: -1 + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMXSA: true + UnrollMajorLDSMXSB: true + Use64bShadowLimit: false + Use64bShadowLimitMX: false + UseCustomMainLoopSchedule: 0 + UseDirect32XEmulation: false + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: false + UseGeneralizedNLCOneA: false + UseGeneralizedNLCOneB: false + UseGeneralizedNLCOneMXSA: false + UseGeneralizedNLCOneMXSB: false + UseGeneralizedNLCOneMetadata: false + UseInstOffsetForGRO: 0 + UseMFMAF32XEmulation: false + UsePLRPack: 0 + UseSgprForGRO: -1 + UseSubtileImpl: true + Valid: true + VectorStore: -1 + VectorWidthA: 1 + 
VectorWidthB: 1 + VectorWidthMXSA: 1 + VectorWidthMXSB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMXSA: 0 + WaveSeparateGlobalReadMXSB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 16 + WorkGroupMappingXCC: 2 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 256 + _DepthUA: 256 + _DepthUB: 256 + _DepthUMXSA: 8 + _DepthUMXSB: 8 + _DepthUMetadata: 256 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + enableLDSTrMXSA: false + enableLDSTrMXSB: false + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + tailLoopOptMXSA: false + tailLoopOptMXSB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AdaptiveGemm: 0 + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1DivByMT1LowbitGT1: 0 + AssertFree1ElementMultiple: 1 + AssertKRingShiftTailWrapOnly: 0 + AssertSummationElementMultiple: 256 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BAddrInterleave: false + BaseName: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT44JIOkXLbpjhS22SFIEupmgY9bhtzcXu69kfhwBJBO5wo= + BufferLoad: true + BufferStore: 1 + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 256 + DirectToLds: 1 + DirectToLdsA: true + DirectToLdsB: true + DirectToLdsMXSA: true + DirectToLdsMXSB: true + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprMXSA: false + DirectToVgprMXSB: false + DirectToVgprSparseMetadata: false + 
DtlPlusLdsBuf: 0 + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: false + ExpertSchedulingMode: 0 + ExtraLatencyForLR: 0 + ExtraMiLatencyLeft: -1 + ForceDisableShadowInit: false + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 32 + GlobalReadVectorWidthB: 32 + GlobalReadVectorWidthMXSA: 14 + GlobalReadVectorWidthMXSB: 2 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + KRingShift: false + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT448x64x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT14_2_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 256 + LSCB: 256 + LSCMXSA: 8 + LSCMXSB: 8 + LSPA: 32 + LSPB: 32 + LSPMXSA: 448 + LSPMXSB: 64 + LVCA: 8 + LVCB: 8 + LVCMXSA: 1 + LVCMXSB: 4 + LVPA: 1 + LVPB: 1 + LVPMXSA: 32 + LVPMXSB: 32 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsBlockSizePerPadMXSA: 0 + LdsBlockSizePerPadMXSB: 0 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 139264 + LdsInitCVgprs: false + LdsNumBytes: 139264 + 
LdsNumElementsAlignedA: 57344 + LdsNumElementsAlignedB: 8192 + LdsNumElementsAlignedMXSA: 3584 + LdsNumElementsAlignedMXSB: 512 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 69632 + LdsOffsetB: 61440 + LdsOffsetB_Blk: 131072 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMXSA: 57344 + LdsOffsetMXSA_Blk: 126976 + LdsOffsetMXSB: 60928 + LdsOffsetMXSB_Blk: 130560 + LdsOffsetMetadata: 61440 + LdsOffsetMetadata_Blk: 131072 + LdsPadA: 0 + LdsPadB: 0 + LdsPadMXSA: 0 + LdsPadMXSB: 0 + LdsPadMetadata: 0 + LocalReadVectorWidth: -1 + LocalReadVectorWidthA: 32 + LocalReadVectorWidthB: 32 + LocalReadVectorWidthMXSA: 1 + LocalReadVectorWidthMXSB: 1 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LocalWriteUseSgprMXSA: true + LocalWriteUseSgprMXSB: true + LoopIters: 2 + LoopUnroll: 256 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMXSA: 1 + MIInputPerThreadMXSB: 1 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [14, 2] + MIWaveTileA: 14 + MIWaveTileB: 2 + MIWaveTileMXSA: 14 + MIWaveTileMXSB: 2 + MIWaveTileMetadata: 0 + MacroTile0: 448 + MacroTile1: 64 + MacroTileA: 448 + MacroTileB: 64 + MacroTileMXSA: 448 + MacroTileMXSB: 64 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + MinGRIncPerMfma: 3 + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonDTLTailLoopMXSA: true + NonDTLTailLoopMXSB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 4 + 
NonTemporalE: 0 + NonTemporalMXSA: 0 + NonTemporalMXSB: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 112 + NumGlobalWriteVectorsPerThread: 112 + NumLdsBlk: 2 + NumLoadsA: 14 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsCoalescedMXSA: 1 + NumLoadsCoalescedMXSB: 1 + NumLoadsMXSA: 1 + NumLoadsMXSB: 1 + NumLoadsPerpendicularA: 14 + NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularMXSA: 1 + NumLoadsPerpendicularMXSB: 1 + NumThreads: 256 + NumTotalPackedLoadsA: -1 + NumTotalPackedLoadsB: -1 + NumTotalPackedLoadsMXSA: -1 + NumTotalPackedLoadsMXSB: -1 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGROverBarrier: 0 + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 29 + SolutionNameMin: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT448x64x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT14_2_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM16_WGMXCC2_WGMXCCGn1 + SourceSwap: false + SpaceFillingAlgo: [] + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: true + StoreSyncOpt: 0 + StoreVectorWidth: 4 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 
32 + SubGroupMXSA: 8 + SubGroupMXSB: 32 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: 0 + TailloopInNll: false + ThreadTile: [1, 1] + ThreadTile0: 56 + ThreadTile1: 2 + ThreadTileA: 56 + ThreadTileB: 2 + ThreadTileMXSA: 56 + ThreadTileMXSB: 2 + TransposeLDS: 1 + TransposeLDSMetadata: -1 + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMXSA: true + UnrollMajorLDSMXSB: true + Use64bShadowLimit: false + Use64bShadowLimitMX: false + UseCustomMainLoopSchedule: 0 + UseDirect32XEmulation: false + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: false + UseGeneralizedNLCOneA: false + UseGeneralizedNLCOneB: false + UseGeneralizedNLCOneMXSA: false + UseGeneralizedNLCOneMXSB: false + UseGeneralizedNLCOneMetadata: false + UseInstOffsetForGRO: 0 + UseMFMAF32XEmulation: false + UsePLRPack: 0 + UseSgprForGRO: -1 + UseSubtileImpl: true + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 1 + VectorWidthMXSA: 1 + VectorWidthMXSB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMXSA: 0 + WaveSeparateGlobalReadMXSB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 16 + WorkGroupMappingXCC: 2 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 256 + _DepthUA: 256 + _DepthUB: 256 + _DepthUMXSA: 8 + _DepthUMXSB: 8 + _DepthUMetadata: 256 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + enableLDSTrMXSA: false + enableLDSTrMXSB: false + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + tailLoopOptMXSA: false + tailLoopOptMXSB: false + - 1LDSBuffer: 0 + 
ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AdaptiveGemm: 0 + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1DivByMT1LowbitGT1: 0 + AssertFree1ElementMultiple: 1 + AssertKRingShiftTailWrapOnly: 0 + AssertSummationElementMultiple: 256 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BAddrInterleave: false + BaseName: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT12NCUb78G7QzN6ng2b3-8rphTRtJXo20S1KvmwH6k1FLQ= + BufferLoad: true + BufferStore: 1 + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 256 + DirectToLds: 1 + DirectToLdsA: true + DirectToLdsB: true + DirectToLdsMXSA: true + DirectToLdsMXSB: true + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprMXSA: false + DirectToVgprMXSB: false + DirectToVgprSparseMetadata: false + DtlPlusLdsBuf: 0 + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: false + ExpertSchedulingMode: 0 + ExtraLatencyForLR: 0 + ExtraMiLatencyLeft: -1 + ForceDisableShadowInit: false + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 32 + GlobalReadVectorWidthB: 32 + GlobalReadVectorWidthMXSA: 4 + GlobalReadVectorWidthMXSB: 14 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + KRingShift: false + Kernel: true + KernelLanguage: Assembly + KernelNameMin: 
Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT128x448x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT4_14_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 256 + LSCB: 256 + LSCMXSA: 8 + LSCMXSB: 8 + LSPA: 32 + LSPB: 32 + LSPMXSA: 128 + LSPMXSB: 448 + LVCA: 8 + LVCB: 8 + LVCMXSA: 2 + LVCMXSB: 1 + LVPA: 1 + LVPB: 1 + LVPMXSA: 32 + LVPMXSB: 32 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsBlockSizePerPadMXSA: 0 + LdsBlockSizePerPadMXSB: 0 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 156672 + LdsInitCVgprs: false + LdsNumBytes: 156672 + LdsNumElementsAlignedA: 16384 + LdsNumElementsAlignedB: 57344 + LdsNumElementsAlignedMXSA: 1024 + LdsNumElementsAlignedMXSB: 3584 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 78336 + LdsOffsetB: 20992 + LdsOffsetB_Blk: 99328 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMXSA: 16384 + LdsOffsetMXSA_Blk: 94720 + LdsOffsetMXSB: 17408 + LdsOffsetMXSB_Blk: 95744 + LdsOffsetMetadata: 20992 + LdsOffsetMetadata_Blk: 99328 + LdsPadA: 0 + LdsPadB: 0 + LdsPadMXSA: 0 + LdsPadMXSB: 0 + LdsPadMetadata: 0 + LocalReadVectorWidth: -1 + LocalReadVectorWidthA: 32 + LocalReadVectorWidthB: 32 + LocalReadVectorWidthMXSA: 1 + LocalReadVectorWidthMXSB: 1 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LocalWriteUseSgprMXSA: true + LocalWriteUseSgprMXSB: true + LoopIters: 2 + LoopUnroll: 256 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + 
MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMXSA: 1 + MIInputPerThreadMXSB: 1 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 14] + MIWaveTileA: 4 + MIWaveTileB: 14 + MIWaveTileMXSA: 4 + MIWaveTileMXSB: 14 + MIWaveTileMetadata: 0 + MacroTile0: 128 + MacroTile1: 448 + MacroTileA: 128 + MacroTileB: 448 + MacroTileMXSA: 128 + MacroTileMXSB: 448 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + MinGRIncPerMfma: 3 + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonDTLTailLoopMXSA: true + NonDTLTailLoopMXSB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 4 + NonTemporalE: 0 + NonTemporalMXSA: 0 + NonTemporalMXSB: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 224 + NumGlobalWriteVectorsPerThread: 224 + NumLdsBlk: 2 + NumLoadsA: 4 + NumLoadsB: 14 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsCoalescedMXSA: 1 + NumLoadsCoalescedMXSB: 1 + NumLoadsMXSA: 1 + NumLoadsMXSB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 14 + NumLoadsPerpendicularMXSA: 1 + NumLoadsPerpendicularMXSB: 1 + NumThreads: 256 + NumTotalPackedLoadsA: -1 + NumTotalPackedLoadsB: -1 + NumTotalPackedLoadsMXSA: -1 + NumTotalPackedLoadsMXSB: -1 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGROverBarrier: 0 + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 30 + 
SolutionNameMin: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT128x448x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT4_14_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM16_WGMXCC2_WGMXCCGn1 + SourceSwap: false + SpaceFillingAlgo: [] + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: true + StoreSyncOpt: 0 + StoreVectorWidth: 4 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SubGroupMXSA: 8 + SubGroupMXSB: 32 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: 0 + TailloopInNll: false + ThreadTile: [1, 1] + ThreadTile0: 16 + ThreadTile1: 14 + ThreadTileA: 16 + ThreadTileB: 14 + ThreadTileMXSA: 16 + ThreadTileMXSB: 14 + TransposeLDS: 1 + TransposeLDSMetadata: -1 + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMXSA: true + UnrollMajorLDSMXSB: true + Use64bShadowLimit: false + Use64bShadowLimitMX: false + UseCustomMainLoopSchedule: 0 + UseDirect32XEmulation: false + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: false + UseGeneralizedNLCOneA: false + UseGeneralizedNLCOneB: false + UseGeneralizedNLCOneMXSA: false + UseGeneralizedNLCOneMXSB: false + UseGeneralizedNLCOneMetadata: false + UseInstOffsetForGRO: 0 + UseMFMAF32XEmulation: false + UsePLRPack: 0 + UseSgprForGRO: -1 + UseSubtileImpl: true + Valid: true + VectorStore: -1 + VectorWidthA: 1 
+ VectorWidthB: 1 + VectorWidthMXSA: 1 + VectorWidthMXSB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMXSA: 0 + WaveSeparateGlobalReadMXSB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 16 + WorkGroupMappingXCC: 2 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 256 + _DepthUA: 256 + _DepthUB: 256 + _DepthUMXSA: 8 + _DepthUMXSB: 8 + _DepthUMetadata: 256 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + enableLDSTrMXSA: false + enableLDSTrMXSB: false + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + tailLoopOptMXSA: false + tailLoopOptMXSB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AdaptiveGemm: 0 + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1DivByMT1LowbitGT1: 0 + AssertFree1ElementMultiple: 1 + AssertKRingShiftTailWrapOnly: 0 + AssertSummationElementMultiple: 256 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BAddrInterleave: false + BaseName: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT44K2rcrzW2gkNghqcoHIxk3lU0xSXGNO6q0t-lzqdm54M= + BufferLoad: true + BufferStore: 1 + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 256 + DirectToLds: 1 + DirectToLdsA: true + DirectToLdsB: true + DirectToLdsMXSA: true + DirectToLdsMXSB: true + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprMXSA: false + DirectToVgprMXSB: false + DirectToVgprSparseMetadata: false 
+ DtlPlusLdsBuf: 0 + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: false + ExpertSchedulingMode: 0 + ExtraLatencyForLR: 0 + ExtraMiLatencyLeft: -1 + ForceDisableShadowInit: false + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 32 + GlobalReadVectorWidthB: 32 + GlobalReadVectorWidthMXSA: 14 + GlobalReadVectorWidthMXSB: 4 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + KRingShift: false + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT448x128x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT14_4_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 256 + LSCB: 256 + LSCMXSA: 8 + LSCMXSB: 8 + LSPA: 32 + LSPB: 32 + LSPMXSA: 448 + LSPMXSB: 128 + LVCA: 8 + LVCB: 8 + LVCMXSA: 1 + LVCMXSB: 2 + LVPA: 1 + LVPB: 1 + LVPMXSA: 32 + LVPMXSB: 32 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsBlockSizePerPadMXSA: 0 + LdsBlockSizePerPadMXSB: 0 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 156672 + LdsInitCVgprs: false + LdsNumBytes: 156672 + 
LdsNumElementsAlignedA: 57344 + LdsNumElementsAlignedB: 16384 + LdsNumElementsAlignedMXSA: 3584 + LdsNumElementsAlignedMXSB: 1024 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 78336 + LdsOffsetB: 61952 + LdsOffsetB_Blk: 140288 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMXSA: 57344 + LdsOffsetMXSA_Blk: 135680 + LdsOffsetMXSB: 60928 + LdsOffsetMXSB_Blk: 139264 + LdsOffsetMetadata: 61952 + LdsOffsetMetadata_Blk: 140288 + LdsPadA: 0 + LdsPadB: 0 + LdsPadMXSA: 0 + LdsPadMXSB: 0 + LdsPadMetadata: 0 + LocalReadVectorWidth: -1 + LocalReadVectorWidthA: 32 + LocalReadVectorWidthB: 32 + LocalReadVectorWidthMXSA: 1 + LocalReadVectorWidthMXSB: 1 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LocalWriteUseSgprMXSA: true + LocalWriteUseSgprMXSB: true + LoopIters: 2 + LoopUnroll: 256 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMXSA: 1 + MIInputPerThreadMXSB: 1 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [14, 4] + MIWaveTileA: 14 + MIWaveTileB: 4 + MIWaveTileMXSA: 14 + MIWaveTileMXSB: 4 + MIWaveTileMetadata: 0 + MacroTile0: 448 + MacroTile1: 128 + MacroTileA: 448 + MacroTileB: 128 + MacroTileMXSA: 448 + MacroTileMXSB: 128 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + MinGRIncPerMfma: 3 + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonDTLTailLoopMXSA: true + NonDTLTailLoopMXSB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 4 + 
NonTemporalE: 0 + NonTemporalMXSA: 0 + NonTemporalMXSB: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 224 + NumGlobalWriteVectorsPerThread: 224 + NumLdsBlk: 2 + NumLoadsA: 14 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsCoalescedMXSA: 1 + NumLoadsCoalescedMXSB: 1 + NumLoadsMXSA: 1 + NumLoadsMXSB: 1 + NumLoadsPerpendicularA: 14 + NumLoadsPerpendicularB: 4 + NumLoadsPerpendicularMXSA: 1 + NumLoadsPerpendicularMXSB: 1 + NumThreads: 256 + NumTotalPackedLoadsA: -1 + NumTotalPackedLoadsB: -1 + NumTotalPackedLoadsMXSA: -1 + NumTotalPackedLoadsMXSB: -1 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGROverBarrier: 0 + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 31 + SolutionNameMin: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT448x128x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT14_4_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM16_WGMXCC2_WGMXCCGn1 + SourceSwap: false + SpaceFillingAlgo: [] + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: true + StoreSyncOpt: 0 + StoreVectorWidth: 4 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + 
SubGroupB: 32 + SubGroupMXSA: 8 + SubGroupMXSB: 32 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: 0 + TailloopInNll: false + ThreadTile: [1, 1] + ThreadTile0: 56 + ThreadTile1: 4 + ThreadTileA: 56 + ThreadTileB: 4 + ThreadTileMXSA: 56 + ThreadTileMXSB: 4 + TransposeLDS: 1 + TransposeLDSMetadata: -1 + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMXSA: true + UnrollMajorLDSMXSB: true + Use64bShadowLimit: false + Use64bShadowLimitMX: false + UseCustomMainLoopSchedule: 0 + UseDirect32XEmulation: false + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: false + UseGeneralizedNLCOneA: false + UseGeneralizedNLCOneB: false + UseGeneralizedNLCOneMXSA: false + UseGeneralizedNLCOneMXSB: false + UseGeneralizedNLCOneMetadata: false + UseInstOffsetForGRO: 0 + UseMFMAF32XEmulation: false + UsePLRPack: 0 + UseSgprForGRO: -1 + UseSubtileImpl: true + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 1 + VectorWidthMXSA: 1 + VectorWidthMXSB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMXSA: 0 + WaveSeparateGlobalReadMXSB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 16 + WorkGroupMappingXCC: 2 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 256 + _DepthUA: 256 + _DepthUB: 256 + _DepthUMXSA: 8 + _DepthUMXSB: 8 + _DepthUMetadata: 256 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + enableLDSTrMXSA: false + enableLDSTrMXSB: false + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + tailLoopOptMXSA: false + tailLoopOptMXSB: false + - 1LDSBuffer: 
0 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AdaptiveGemm: 0 + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1DivByMT1LowbitGT1: 0 + AssertFree1ElementMultiple: 1 + AssertKRingShiftTailWrapOnly: 0 + AssertSummationElementMultiple: 256 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BAddrInterleave: false + BaseName: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT197rqXtfRTrgckooWIHV9QRZkAUvBYDajyR7fyXPSaOfI= + BufferLoad: true + BufferStore: 1 + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 256 + DirectToLds: 1 + DirectToLdsA: true + DirectToLdsB: true + DirectToLdsMXSA: true + DirectToLdsMXSB: true + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprMXSA: false + DirectToVgprMXSB: false + DirectToVgprSparseMetadata: false + DtlPlusLdsBuf: 0 + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: false + ExpertSchedulingMode: 0 + ExtraLatencyForLR: 0 + ExtraMiLatencyLeft: -1 + ForceDisableShadowInit: false + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 32 + GlobalReadVectorWidthB: 32 + GlobalReadVectorWidthMXSA: 6 + GlobalReadVectorWidthMXSB: 12 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + KRingShift: false + Kernel: true + KernelLanguage: Assembly + KernelNameMin: 
Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT192x384x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT6_12_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 256 + LSCB: 256 + LSCMXSA: 8 + LSCMXSB: 8 + LSPA: 32 + LSPB: 32 + LSPMXSA: 192 + LSPMXSB: 384 + LVCA: 8 + LVCB: 8 + LVCMXSA: 2 + LVCMXSB: 1 + LVPA: 1 + LVPB: 1 + LVPMXSA: 32 + LVPMXSB: 32 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsBlockSizePerPadMXSA: 0 + LdsBlockSizePerPadMXSB: 0 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 156672 + LdsInitCVgprs: false + LdsNumBytes: 156672 + LdsNumElementsAlignedA: 24576 + LdsNumElementsAlignedB: 49152 + LdsNumElementsAlignedMXSA: 1536 + LdsNumElementsAlignedMXSB: 3072 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 78336 + LdsOffsetB: 29184 + LdsOffsetB_Blk: 107520 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMXSA: 24576 + LdsOffsetMXSA_Blk: 102912 + LdsOffsetMXSB: 26112 + LdsOffsetMXSB_Blk: 104448 + LdsOffsetMetadata: 29184 + LdsOffsetMetadata_Blk: 107520 + LdsPadA: 0 + LdsPadB: 0 + LdsPadMXSA: 0 + LdsPadMXSB: 0 + LdsPadMetadata: 0 + LocalReadVectorWidth: -1 + LocalReadVectorWidthA: 32 + LocalReadVectorWidthB: 32 + LocalReadVectorWidthMXSA: 1 + LocalReadVectorWidthMXSB: 1 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LocalWriteUseSgprMXSA: true + LocalWriteUseSgprMXSB: true + LoopIters: 2 + LoopUnroll: 256 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + 
MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMXSA: 1 + MIInputPerThreadMXSB: 1 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [6, 12] + MIWaveTileA: 6 + MIWaveTileB: 12 + MIWaveTileMXSA: 6 + MIWaveTileMXSB: 12 + MIWaveTileMetadata: 0 + MacroTile0: 192 + MacroTile1: 384 + MacroTileA: 192 + MacroTileB: 384 + MacroTileMXSA: 192 + MacroTileMXSB: 384 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + MinGRIncPerMfma: 3 + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonDTLTailLoopMXSA: true + NonDTLTailLoopMXSB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 4 + NonTemporalE: 0 + NonTemporalMXSA: 0 + NonTemporalMXSB: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 288 + NumGlobalWriteVectorsPerThread: 288 + NumLdsBlk: 2 + NumLoadsA: 6 + NumLoadsB: 12 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsCoalescedMXSA: 1 + NumLoadsCoalescedMXSB: 1 + NumLoadsMXSA: 1 + NumLoadsMXSB: 1 + NumLoadsPerpendicularA: 6 + NumLoadsPerpendicularB: 12 + NumLoadsPerpendicularMXSA: 1 + NumLoadsPerpendicularMXSB: 1 + NumThreads: 256 + NumTotalPackedLoadsA: -1 + NumTotalPackedLoadsB: -1 + NumTotalPackedLoadsMXSA: -1 + NumTotalPackedLoadsMXSB: -1 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGROverBarrier: 0 + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 32 + 
SolutionNameMin: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT192x384x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT6_12_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM16_WGMXCC2_WGMXCCGn1 + SourceSwap: false + SpaceFillingAlgo: [] + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: true + StoreSyncOpt: 0 + StoreVectorWidth: 4 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SubGroupMXSA: 8 + SubGroupMXSB: 32 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: 0 + TailloopInNll: false + ThreadTile: [1, 1] + ThreadTile0: 24 + ThreadTile1: 12 + ThreadTileA: 24 + ThreadTileB: 12 + ThreadTileMXSA: 24 + ThreadTileMXSB: 12 + TransposeLDS: 1 + TransposeLDSMetadata: -1 + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMXSA: true + UnrollMajorLDSMXSB: true + Use64bShadowLimit: false + Use64bShadowLimitMX: false + UseCustomMainLoopSchedule: 0 + UseDirect32XEmulation: false + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: false + UseGeneralizedNLCOneA: false + UseGeneralizedNLCOneB: false + UseGeneralizedNLCOneMXSA: false + UseGeneralizedNLCOneMXSB: false + UseGeneralizedNLCOneMetadata: false + UseInstOffsetForGRO: 0 + UseMFMAF32XEmulation: false + UsePLRPack: 0 + UseSgprForGRO: -1 + UseSubtileImpl: true + Valid: true + VectorStore: -1 + VectorWidthA: 1 
+ VectorWidthB: 1 + VectorWidthMXSA: 1 + VectorWidthMXSB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMXSA: 0 + WaveSeparateGlobalReadMXSB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 16 + WorkGroupMappingXCC: 2 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 256 + _DepthUA: 256 + _DepthUB: 256 + _DepthUMXSA: 8 + _DepthUMXSB: 8 + _DepthUMetadata: 256 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + enableLDSTrMXSA: false + enableLDSTrMXSB: false + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + tailLoopOptMXSA: false + tailLoopOptMXSB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AdaptiveGemm: 0 + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1DivByMT1LowbitGT1: 0 + AssertFree1ElementMultiple: 1 + AssertKRingShiftTailWrapOnly: 0 + AssertSummationElementMultiple: 256 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BAddrInterleave: false + BaseName: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT38M4Hg58ewW76BBGjsIKCCfd9HK3CXsd22WqnddQrMdIo= + BufferLoad: true + BufferStore: 1 + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 256 + DirectToLds: 1 + DirectToLdsA: true + DirectToLdsB: true + DirectToLdsMXSA: true + DirectToLdsMXSB: true + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprMXSA: false + DirectToVgprMXSB: false + DirectToVgprSparseMetadata: false 
+ DtlPlusLdsBuf: 0 + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: false + ExpertSchedulingMode: 0 + ExtraLatencyForLR: 0 + ExtraMiLatencyLeft: -1 + ForceDisableShadowInit: false + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 32 + GlobalReadVectorWidthB: 32 + GlobalReadVectorWidthMXSA: 12 + GlobalReadVectorWidthMXSB: 6 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + KRingShift: false + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT384x192x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT12_6_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 256 + LSCB: 256 + LSCMXSA: 8 + LSCMXSB: 8 + LSPA: 32 + LSPB: 32 + LSPMXSA: 384 + LSPMXSB: 192 + LVCA: 8 + LVCB: 8 + LVCMXSA: 1 + LVCMXSB: 2 + LVPA: 1 + LVPB: 1 + LVPMXSA: 32 + LVPMXSB: 32 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsBlockSizePerPadMXSA: 0 + LdsBlockSizePerPadMXSB: 0 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 156672 + LdsInitCVgprs: false + LdsNumBytes: 156672 + 
LdsNumElementsAlignedA: 49152 + LdsNumElementsAlignedB: 24576 + LdsNumElementsAlignedMXSA: 3072 + LdsNumElementsAlignedMXSB: 1536 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 78336 + LdsOffsetB: 53760 + LdsOffsetB_Blk: 132096 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMXSA: 49152 + LdsOffsetMXSA_Blk: 127488 + LdsOffsetMXSB: 52224 + LdsOffsetMXSB_Blk: 130560 + LdsOffsetMetadata: 53760 + LdsOffsetMetadata_Blk: 132096 + LdsPadA: 0 + LdsPadB: 0 + LdsPadMXSA: 0 + LdsPadMXSB: 0 + LdsPadMetadata: 0 + LocalReadVectorWidth: -1 + LocalReadVectorWidthA: 32 + LocalReadVectorWidthB: 32 + LocalReadVectorWidthMXSA: 1 + LocalReadVectorWidthMXSB: 1 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LocalWriteUseSgprMXSA: true + LocalWriteUseSgprMXSB: true + LoopIters: 2 + LoopUnroll: 256 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMXSA: 1 + MIInputPerThreadMXSB: 1 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [12, 6] + MIWaveTileA: 12 + MIWaveTileB: 6 + MIWaveTileMXSA: 12 + MIWaveTileMXSB: 6 + MIWaveTileMetadata: 0 + MacroTile0: 384 + MacroTile1: 192 + MacroTileA: 384 + MacroTileB: 192 + MacroTileMXSA: 384 + MacroTileMXSB: 192 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + MinGRIncPerMfma: 3 + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonDTLTailLoopMXSA: true + NonDTLTailLoopMXSB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 4 + 
NonTemporalE: 0 + NonTemporalMXSA: 0 + NonTemporalMXSB: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 288 + NumGlobalWriteVectorsPerThread: 288 + NumLdsBlk: 2 + NumLoadsA: 12 + NumLoadsB: 6 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsCoalescedMXSA: 1 + NumLoadsCoalescedMXSB: 1 + NumLoadsMXSA: 1 + NumLoadsMXSB: 1 + NumLoadsPerpendicularA: 12 + NumLoadsPerpendicularB: 6 + NumLoadsPerpendicularMXSA: 1 + NumLoadsPerpendicularMXSB: 1 + NumThreads: 256 + NumTotalPackedLoadsA: -1 + NumTotalPackedLoadsB: -1 + NumTotalPackedLoadsMXSA: -1 + NumTotalPackedLoadsMXSB: -1 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGROverBarrier: 0 + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 33 + SolutionNameMin: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT384x192x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT12_6_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM16_WGMXCC2_WGMXCCGn1 + SourceSwap: false + SpaceFillingAlgo: [] + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: true + StoreSyncOpt: 0 + StoreVectorWidth: 4 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + 
SubGroupB: 32 + SubGroupMXSA: 8 + SubGroupMXSB: 32 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: 0 + TailloopInNll: false + ThreadTile: [1, 1] + ThreadTile0: 48 + ThreadTile1: 6 + ThreadTileA: 48 + ThreadTileB: 6 + ThreadTileMXSA: 48 + ThreadTileMXSB: 6 + TransposeLDS: 1 + TransposeLDSMetadata: -1 + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMXSA: true + UnrollMajorLDSMXSB: true + Use64bShadowLimit: false + Use64bShadowLimitMX: false + UseCustomMainLoopSchedule: 0 + UseDirect32XEmulation: false + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: false + UseGeneralizedNLCOneA: false + UseGeneralizedNLCOneB: false + UseGeneralizedNLCOneMXSA: false + UseGeneralizedNLCOneMXSB: false + UseGeneralizedNLCOneMetadata: false + UseInstOffsetForGRO: 0 + UseMFMAF32XEmulation: false + UsePLRPack: 0 + UseSgprForGRO: -1 + UseSubtileImpl: true + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 1 + VectorWidthMXSA: 1 + VectorWidthMXSB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMXSA: 0 + WaveSeparateGlobalReadMXSB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 16 + WorkGroupMappingXCC: 2 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 256 + _DepthUA: 256 + _DepthUB: 256 + _DepthUMXSA: 8 + _DepthUMXSB: 8 + _DepthUMetadata: 256 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + enableLDSTrMXSA: false + enableLDSTrMXSB: false + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + tailLoopOptMXSA: false + tailLoopOptMXSB: false + - 1LDSBuffer: 
0 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AdaptiveGemm: 0 + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1DivByMT1LowbitGT1: 0 + AssertFree1ElementMultiple: 1 + AssertKRingShiftTailWrapOnly: 0 + AssertSummationElementMultiple: 256 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BAddrInterleave: false + BaseName: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT12pSBqfd5dzj78eX1t5EQJzYZ5aHFVtQ9uJHS361sw3qY= + BufferLoad: true + BufferStore: 1 + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 256 + DirectToLds: 1 + DirectToLdsA: true + DirectToLdsB: true + DirectToLdsMXSA: true + DirectToLdsMXSB: true + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprMXSA: false + DirectToVgprMXSB: false + DirectToVgprSparseMetadata: false + DtlPlusLdsBuf: 0 + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: false + ExpertSchedulingMode: 0 + ExtraLatencyForLR: 0 + ExtraMiLatencyLeft: -1 + ForceDisableShadowInit: false + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 32 + GlobalReadVectorWidthB: 32 + GlobalReadVectorWidthMXSA: 4 + GlobalReadVectorWidthMXSB: 1 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + KRingShift: false + Kernel: true + KernelLanguage: Assembly + KernelNameMin: 
Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT128x32x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT2_2_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1 + LDSTrInst: false + LSCA: 256 + LSCB: 256 + LSCMXSA: 8 + LSCMXSB: 8 + LSPA: 32 + LSPB: 32 + LSPMXSA: 128 + LSPMXSB: 32 + LVCA: 8 + LVCB: 8 + LVCMXSA: 2 + LVCMXSB: 8 + LVPA: 1 + LVPB: 1 + LVPMXSA: 32 + LVPMXSB: 32 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsBlockSizePerPadMXSA: 0 + LdsBlockSizePerPadMXSB: 0 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 54528 + LdsInitCVgprs: false + LdsNumBytes: 54528 + LdsNumElementsAlignedA: 16384 + LdsNumElementsAlignedB: 4096 + LdsNumElementsAlignedMXSA: 1024 + LdsNumElementsAlignedMXSB: 256 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 32768 + LdsOffsetB: 17664 + LdsOffsetB_Blk: 50432 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMXSA: 16384 + LdsOffsetMXSA_Blk: 49152 + LdsOffsetMXSB: 17408 + LdsOffsetMXSB_Blk: 50176 + LdsOffsetMetadata: 17664 + LdsOffsetMetadata_Blk: 50432 + LdsPadA: 0 + LdsPadB: 0 + LdsPadMXSA: 0 + LdsPadMXSB: 0 + LdsPadMetadata: 0 + LocalReadVectorWidth: -1 + LocalReadVectorWidthA: 32 + LocalReadVectorWidthB: 32 + LocalReadVectorWidthMXSA: 1 + LocalReadVectorWidthMXSB: 1 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LocalWriteUseSgprMXSA: true + LocalWriteUseSgprMXSB: true + LoopIters: 2 + LoopUnroll: 256 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + 
MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMXSA: 1 + MIInputPerThreadMXSB: 1 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [4, 1] + MIWaveTile: [2, 2] + MIWaveTileA: 2 + MIWaveTileB: 2 + MIWaveTileMXSA: 2 + MIWaveTileMXSB: 2 + MIWaveTileMetadata: 0 + MacroTile0: 128 + MacroTile1: 32 + MacroTileA: 128 + MacroTileB: 32 + MacroTileMXSA: 128 + MacroTileMXSB: 32 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + MinGRIncPerMfma: 3 + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonDTLTailLoopMXSA: true + NonDTLTailLoopMXSB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 4 + NonTemporalE: 0 + NonTemporalMXSA: 0 + NonTemporalMXSB: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 16 + NumLdsBlk: 2 + NumLoadsA: 4 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsCoalescedMXSA: 1 + NumLoadsCoalescedMXSB: 1 + NumLoadsMXSA: 1 + NumLoadsMXSB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularMXSA: 1 + NumLoadsPerpendicularMXSB: 1 + NumThreads: 256 + NumTotalPackedLoadsA: -1 + NumTotalPackedLoadsB: -1 + NumTotalPackedLoadsMXSA: -1 + NumTotalPackedLoadsMXSB: -1 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGROverBarrier: 0 + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 34 + SolutionNameMin: 
Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT128x32x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT2_2_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM16_WGMXCC2_WGMXCCGn1 + SourceSwap: false + SpaceFillingAlgo: [] + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 4 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SubGroupMXSA: 16 + SubGroupMXSB: 16 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: 0 + TailloopInNll: false + ThreadTile: [1, 1] + ThreadTile0: 8 + ThreadTile1: 2 + ThreadTileA: 8 + ThreadTileB: 2 + ThreadTileMXSA: 8 + ThreadTileMXSB: 2 + TransposeLDS: 1 + TransposeLDSMetadata: -1 + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMXSA: true + UnrollMajorLDSMXSB: true + Use64bShadowLimit: false + Use64bShadowLimitMX: false + UseCustomMainLoopSchedule: 0 + UseDirect32XEmulation: false + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: false + UseGeneralizedNLCOneA: false + UseGeneralizedNLCOneB: false + UseGeneralizedNLCOneMXSA: false + UseGeneralizedNLCOneMXSB: false + UseGeneralizedNLCOneMetadata: false + UseInstOffsetForGRO: 0 + UseMFMAF32XEmulation: false + UsePLRPack: 0 + UseSgprForGRO: -1 + UseSubtileImpl: true + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 1 + 
VectorWidthMXSA: 1 + VectorWidthMXSB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMXSA: 0 + WaveSeparateGlobalReadMXSB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [64, 4, 1] + WorkGroupMapping: 16 + WorkGroupMappingXCC: 2 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 256 + _DepthUA: 256 + _DepthUB: 256 + _DepthUMXSA: 8 + _DepthUMXSB: 8 + _DepthUMetadata: 256 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + enableLDSTrMXSA: false + enableLDSTrMXSB: false + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + tailLoopOptMXSA: false + tailLoopOptMXSB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AdaptiveGemm: 0 + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1DivByMT1LowbitGT1: 0 + AssertFree1ElementMultiple: 1 + AssertKRingShiftTailWrapOnly: 0 + AssertSummationElementMultiple: 256 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BAddrInterleave: false + BaseName: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT12lYJO4P9-MC57Obi2kvcZAOwe_1ysdtWgX7-VIBNgQpw= + BufferLoad: true + BufferStore: 1 + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 256 + DirectToLds: 1 + DirectToLdsA: true + DirectToLdsB: true + DirectToLdsMXSA: true + DirectToLdsMXSB: true + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprMXSA: false + DirectToVgprMXSB: false + DirectToVgprSparseMetadata: false + DtlPlusLdsBuf: 0 
+ EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: false + ExpertSchedulingMode: 0 + ExtraLatencyForLR: 0 + ExtraMiLatencyLeft: -1 + ForceDisableShadowInit: false + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 32 + GlobalReadVectorWidthB: 32 + GlobalReadVectorWidthMXSA: 4 + GlobalReadVectorWidthMXSB: 3 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + KRingShift: false + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT128x96x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT2_6_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1 + LDSTrInst: false + LSCA: 256 + LSCB: 256 + LSCMXSA: 8 + LSCMXSB: 8 + LSPA: 32 + LSPB: 32 + LSPMXSA: 128 + LSPMXSB: 96 + LVCA: 8 + LVCB: 8 + LVCMXSA: 2 + LVCMXSB: 3 + LVPA: 1 + LVPB: 1 + LVPMXSA: 32 + LVPMXSB: 32 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsBlockSizePerPadMXSA: 0 + LdsBlockSizePerPadMXSB: 0 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 63232 + LdsInitCVgprs: false + LdsNumBytes: 63232 + LdsNumElementsAlignedA: 16384 + 
LdsNumElementsAlignedB: 12288 + LdsNumElementsAlignedMXSA: 1024 + LdsNumElementsAlignedMXSB: 768 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 32768 + LdsOffsetB: 18176 + LdsOffsetB_Blk: 50944 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMXSA: 16384 + LdsOffsetMXSA_Blk: 49152 + LdsOffsetMXSB: 17408 + LdsOffsetMXSB_Blk: 50176 + LdsOffsetMetadata: 18176 + LdsOffsetMetadata_Blk: 50944 + LdsPadA: 0 + LdsPadB: 0 + LdsPadMXSA: 0 + LdsPadMXSB: 0 + LdsPadMetadata: 0 + LocalReadVectorWidth: -1 + LocalReadVectorWidthA: 32 + LocalReadVectorWidthB: 32 + LocalReadVectorWidthMXSA: 1 + LocalReadVectorWidthMXSB: 1 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LocalWriteUseSgprMXSA: true + LocalWriteUseSgprMXSB: true + LoopIters: 2 + LoopUnroll: 256 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMXSA: 1 + MIInputPerThreadMXSB: 1 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [4, 1] + MIWaveTile: [2, 6] + MIWaveTileA: 2 + MIWaveTileB: 6 + MIWaveTileMXSA: 2 + MIWaveTileMXSB: 6 + MIWaveTileMetadata: 0 + MacroTile0: 128 + MacroTile1: 96 + MacroTileA: 128 + MacroTileB: 96 + MacroTileMXSA: 128 + MacroTileMXSB: 96 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + MinGRIncPerMfma: 3 + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonDTLTailLoopMXSA: true + NonDTLTailLoopMXSB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 4 + NonTemporalE: 0 + NonTemporalMXSA: 0 + 
NonTemporalMXSB: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 48 + NumGlobalWriteVectorsPerThread: 48 + NumLdsBlk: 2 + NumLoadsA: 4 + NumLoadsB: 3 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsCoalescedMXSA: 1 + NumLoadsCoalescedMXSB: 1 + NumLoadsMXSA: 1 + NumLoadsMXSB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 3 + NumLoadsPerpendicularMXSA: 1 + NumLoadsPerpendicularMXSB: 1 + NumThreads: 256 + NumTotalPackedLoadsA: -1 + NumTotalPackedLoadsB: -1 + NumTotalPackedLoadsMXSA: -1 + NumTotalPackedLoadsMXSB: -1 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGROverBarrier: 0 + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 35 + SolutionNameMin: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT128x96x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT2_6_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM16_WGMXCC2_WGMXCCGn1 + SourceSwap: false + SpaceFillingAlgo: [] + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 4 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SubGroupMXSA: 16 + SubGroupMXSB: 16 
+ SuppressNoLoadLoop: false + SwapGlobalReadOrder: 0 + TailloopInNll: false + ThreadTile: [1, 1] + ThreadTile0: 8 + ThreadTile1: 6 + ThreadTileA: 8 + ThreadTileB: 6 + ThreadTileMXSA: 8 + ThreadTileMXSB: 6 + TransposeLDS: 1 + TransposeLDSMetadata: -1 + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMXSA: true + UnrollMajorLDSMXSB: true + Use64bShadowLimit: false + Use64bShadowLimitMX: false + UseCustomMainLoopSchedule: 0 + UseDirect32XEmulation: false + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: false + UseGeneralizedNLCOneA: false + UseGeneralizedNLCOneB: false + UseGeneralizedNLCOneMXSA: false + UseGeneralizedNLCOneMXSB: false + UseGeneralizedNLCOneMetadata: false + UseInstOffsetForGRO: 0 + UseMFMAF32XEmulation: false + UsePLRPack: 0 + UseSgprForGRO: -1 + UseSubtileImpl: true + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 1 + VectorWidthMXSA: 1 + VectorWidthMXSB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMXSA: 0 + WaveSeparateGlobalReadMXSB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [64, 4, 1] + WorkGroupMapping: 16 + WorkGroupMappingXCC: 2 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 256 + _DepthUA: 256 + _DepthUB: 256 + _DepthUMXSA: 8 + _DepthUMXSB: 8 + _DepthUMetadata: 256 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + enableLDSTrMXSA: false + enableLDSTrMXSB: false + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + tailLoopOptMXSA: false + tailLoopOptMXSB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: false + 
ActivationFused: true + AdaptiveGemm: 0 + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1DivByMT1LowbitGT1: 0 + AssertFree1ElementMultiple: 1 + AssertKRingShiftTailWrapOnly: 0 + AssertSummationElementMultiple: 256 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BAddrInterleave: false + BaseName: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT25ccxCqQ2dvGclrJYn0QqExpGJSq56GiHUJPL0jzB0qVw= + BufferLoad: true + BufferStore: 1 + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 256 + DirectToLds: 1 + DirectToLdsA: true + DirectToLdsB: true + DirectToLdsMXSA: true + DirectToLdsMXSB: true + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprMXSA: false + DirectToVgprMXSB: false + DirectToVgprSparseMetadata: false + DtlPlusLdsBuf: 0 + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: false + ExpertSchedulingMode: 0 + ExtraLatencyForLR: 0 + ExtraMiLatencyLeft: -1 + ForceDisableShadowInit: false + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 32 + GlobalReadVectorWidthB: 32 + GlobalReadVectorWidthMXSA: 4 + GlobalReadVectorWidthMXSB: 1 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + KRingShift: false + Kernel: true + KernelLanguage: Assembly + KernelNameMin: 
Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT256x32x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT4_2_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1 + LDSTrInst: false + LSCA: 256 + LSCB: 256 + LSCMXSA: 8 + LSCMXSB: 8 + LSPA: 32 + LSPB: 32 + LSPMXSA: 128 + LSPMXSB: 32 + LVCA: 8 + LVCB: 8 + LVCMXSA: 2 + LVCMXSB: 8 + LVPA: 1 + LVPB: 1 + LVPMXSA: 32 + LVPMXSB: 32 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsBlockSizePerPadMXSA: 0 + LdsBlockSizePerPadMXSB: 0 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 104704 + LdsInitCVgprs: false + LdsNumBytes: 104704 + LdsNumElementsAlignedA: 32768 + LdsNumElementsAlignedB: 4096 + LdsNumElementsAlignedMXSA: 2048 + LdsNumElementsAlignedMXSB: 256 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 35072 + LdsOffsetB_Blk: 100608 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMXSA: 32768 + LdsOffsetMXSA_Blk: 98304 + LdsOffsetMXSB: 34816 + LdsOffsetMXSB_Blk: 100352 + LdsOffsetMetadata: 35072 + LdsOffsetMetadata_Blk: 100608 + LdsPadA: 0 + LdsPadB: 0 + LdsPadMXSA: 0 + LdsPadMXSB: 0 + LdsPadMetadata: 0 + LocalReadVectorWidth: -1 + LocalReadVectorWidthA: 32 + LocalReadVectorWidthB: 32 + LocalReadVectorWidthMXSA: 1 + LocalReadVectorWidthMXSB: 1 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LocalWriteUseSgprMXSA: true + LocalWriteUseSgprMXSB: true + LoopIters: 2 + LoopUnroll: 256 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + 
MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMXSA: 1 + MIInputPerThreadMXSB: 1 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [4, 1] + MIWaveTile: [4, 2] + MIWaveTileA: 4 + MIWaveTileB: 2 + MIWaveTileMXSA: 4 + MIWaveTileMXSB: 2 + MIWaveTileMetadata: 0 + MacroTile0: 256 + MacroTile1: 32 + MacroTileA: 256 + MacroTileB: 32 + MacroTileMXSA: 256 + MacroTileMXSB: 32 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + MinGRIncPerMfma: 3 + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonDTLTailLoopMXSA: true + NonDTLTailLoopMXSB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 4 + NonTemporalE: 0 + NonTemporalMXSA: 0 + NonTemporalMXSB: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 32 + NumLdsBlk: 2 + NumLoadsA: 8 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsCoalescedMXSA: 1 + NumLoadsCoalescedMXSB: 1 + NumLoadsMXSA: 2 + NumLoadsMXSB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularMXSA: 2 + NumLoadsPerpendicularMXSB: 1 + NumThreads: 256 + NumTotalPackedLoadsA: -1 + NumTotalPackedLoadsB: -1 + NumTotalPackedLoadsMXSA: -1 + NumTotalPackedLoadsMXSB: -1 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGROverBarrier: 0 + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 36 + SolutionNameMin: 
Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT256x32x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT4_2_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM16_WGMXCC2_WGMXCCGn1 + SourceSwap: false + SpaceFillingAlgo: [] + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 4 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SubGroupMXSA: 16 + SubGroupMXSB: 16 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: 0 + TailloopInNll: false + ThreadTile: [1, 1] + ThreadTile0: 16 + ThreadTile1: 2 + ThreadTileA: 16 + ThreadTileB: 2 + ThreadTileMXSA: 16 + ThreadTileMXSB: 2 + TransposeLDS: 1 + TransposeLDSMetadata: -1 + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMXSA: true + UnrollMajorLDSMXSB: true + Use64bShadowLimit: false + Use64bShadowLimitMX: false + UseCustomMainLoopSchedule: 0 + UseDirect32XEmulation: false + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: false + UseGeneralizedNLCOneA: false + UseGeneralizedNLCOneB: false + UseGeneralizedNLCOneMXSA: false + UseGeneralizedNLCOneMXSB: false + UseGeneralizedNLCOneMetadata: false + UseInstOffsetForGRO: 0 + UseMFMAF32XEmulation: false + UsePLRPack: 0 + UseSgprForGRO: -1 + UseSubtileImpl: true + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 1 
+ VectorWidthMXSA: 1 + VectorWidthMXSB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMXSA: 0 + WaveSeparateGlobalReadMXSB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [64, 4, 1] + WorkGroupMapping: 16 + WorkGroupMappingXCC: 2 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 256 + _DepthUA: 256 + _DepthUB: 256 + _DepthUMXSA: 8 + _DepthUMXSB: 8 + _DepthUMetadata: 256 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + enableLDSTrMXSA: false + enableLDSTrMXSB: false + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + tailLoopOptMXSA: false + tailLoopOptMXSB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AdaptiveGemm: 0 + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1DivByMT1LowbitGT1: 0 + AssertFree1ElementMultiple: 1 + AssertKRingShiftTailWrapOnly: 0 + AssertSummationElementMultiple: 256 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BAddrInterleave: false + BaseName: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT25LD47Rd1FCRkyTzzAeZkhMOPqncN77I3VtqjftUOo3W0= + BufferLoad: true + BufferStore: 1 + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 256 + DirectToLds: 1 + DirectToLdsA: true + DirectToLdsB: true + DirectToLdsMXSA: true + DirectToLdsMXSB: true + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprMXSA: false + DirectToVgprMXSB: false + DirectToVgprSparseMetadata: false + DtlPlusLdsBuf: 
0 + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: false + ExpertSchedulingMode: 0 + ExtraLatencyForLR: 0 + ExtraMiLatencyLeft: -1 + ForceDisableShadowInit: false + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 32 + GlobalReadVectorWidthB: 32 + GlobalReadVectorWidthMXSA: 4 + GlobalReadVectorWidthMXSB: 3 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + KRingShift: false + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT256x96x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT4_6_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1 + LDSTrInst: false + LSCA: 256 + LSCB: 256 + LSCMXSA: 8 + LSCMXSB: 8 + LSPA: 32 + LSPB: 32 + LSPMXSA: 128 + LSPMXSB: 96 + LVCA: 8 + LVCB: 8 + LVCMXSA: 2 + LVCMXSB: 3 + LVPA: 1 + LVPB: 1 + LVPMXSA: 32 + LVPMXSB: 32 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsBlockSizePerPadMXSA: 0 + LdsBlockSizePerPadMXSB: 0 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 113408 + LdsInitCVgprs: false + LdsNumBytes: 113408 + LdsNumElementsAlignedA: 32768 + 
LdsNumElementsAlignedB: 12288 + LdsNumElementsAlignedMXSA: 2048 + LdsNumElementsAlignedMXSB: 768 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 35584 + LdsOffsetB_Blk: 101120 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMXSA: 32768 + LdsOffsetMXSA_Blk: 98304 + LdsOffsetMXSB: 34816 + LdsOffsetMXSB_Blk: 100352 + LdsOffsetMetadata: 35584 + LdsOffsetMetadata_Blk: 101120 + LdsPadA: 0 + LdsPadB: 0 + LdsPadMXSA: 0 + LdsPadMXSB: 0 + LdsPadMetadata: 0 + LocalReadVectorWidth: -1 + LocalReadVectorWidthA: 32 + LocalReadVectorWidthB: 32 + LocalReadVectorWidthMXSA: 1 + LocalReadVectorWidthMXSB: 1 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LocalWriteUseSgprMXSA: true + LocalWriteUseSgprMXSB: true + LoopIters: 2 + LoopUnroll: 256 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMXSA: 1 + MIInputPerThreadMXSB: 1 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [4, 1] + MIWaveTile: [4, 6] + MIWaveTileA: 4 + MIWaveTileB: 6 + MIWaveTileMXSA: 4 + MIWaveTileMXSB: 6 + MIWaveTileMetadata: 0 + MacroTile0: 256 + MacroTile1: 96 + MacroTileA: 256 + MacroTileB: 96 + MacroTileMXSA: 256 + MacroTileMXSB: 96 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + MinGRIncPerMfma: 3 + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonDTLTailLoopMXSA: true + NonDTLTailLoopMXSB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 4 + NonTemporalE: 0 + NonTemporalMXSA: 0 + 
NonTemporalMXSB: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 96 + NumGlobalWriteVectorsPerThread: 96 + NumLdsBlk: 2 + NumLoadsA: 8 + NumLoadsB: 3 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsCoalescedMXSA: 1 + NumLoadsCoalescedMXSB: 1 + NumLoadsMXSA: 2 + NumLoadsMXSB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 3 + NumLoadsPerpendicularMXSA: 2 + NumLoadsPerpendicularMXSB: 1 + NumThreads: 256 + NumTotalPackedLoadsA: -1 + NumTotalPackedLoadsB: -1 + NumTotalPackedLoadsMXSA: -1 + NumTotalPackedLoadsMXSB: -1 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGROverBarrier: 0 + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 37 + SolutionNameMin: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT256x96x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT4_6_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM16_WGMXCC2_WGMXCCGn1 + SourceSwap: false + SpaceFillingAlgo: [] + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 4 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SubGroupMXSA: 16 + SubGroupMXSB: 16 
+ SuppressNoLoadLoop: false + SwapGlobalReadOrder: 0 + TailloopInNll: false + ThreadTile: [1, 1] + ThreadTile0: 16 + ThreadTile1: 6 + ThreadTileA: 16 + ThreadTileB: 6 + ThreadTileMXSA: 16 + ThreadTileMXSB: 6 + TransposeLDS: 1 + TransposeLDSMetadata: -1 + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMXSA: true + UnrollMajorLDSMXSB: true + Use64bShadowLimit: false + Use64bShadowLimitMX: false + UseCustomMainLoopSchedule: 0 + UseDirect32XEmulation: false + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: false + UseGeneralizedNLCOneA: false + UseGeneralizedNLCOneB: false + UseGeneralizedNLCOneMXSA: false + UseGeneralizedNLCOneMXSB: false + UseGeneralizedNLCOneMetadata: false + UseInstOffsetForGRO: 0 + UseMFMAF32XEmulation: false + UsePLRPack: 0 + UseSgprForGRO: -1 + UseSubtileImpl: true + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 1 + VectorWidthMXSA: 1 + VectorWidthMXSB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMXSA: 0 + WaveSeparateGlobalReadMXSB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [64, 4, 1] + WorkGroupMapping: 16 + WorkGroupMappingXCC: 2 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 256 + _DepthUA: 256 + _DepthUB: 256 + _DepthUMXSA: 8 + _DepthUMXSB: 8 + _DepthUMetadata: 256 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + enableLDSTrMXSA: false + enableLDSTrMXSB: false + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + tailLoopOptMXSA: false + tailLoopOptMXSB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: 
false + ActivationFused: true + AdaptiveGemm: 0 + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1DivByMT1LowbitGT1: 0 + AssertFree1ElementMultiple: 1 + AssertKRingShiftTailWrapOnly: 0 + AssertSummationElementMultiple: 256 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BAddrInterleave: false + BaseName: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT252kruXsNK48kysY3byfZ8nkfbVQgYyySEfltsnGOOr7I= + BufferLoad: true + BufferStore: 1 + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 256 + DirectToLds: 1 + DirectToLdsA: true + DirectToLdsB: true + DirectToLdsMXSA: true + DirectToLdsMXSB: true + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprMXSA: false + DirectToVgprMXSB: false + DirectToVgprSparseMetadata: false + DtlPlusLdsBuf: 0 + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: false + ExpertSchedulingMode: 0 + ExtraLatencyForLR: 0 + ExtraMiLatencyLeft: -1 + ForceDisableShadowInit: false + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 32 + GlobalReadVectorWidthB: 32 + GlobalReadVectorWidthMXSA: 4 + GlobalReadVectorWidthMXSB: 5 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + KRingShift: false + Kernel: true + KernelLanguage: Assembly + KernelNameMin: 
Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT256x160x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT4_10_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1 + LDSTrInst: false + LSCA: 256 + LSCB: 256 + LSCMXSA: 8 + LSCMXSB: 8 + LSPA: 32 + LSPB: 32 + LSPMXSA: 128 + LSPMXSB: 160 + LVCA: 8 + LVCB: 8 + LVCMXSA: 2 + LVCMXSB: 2 + LVPA: 1 + LVPB: 1 + LVPMXSA: 32 + LVPMXSB: 32 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsBlockSizePerPadMXSA: 0 + LdsBlockSizePerPadMXSB: 0 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 122112 + LdsInitCVgprs: false + LdsNumBytes: 122112 + LdsNumElementsAlignedA: 32768 + LdsNumElementsAlignedB: 20480 + LdsNumElementsAlignedMXSA: 2048 + LdsNumElementsAlignedMXSB: 1280 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 36096 + LdsOffsetB_Blk: 101632 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMXSA: 32768 + LdsOffsetMXSA_Blk: 98304 + LdsOffsetMXSB: 34816 + LdsOffsetMXSB_Blk: 100352 + LdsOffsetMetadata: 36096 + LdsOffsetMetadata_Blk: 101632 + LdsPadA: 0 + LdsPadB: 0 + LdsPadMXSA: 0 + LdsPadMXSB: 0 + LdsPadMetadata: 0 + LocalReadVectorWidth: -1 + LocalReadVectorWidthA: 32 + LocalReadVectorWidthB: 32 + LocalReadVectorWidthMXSA: 1 + LocalReadVectorWidthMXSB: 1 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LocalWriteUseSgprMXSA: true + LocalWriteUseSgprMXSB: true + LoopIters: 2 + LoopUnroll: 256 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + 
MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMXSA: 1 + MIInputPerThreadMXSB: 1 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [4, 1] + MIWaveTile: [4, 10] + MIWaveTileA: 4 + MIWaveTileB: 10 + MIWaveTileMXSA: 4 + MIWaveTileMXSB: 10 + MIWaveTileMetadata: 0 + MacroTile0: 256 + MacroTile1: 160 + MacroTileA: 256 + MacroTileB: 160 + MacroTileMXSA: 256 + MacroTileMXSB: 160 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + MinGRIncPerMfma: 3 + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonDTLTailLoopMXSA: true + NonDTLTailLoopMXSB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 4 + NonTemporalE: 0 + NonTemporalMXSA: 0 + NonTemporalMXSB: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 160 + NumGlobalWriteVectorsPerThread: 160 + NumLdsBlk: 2 + NumLoadsA: 8 + NumLoadsB: 5 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsCoalescedMXSA: 1 + NumLoadsCoalescedMXSB: 1 + NumLoadsMXSA: 2 + NumLoadsMXSB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 5 + NumLoadsPerpendicularMXSA: 2 + NumLoadsPerpendicularMXSB: 1 + NumThreads: 256 + NumTotalPackedLoadsA: -1 + NumTotalPackedLoadsB: -1 + NumTotalPackedLoadsMXSA: -1 + NumTotalPackedLoadsMXSB: -1 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGROverBarrier: 0 + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 38 + 
SolutionNameMin: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT256x160x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT4_10_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM16_WGMXCC2_WGMXCCGn1 + SourceSwap: false + SpaceFillingAlgo: [] + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 4 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SubGroupMXSA: 16 + SubGroupMXSB: 16 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: 0 + TailloopInNll: false + ThreadTile: [1, 1] + ThreadTile0: 16 + ThreadTile1: 10 + ThreadTileA: 16 + ThreadTileB: 10 + ThreadTileMXSA: 16 + ThreadTileMXSB: 10 + TransposeLDS: 1 + TransposeLDSMetadata: -1 + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMXSA: true + UnrollMajorLDSMXSB: true + Use64bShadowLimit: false + Use64bShadowLimitMX: false + UseCustomMainLoopSchedule: 0 + UseDirect32XEmulation: false + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: false + UseGeneralizedNLCOneA: false + UseGeneralizedNLCOneB: false + UseGeneralizedNLCOneMXSA: false + UseGeneralizedNLCOneMXSB: false + UseGeneralizedNLCOneMetadata: false + UseInstOffsetForGRO: 0 + UseMFMAF32XEmulation: false + UsePLRPack: 0 + UseSgprForGRO: -1 + UseSubtileImpl: true + Valid: true + VectorStore: -1 + 
VectorWidthA: 1 + VectorWidthB: 1 + VectorWidthMXSA: 1 + VectorWidthMXSB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMXSA: 0 + WaveSeparateGlobalReadMXSB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [64, 4, 1] + WorkGroupMapping: 16 + WorkGroupMappingXCC: 2 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 256 + _DepthUA: 256 + _DepthUB: 256 + _DepthUMXSA: 8 + _DepthUMXSB: 8 + _DepthUMetadata: 256 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + enableLDSTrMXSA: false + enableLDSTrMXSB: false + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + tailLoopOptMXSA: false + tailLoopOptMXSB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AdaptiveGemm: 0 + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1DivByMT1LowbitGT1: 0 + AssertFree1ElementMultiple: 1 + AssertKRingShiftTailWrapOnly: 0 + AssertSummationElementMultiple: 256 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BAddrInterleave: false + BaseName: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT25K1mXD-Z64cKGl1n64cEhRIvEdarIIYY6qzM6iXFNutU= + BufferLoad: true + BufferStore: 1 + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 256 + DirectToLds: 1 + DirectToLdsA: true + DirectToLdsB: true + DirectToLdsMXSA: true + DirectToLdsMXSB: true + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprMXSA: false + DirectToVgprMXSB: false + 
DirectToVgprSparseMetadata: false + DtlPlusLdsBuf: 0 + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: false + ExpertSchedulingMode: 0 + ExtraLatencyForLR: 0 + ExtraMiLatencyLeft: -1 + ForceDisableShadowInit: false + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 32 + GlobalReadVectorWidthB: 32 + GlobalReadVectorWidthMXSA: 4 + GlobalReadVectorWidthMXSB: 9 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + KRingShift: false + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT256x288x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT4_18_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1 + LDSTrInst: false + LSCA: 256 + LSCB: 256 + LSCMXSA: 8 + LSCMXSB: 8 + LSPA: 32 + LSPB: 32 + LSPMXSA: 128 + LSPMXSB: 288 + LVCA: 8 + LVCB: 8 + LVCMXSA: 2 + LVCMXSB: 1 + LVPA: 1 + LVPB: 1 + LVPMXSA: 32 + LVPMXSB: 32 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsBlockSizePerPadMXSA: 0 + LdsBlockSizePerPadMXSB: 0 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 147968 + LdsInitCVgprs: false + 
LdsNumBytes: 147968 + LdsNumElementsAlignedA: 32768 + LdsNumElementsAlignedB: 36864 + LdsNumElementsAlignedMXSA: 2048 + LdsNumElementsAlignedMXSB: 2304 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 73984 + LdsOffsetB: 37120 + LdsOffsetB_Blk: 111104 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMXSA: 32768 + LdsOffsetMXSA_Blk: 106752 + LdsOffsetMXSB: 34816 + LdsOffsetMXSB_Blk: 108800 + LdsOffsetMetadata: 37120 + LdsOffsetMetadata_Blk: 111104 + LdsPadA: 0 + LdsPadB: 0 + LdsPadMXSA: 0 + LdsPadMXSB: 0 + LdsPadMetadata: 0 + LocalReadVectorWidth: -1 + LocalReadVectorWidthA: 32 + LocalReadVectorWidthB: 32 + LocalReadVectorWidthMXSA: 1 + LocalReadVectorWidthMXSB: 1 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LocalWriteUseSgprMXSA: true + LocalWriteUseSgprMXSB: true + LoopIters: 2 + LoopUnroll: 256 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMXSA: 1 + MIInputPerThreadMXSB: 1 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [4, 1] + MIWaveTile: [4, 18] + MIWaveTileA: 4 + MIWaveTileB: 18 + MIWaveTileMXSA: 4 + MIWaveTileMXSB: 18 + MIWaveTileMetadata: 0 + MacroTile0: 256 + MacroTile1: 288 + MacroTileA: 256 + MacroTileB: 288 + MacroTileMXSA: 256 + MacroTileMXSB: 288 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + MinGRIncPerMfma: 3 + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonDTLTailLoopMXSA: true + NonDTLTailLoopMXSB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + 
NonTemporalD: 4 + NonTemporalE: 0 + NonTemporalMXSA: 0 + NonTemporalMXSB: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 288 + NumGlobalWriteVectorsPerThread: 288 + NumLdsBlk: 2 + NumLoadsA: 8 + NumLoadsB: 9 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsCoalescedMXSA: 1 + NumLoadsCoalescedMXSB: 1 + NumLoadsMXSA: 2 + NumLoadsMXSB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 9 + NumLoadsPerpendicularMXSA: 2 + NumLoadsPerpendicularMXSB: 1 + NumThreads: 256 + NumTotalPackedLoadsA: -1 + NumTotalPackedLoadsB: -1 + NumTotalPackedLoadsMXSA: -1 + NumTotalPackedLoadsMXSB: -1 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGROverBarrier: 0 + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 39 + SolutionNameMin: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT256x288x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT4_18_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM16_WGMXCC2_WGMXCCGn1 + SourceSwap: false + SpaceFillingAlgo: [] + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: true + StoreSyncOpt: 0 + StoreVectorWidth: 4 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 16 + SubGroup1: 16 + 
SubGroupA: 16 + SubGroupB: 16 + SubGroupMXSA: 16 + SubGroupMXSB: 16 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: 0 + TailloopInNll: false + ThreadTile: [1, 1] + ThreadTile0: 16 + ThreadTile1: 18 + ThreadTileA: 16 + ThreadTileB: 18 + ThreadTileMXSA: 16 + ThreadTileMXSB: 18 + TransposeLDS: 1 + TransposeLDSMetadata: -1 + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMXSA: true + UnrollMajorLDSMXSB: true + Use64bShadowLimit: false + Use64bShadowLimitMX: false + UseCustomMainLoopSchedule: 0 + UseDirect32XEmulation: false + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: false + UseGeneralizedNLCOneA: false + UseGeneralizedNLCOneB: false + UseGeneralizedNLCOneMXSA: false + UseGeneralizedNLCOneMXSB: false + UseGeneralizedNLCOneMetadata: false + UseInstOffsetForGRO: 0 + UseMFMAF32XEmulation: false + UsePLRPack: 0 + UseSgprForGRO: -1 + UseSubtileImpl: true + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 1 + VectorWidthMXSA: 1 + VectorWidthMXSB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMXSA: 0 + WaveSeparateGlobalReadMXSB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [64, 4, 1] + WorkGroupMapping: 16 + WorkGroupMappingXCC: 2 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 256 + _DepthUA: 256 + _DepthUB: 256 + _DepthUMXSA: 8 + _DepthUMXSB: 8 + _DepthUMetadata: 256 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + enableLDSTrMXSA: false + enableLDSTrMXSB: false + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + tailLoopOptMXSA: false + tailLoopOptMXSB: 
false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AdaptiveGemm: 0 + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1DivByMT1LowbitGT1: 0 + AssertFree1ElementMultiple: 1 + AssertKRingShiftTailWrapOnly: 0 + AssertSummationElementMultiple: 256 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BAddrInterleave: false + BaseName: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT382wXw8HnHr0540PBKyCMkiG1GnHC8Qq6QTA2UY90u0Us= + BufferLoad: true + BufferStore: 1 + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 256 + DirectToLds: 1 + DirectToLdsA: true + DirectToLdsB: true + DirectToLdsMXSA: true + DirectToLdsMXSB: true + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprMXSA: false + DirectToVgprMXSB: false + DirectToVgprSparseMetadata: false + DtlPlusLdsBuf: 0 + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: false + ExpertSchedulingMode: 0 + ExtraLatencyForLR: 0 + ExtraMiLatencyLeft: -1 + ForceDisableShadowInit: false + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 32 + GlobalReadVectorWidthB: 32 + GlobalReadVectorWidthMXSA: 12 + GlobalReadVectorWidthMXSB: 1 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + KRingShift: false + Kernel: true + KernelLanguage: Assembly + KernelNameMin: 
Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT384x32x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT6_2_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1 + LDSTrInst: false + LSCA: 256 + LSCB: 256 + LSCMXSA: 8 + LSCMXSB: 8 + LSPA: 32 + LSPB: 32 + LSPMXSA: 384 + LSPMXSB: 32 + LVCA: 8 + LVCB: 8 + LVCMXSA: 1 + LVCMXSB: 8 + LVPA: 1 + LVPB: 1 + LVPMXSA: 32 + LVPMXSB: 32 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsBlockSizePerPadMXSA: 0 + LdsBlockSizePerPadMXSB: 0 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 122112 + LdsInitCVgprs: false + LdsNumBytes: 122112 + LdsNumElementsAlignedA: 49152 + LdsNumElementsAlignedB: 4096 + LdsNumElementsAlignedMXSA: 3072 + LdsNumElementsAlignedMXSB: 256 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 52480 + LdsOffsetB_Blk: 118016 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMXSA: 49152 + LdsOffsetMXSA_Blk: 114688 + LdsOffsetMXSB: 52224 + LdsOffsetMXSB_Blk: 117760 + LdsOffsetMetadata: 52480 + LdsOffsetMetadata_Blk: 118016 + LdsPadA: 0 + LdsPadB: 0 + LdsPadMXSA: 0 + LdsPadMXSB: 0 + LdsPadMetadata: 0 + LocalReadVectorWidth: -1 + LocalReadVectorWidthA: 32 + LocalReadVectorWidthB: 32 + LocalReadVectorWidthMXSA: 1 + LocalReadVectorWidthMXSB: 1 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LocalWriteUseSgprMXSA: true + LocalWriteUseSgprMXSB: true + LoopIters: 2 + LoopUnroll: 256 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + 
MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMXSA: 1 + MIInputPerThreadMXSB: 1 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [4, 1] + MIWaveTile: [6, 2] + MIWaveTileA: 6 + MIWaveTileB: 2 + MIWaveTileMXSA: 6 + MIWaveTileMXSB: 2 + MIWaveTileMetadata: 0 + MacroTile0: 384 + MacroTile1: 32 + MacroTileA: 384 + MacroTileB: 32 + MacroTileMXSA: 384 + MacroTileMXSB: 32 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + MinGRIncPerMfma: 3 + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonDTLTailLoopMXSA: true + NonDTLTailLoopMXSB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 4 + NonTemporalE: 0 + NonTemporalMXSA: 0 + NonTemporalMXSB: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 48 + NumGlobalWriteVectorsPerThread: 48 + NumLdsBlk: 2 + NumLoadsA: 12 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsCoalescedMXSA: 1 + NumLoadsCoalescedMXSB: 1 + NumLoadsMXSA: 1 + NumLoadsMXSB: 1 + NumLoadsPerpendicularA: 12 + NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularMXSA: 1 + NumLoadsPerpendicularMXSB: 1 + NumThreads: 256 + NumTotalPackedLoadsA: -1 + NumTotalPackedLoadsB: -1 + NumTotalPackedLoadsMXSA: -1 + NumTotalPackedLoadsMXSB: -1 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGROverBarrier: 0 + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 40 + SolutionNameMin: 
Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT384x32x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT6_2_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM16_WGMXCC2_WGMXCCGn1 + SourceSwap: false + SpaceFillingAlgo: [] + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 4 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SubGroupMXSA: 16 + SubGroupMXSB: 16 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: 0 + TailloopInNll: false + ThreadTile: [1, 1] + ThreadTile0: 24 + ThreadTile1: 2 + ThreadTileA: 24 + ThreadTileB: 2 + ThreadTileMXSA: 24 + ThreadTileMXSB: 2 + TransposeLDS: 1 + TransposeLDSMetadata: -1 + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMXSA: true + UnrollMajorLDSMXSB: true + Use64bShadowLimit: false + Use64bShadowLimitMX: false + UseCustomMainLoopSchedule: 0 + UseDirect32XEmulation: false + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: false + UseGeneralizedNLCOneA: false + UseGeneralizedNLCOneB: false + UseGeneralizedNLCOneMXSA: false + UseGeneralizedNLCOneMXSB: false + UseGeneralizedNLCOneMetadata: false + UseInstOffsetForGRO: 0 + UseMFMAF32XEmulation: false + UsePLRPack: 0 + UseSgprForGRO: -1 + UseSubtileImpl: true + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 1 
+ VectorWidthMXSA: 1 + VectorWidthMXSB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMXSA: 0 + WaveSeparateGlobalReadMXSB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [64, 4, 1] + WorkGroupMapping: 16 + WorkGroupMappingXCC: 2 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 256 + _DepthUA: 256 + _DepthUB: 256 + _DepthUMXSA: 8 + _DepthUMXSB: 8 + _DepthUMetadata: 256 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + enableLDSTrMXSA: false + enableLDSTrMXSB: false + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + tailLoopOptMXSA: false + tailLoopOptMXSB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AdaptiveGemm: 0 + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1DivByMT1LowbitGT1: 0 + AssertFree1ElementMultiple: 1 + AssertKRingShiftTailWrapOnly: 0 + AssertSummationElementMultiple: 256 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BAddrInterleave: false + BaseName: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT38iWtaR8v4gfASTnshP-7Zt37fJ-3bIKngSBlzEMd0qgE= + BufferLoad: true + BufferStore: 1 + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 256 + DirectToLds: 1 + DirectToLdsA: true + DirectToLdsB: true + DirectToLdsMXSA: true + DirectToLdsMXSB: true + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprMXSA: false + DirectToVgprMXSB: false + DirectToVgprSparseMetadata: false + DtlPlusLdsBuf: 
0 + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: false + ExpertSchedulingMode: 0 + ExtraLatencyForLR: 0 + ExtraMiLatencyLeft: -1 + ForceDisableShadowInit: false + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 32 + GlobalReadVectorWidthB: 32 + GlobalReadVectorWidthMXSA: 12 + GlobalReadVectorWidthMXSB: 3 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + KRingShift: false + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT384x96x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT6_6_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1 + LDSTrInst: false + LSCA: 256 + LSCB: 256 + LSCMXSA: 8 + LSCMXSB: 8 + LSPA: 32 + LSPB: 32 + LSPMXSA: 384 + LSPMXSB: 96 + LVCA: 8 + LVCB: 8 + LVCMXSA: 1 + LVCMXSB: 3 + LVPA: 1 + LVPB: 1 + LVPMXSA: 32 + LVPMXSB: 32 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsBlockSizePerPadMXSA: 0 + LdsBlockSizePerPadMXSB: 0 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 130816 + LdsInitCVgprs: false + LdsNumBytes: 130816 + LdsNumElementsAlignedA: 49152 
+ LdsNumElementsAlignedB: 12288 + LdsNumElementsAlignedMXSA: 3072 + LdsNumElementsAlignedMXSB: 768 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 52992 + LdsOffsetB_Blk: 118528 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMXSA: 49152 + LdsOffsetMXSA_Blk: 114688 + LdsOffsetMXSB: 52224 + LdsOffsetMXSB_Blk: 117760 + LdsOffsetMetadata: 52992 + LdsOffsetMetadata_Blk: 118528 + LdsPadA: 0 + LdsPadB: 0 + LdsPadMXSA: 0 + LdsPadMXSB: 0 + LdsPadMetadata: 0 + LocalReadVectorWidth: -1 + LocalReadVectorWidthA: 32 + LocalReadVectorWidthB: 32 + LocalReadVectorWidthMXSA: 1 + LocalReadVectorWidthMXSB: 1 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LocalWriteUseSgprMXSA: true + LocalWriteUseSgprMXSB: true + LoopIters: 2 + LoopUnroll: 256 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMXSA: 1 + MIInputPerThreadMXSB: 1 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [4, 1] + MIWaveTile: [6, 6] + MIWaveTileA: 6 + MIWaveTileB: 6 + MIWaveTileMXSA: 6 + MIWaveTileMXSB: 6 + MIWaveTileMetadata: 0 + MacroTile0: 384 + MacroTile1: 96 + MacroTileA: 384 + MacroTileB: 96 + MacroTileMXSA: 384 + MacroTileMXSB: 96 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + MinGRIncPerMfma: 3 + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonDTLTailLoopMXSA: true + NonDTLTailLoopMXSB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 4 + NonTemporalE: 0 + NonTemporalMXSA: 0 + 
NonTemporalMXSB: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 144 + NumGlobalWriteVectorsPerThread: 144 + NumLdsBlk: 2 + NumLoadsA: 12 + NumLoadsB: 3 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsCoalescedMXSA: 1 + NumLoadsCoalescedMXSB: 1 + NumLoadsMXSA: 1 + NumLoadsMXSB: 1 + NumLoadsPerpendicularA: 12 + NumLoadsPerpendicularB: 3 + NumLoadsPerpendicularMXSA: 1 + NumLoadsPerpendicularMXSB: 1 + NumThreads: 256 + NumTotalPackedLoadsA: -1 + NumTotalPackedLoadsB: -1 + NumTotalPackedLoadsMXSA: -1 + NumTotalPackedLoadsMXSB: -1 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGROverBarrier: 0 + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 41 + SolutionNameMin: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT384x96x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT6_6_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM16_WGMXCC2_WGMXCCGn1 + SourceSwap: false + SpaceFillingAlgo: [] + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 4 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SubGroupMXSA: 16 + 
SubGroupMXSB: 16 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: 0 + TailloopInNll: false + ThreadTile: [1, 1] + ThreadTile0: 24 + ThreadTile1: 6 + ThreadTileA: 24 + ThreadTileB: 6 + ThreadTileMXSA: 24 + ThreadTileMXSB: 6 + TransposeLDS: 1 + TransposeLDSMetadata: -1 + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMXSA: true + UnrollMajorLDSMXSB: true + Use64bShadowLimit: false + Use64bShadowLimitMX: false + UseCustomMainLoopSchedule: 0 + UseDirect32XEmulation: false + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: false + UseGeneralizedNLCOneA: false + UseGeneralizedNLCOneB: false + UseGeneralizedNLCOneMXSA: false + UseGeneralizedNLCOneMXSB: false + UseGeneralizedNLCOneMetadata: false + UseInstOffsetForGRO: 0 + UseMFMAF32XEmulation: false + UsePLRPack: 0 + UseSgprForGRO: -1 + UseSubtileImpl: true + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 1 + VectorWidthMXSA: 1 + VectorWidthMXSB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMXSA: 0 + WaveSeparateGlobalReadMXSB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [64, 4, 1] + WorkGroupMapping: 16 + WorkGroupMappingXCC: 2 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 256 + _DepthUA: 256 + _DepthUB: 256 + _DepthUMXSA: 8 + _DepthUMXSB: 8 + _DepthUMetadata: 256 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + enableLDSTrMXSA: false + enableLDSTrMXSB: false + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + tailLoopOptMXSA: false + tailLoopOptMXSB: false + - 1LDSBuffer: 0 + ActivationAlt: false + 
ActivationFuncCall: false + ActivationFused: true + AdaptiveGemm: 0 + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1DivByMT1LowbitGT1: 0 + AssertFree1ElementMultiple: 1 + AssertKRingShiftTailWrapOnly: 0 + AssertSummationElementMultiple: 256 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BAddrInterleave: false + BaseName: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT51My4B9dUWEKuVinUMAqyTn-9nVKbOgw4LZNuajpvbgno= + BufferLoad: true + BufferStore: 1 + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 256 + DirectToLds: 1 + DirectToLdsA: true + DirectToLdsB: true + DirectToLdsMXSA: true + DirectToLdsMXSB: true + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprMXSA: false + DirectToVgprMXSB: false + DirectToVgprSparseMetadata: false + DtlPlusLdsBuf: 0 + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: false + ExpertSchedulingMode: 0 + ExtraLatencyForLR: 0 + ExtraMiLatencyLeft: -1 + ForceDisableShadowInit: false + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 32 + GlobalReadVectorWidthB: 32 + GlobalReadVectorWidthMXSA: 16 + GlobalReadVectorWidthMXSB: 1 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + KRingShift: false + Kernel: true + KernelLanguage: Assembly + KernelNameMin: 
Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT512x32x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT8_2_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1 + LDSTrInst: false + LSCA: 256 + LSCB: 256 + LSCMXSA: 8 + LSCMXSB: 8 + LSPA: 32 + LSPB: 32 + LSPMXSA: 512 + LSPMXSB: 32 + LVCA: 8 + LVCB: 8 + LVCMXSA: 1 + LVCMXSB: 8 + LVPA: 1 + LVPB: 1 + LVPMXSA: 32 + LVPMXSB: 32 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsBlockSizePerPadMXSA: 0 + LdsBlockSizePerPadMXSB: 0 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 147968 + LdsInitCVgprs: false + LdsNumBytes: 147968 + LdsNumElementsAlignedA: 65536 + LdsNumElementsAlignedB: 4096 + LdsNumElementsAlignedMXSA: 4096 + LdsNumElementsAlignedMXSB: 256 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 73984 + LdsOffsetB: 69888 + LdsOffsetB_Blk: 143872 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMXSA: 65536 + LdsOffsetMXSA_Blk: 139520 + LdsOffsetMXSB: 69632 + LdsOffsetMXSB_Blk: 143616 + LdsOffsetMetadata: 69888 + LdsOffsetMetadata_Blk: 143872 + LdsPadA: 0 + LdsPadB: 0 + LdsPadMXSA: 0 + LdsPadMXSB: 0 + LdsPadMetadata: 0 + LocalReadVectorWidth: -1 + LocalReadVectorWidthA: 32 + LocalReadVectorWidthB: 32 + LocalReadVectorWidthMXSA: 1 + LocalReadVectorWidthMXSB: 1 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LocalWriteUseSgprMXSA: true + LocalWriteUseSgprMXSB: true + LoopIters: 2 + LoopUnroll: 256 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + 
MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMXSA: 1 + MIInputPerThreadMXSB: 1 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [4, 1] + MIWaveTile: [8, 2] + MIWaveTileA: 8 + MIWaveTileB: 2 + MIWaveTileMXSA: 8 + MIWaveTileMXSB: 2 + MIWaveTileMetadata: 0 + MacroTile0: 512 + MacroTile1: 32 + MacroTileA: 512 + MacroTileB: 32 + MacroTileMXSA: 512 + MacroTileMXSB: 32 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + MinGRIncPerMfma: 3 + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonDTLTailLoopMXSA: true + NonDTLTailLoopMXSB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 4 + NonTemporalE: 0 + NonTemporalMXSA: 0 + NonTemporalMXSB: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 64 + NumLdsBlk: 2 + NumLoadsA: 16 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsCoalescedMXSA: 1 + NumLoadsCoalescedMXSB: 1 + NumLoadsMXSA: 1 + NumLoadsMXSB: 1 + NumLoadsPerpendicularA: 16 + NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularMXSA: 1 + NumLoadsPerpendicularMXSB: 1 + NumThreads: 256 + NumTotalPackedLoadsA: -1 + NumTotalPackedLoadsB: -1 + NumTotalPackedLoadsMXSA: -1 + NumTotalPackedLoadsMXSB: -1 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGROverBarrier: 0 + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 42 + SolutionNameMin: 
Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT512x32x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT8_2_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM16_WGMXCC2_WGMXCCGn1 + SourceSwap: false + SpaceFillingAlgo: [] + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: true + StoreSyncOpt: 0 + StoreVectorWidth: 4 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SubGroupMXSA: 16 + SubGroupMXSB: 16 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: 0 + TailloopInNll: false + ThreadTile: [1, 1] + ThreadTile0: 32 + ThreadTile1: 2 + ThreadTileA: 32 + ThreadTileB: 2 + ThreadTileMXSA: 32 + ThreadTileMXSB: 2 + TransposeLDS: 1 + TransposeLDSMetadata: -1 + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMXSA: true + UnrollMajorLDSMXSB: true + Use64bShadowLimit: false + Use64bShadowLimitMX: false + UseCustomMainLoopSchedule: 0 + UseDirect32XEmulation: false + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: false + UseGeneralizedNLCOneA: false + UseGeneralizedNLCOneB: false + UseGeneralizedNLCOneMXSA: false + UseGeneralizedNLCOneMXSB: false + UseGeneralizedNLCOneMetadata: false + UseInstOffsetForGRO: 0 + UseMFMAF32XEmulation: false + UsePLRPack: 0 + UseSgprForGRO: -1 + UseSubtileImpl: true + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 1 
+ VectorWidthMXSA: 1 + VectorWidthMXSB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMXSA: 0 + WaveSeparateGlobalReadMXSB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [64, 4, 1] + WorkGroupMapping: 16 + WorkGroupMappingXCC: 2 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 256 + _DepthUA: 256 + _DepthUB: 256 + _DepthUMXSA: 8 + _DepthUMXSB: 8 + _DepthUMetadata: 256 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + enableLDSTrMXSA: false + enableLDSTrMXSB: false + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + tailLoopOptMXSA: false + tailLoopOptMXSB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AdaptiveGemm: 0 + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1DivByMT1LowbitGT1: 0 + AssertFree1ElementMultiple: 1 + AssertKRingShiftTailWrapOnly: 0 + AssertSummationElementMultiple: 256 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BAddrInterleave: false + BaseName: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT32dv3nH_ug3zWfzXFJEJHJUvkpjXQjAQVml90QMZvwYvs= + BufferLoad: true + BufferStore: 1 + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 256 + DirectToLds: 1 + DirectToLdsA: true + DirectToLdsB: true + DirectToLdsMXSA: true + DirectToLdsMXSB: true + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprMXSA: false + DirectToVgprMXSB: false + DirectToVgprSparseMetadata: false + DtlPlusLdsBuf: 
0 + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: false + ExpertSchedulingMode: 0 + ExtraLatencyForLR: 0 + ExtraMiLatencyLeft: -1 + ForceDisableShadowInit: false + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 32 + GlobalReadVectorWidthB: 32 + GlobalReadVectorWidthMXSA: 1 + GlobalReadVectorWidthMXSB: 4 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + KRingShift: false + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT32x128x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT2_2_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_16_1 + LDSTrInst: false + LSCA: 256 + LSCB: 256 + LSCMXSA: 8 + LSCMXSB: 8 + LSPA: 32 + LSPB: 32 + LSPMXSA: 32 + LSPMXSB: 128 + LVCA: 8 + LVCB: 8 + LVCMXSA: 8 + LVCMXSB: 2 + LVPA: 1 + LVPB: 1 + LVPMXSA: 32 + LVPMXSB: 32 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsBlockSizePerPadMXSA: 0 + LdsBlockSizePerPadMXSB: 0 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 54528 + LdsInitCVgprs: false + LdsNumBytes: 54528 + LdsNumElementsAlignedA: 4096 + 
LdsNumElementsAlignedB: 16384 + LdsNumElementsAlignedMXSA: 256 + LdsNumElementsAlignedMXSB: 1024 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 32768 + LdsOffsetB: 5376 + LdsOffsetB_Blk: 38144 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMXSA: 4096 + LdsOffsetMXSA_Blk: 36864 + LdsOffsetMXSB: 4352 + LdsOffsetMXSB_Blk: 37120 + LdsOffsetMetadata: 5376 + LdsOffsetMetadata_Blk: 38144 + LdsPadA: 0 + LdsPadB: 0 + LdsPadMXSA: 0 + LdsPadMXSB: 0 + LdsPadMetadata: 0 + LocalReadVectorWidth: -1 + LocalReadVectorWidthA: 32 + LocalReadVectorWidthB: 32 + LocalReadVectorWidthMXSA: 1 + LocalReadVectorWidthMXSB: 1 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LocalWriteUseSgprMXSA: true + LocalWriteUseSgprMXSB: true + LoopIters: 2 + LoopUnroll: 256 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMXSA: 1 + MIInputPerThreadMXSB: 1 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [1, 4] + MIWaveTile: [2, 2] + MIWaveTileA: 2 + MIWaveTileB: 2 + MIWaveTileMXSA: 2 + MIWaveTileMXSB: 2 + MIWaveTileMetadata: 0 + MacroTile0: 32 + MacroTile1: 128 + MacroTileA: 32 + MacroTileB: 128 + MacroTileMXSA: 32 + MacroTileMXSB: 128 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + MinGRIncPerMfma: 3 + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonDTLTailLoopMXSA: true + NonDTLTailLoopMXSB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 4 + NonTemporalE: 0 + NonTemporalMXSA: 0 + 
NonTemporalMXSB: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 16 + NumLdsBlk: 2 + NumLoadsA: 1 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsCoalescedMXSA: 1 + NumLoadsCoalescedMXSB: 1 + NumLoadsMXSA: 1 + NumLoadsMXSB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 4 + NumLoadsPerpendicularMXSA: 1 + NumLoadsPerpendicularMXSB: 1 + NumThreads: 256 + NumTotalPackedLoadsA: -1 + NumTotalPackedLoadsB: -1 + NumTotalPackedLoadsMXSA: -1 + NumTotalPackedLoadsMXSB: -1 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGROverBarrier: 0 + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 43 + SolutionNameMin: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT32x128x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT2_2_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM16_WGMXCC2_WGMXCCGn1 + SourceSwap: false + SpaceFillingAlgo: [] + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 4 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 4 + SubGroup1: 64 + SubGroupA: 4 + SubGroupB: 64 + SubGroupMXSA: 4 + SubGroupMXSB: 64 + 
SuppressNoLoadLoop: false + SwapGlobalReadOrder: 0 + TailloopInNll: false + ThreadTile: [1, 1] + ThreadTile0: 8 + ThreadTile1: 2 + ThreadTileA: 8 + ThreadTileB: 2 + ThreadTileMXSA: 8 + ThreadTileMXSB: 2 + TransposeLDS: 1 + TransposeLDSMetadata: -1 + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMXSA: true + UnrollMajorLDSMXSB: true + Use64bShadowLimit: false + Use64bShadowLimitMX: false + UseCustomMainLoopSchedule: 0 + UseDirect32XEmulation: false + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: false + UseGeneralizedNLCOneA: false + UseGeneralizedNLCOneB: false + UseGeneralizedNLCOneMXSA: false + UseGeneralizedNLCOneMXSB: false + UseGeneralizedNLCOneMetadata: false + UseInstOffsetForGRO: 0 + UseMFMAF32XEmulation: false + UsePLRPack: 0 + UseSgprForGRO: -1 + UseSubtileImpl: true + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 1 + VectorWidthMXSA: 1 + VectorWidthMXSB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMXSA: 0 + WaveSeparateGlobalReadMXSB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 16 + WorkGroupMappingXCC: 2 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 256 + _DepthUA: 256 + _DepthUB: 256 + _DepthUMXSA: 8 + _DepthUMXSB: 8 + _DepthUMetadata: 256 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + enableLDSTrMXSA: false + enableLDSTrMXSB: false + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + tailLoopOptMXSA: false + tailLoopOptMXSB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: false + 
ActivationFused: true + AdaptiveGemm: 0 + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1DivByMT1LowbitGT1: 0 + AssertFree1ElementMultiple: 1 + AssertKRingShiftTailWrapOnly: 0 + AssertSummationElementMultiple: 256 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BAddrInterleave: false + BaseName: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT96VdJ139l5ExTu5vrbSyr9YIDnqGsFY6rZMfGBgYGwyok= + BufferLoad: true + BufferStore: 1 + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 256 + DirectToLds: 1 + DirectToLdsA: true + DirectToLdsB: true + DirectToLdsMXSA: true + DirectToLdsMXSB: true + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprMXSA: false + DirectToVgprMXSB: false + DirectToVgprSparseMetadata: false + DtlPlusLdsBuf: 0 + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: false + ExpertSchedulingMode: 0 + ExtraLatencyForLR: 0 + ExtraMiLatencyLeft: -1 + ForceDisableShadowInit: false + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 32 + GlobalReadVectorWidthB: 32 + GlobalReadVectorWidthMXSA: 3 + GlobalReadVectorWidthMXSB: 4 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + KRingShift: false + Kernel: true + KernelLanguage: Assembly + KernelNameMin: 
Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT96x128x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT6_2_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_16_1 + LDSTrInst: false + LSCA: 256 + LSCB: 256 + LSCMXSA: 8 + LSCMXSB: 8 + LSPA: 32 + LSPB: 32 + LSPMXSA: 96 + LSPMXSB: 128 + LVCA: 8 + LVCB: 8 + LVCMXSA: 3 + LVCMXSB: 2 + LVPA: 1 + LVPB: 1 + LVPMXSA: 32 + LVPMXSB: 32 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsBlockSizePerPadMXSA: 0 + LdsBlockSizePerPadMXSB: 0 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 63232 + LdsInitCVgprs: false + LdsNumBytes: 63232 + LdsNumElementsAlignedA: 12288 + LdsNumElementsAlignedB: 16384 + LdsNumElementsAlignedMXSA: 768 + LdsNumElementsAlignedMXSB: 1024 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 32768 + LdsOffsetB: 14080 + LdsOffsetB_Blk: 46848 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMXSA: 12288 + LdsOffsetMXSA_Blk: 45056 + LdsOffsetMXSB: 13056 + LdsOffsetMXSB_Blk: 45824 + LdsOffsetMetadata: 14080 + LdsOffsetMetadata_Blk: 46848 + LdsPadA: 0 + LdsPadB: 0 + LdsPadMXSA: 0 + LdsPadMXSB: 0 + LdsPadMetadata: 0 + LocalReadVectorWidth: -1 + LocalReadVectorWidthA: 32 + LocalReadVectorWidthB: 32 + LocalReadVectorWidthMXSA: 1 + LocalReadVectorWidthMXSB: 1 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LocalWriteUseSgprMXSA: true + LocalWriteUseSgprMXSB: true + LoopIters: 2 + LoopUnroll: 256 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + 
MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMXSA: 1 + MIInputPerThreadMXSB: 1 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [1, 4] + MIWaveTile: [6, 2] + MIWaveTileA: 6 + MIWaveTileB: 2 + MIWaveTileMXSA: 6 + MIWaveTileMXSB: 2 + MIWaveTileMetadata: 0 + MacroTile0: 96 + MacroTile1: 128 + MacroTileA: 96 + MacroTileB: 128 + MacroTileMXSA: 96 + MacroTileMXSB: 128 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + MinGRIncPerMfma: 3 + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonDTLTailLoopMXSA: true + NonDTLTailLoopMXSB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 4 + NonTemporalE: 0 + NonTemporalMXSA: 0 + NonTemporalMXSB: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 48 + NumGlobalWriteVectorsPerThread: 48 + NumLdsBlk: 2 + NumLoadsA: 3 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsCoalescedMXSA: 1 + NumLoadsCoalescedMXSB: 1 + NumLoadsMXSA: 1 + NumLoadsMXSB: 1 + NumLoadsPerpendicularA: 3 + NumLoadsPerpendicularB: 4 + NumLoadsPerpendicularMXSA: 1 + NumLoadsPerpendicularMXSB: 1 + NumThreads: 256 + NumTotalPackedLoadsA: -1 + NumTotalPackedLoadsB: -1 + NumTotalPackedLoadsMXSA: -1 + NumTotalPackedLoadsMXSB: -1 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGROverBarrier: 0 + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 44 + SolutionNameMin: 
Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT96x128x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT6_2_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM16_WGMXCC2_WGMXCCGn1 + SourceSwap: false + SpaceFillingAlgo: [] + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 4 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 4 + SubGroup1: 64 + SubGroupA: 4 + SubGroupB: 64 + SubGroupMXSA: 4 + SubGroupMXSB: 64 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: 0 + TailloopInNll: false + ThreadTile: [1, 1] + ThreadTile0: 24 + ThreadTile1: 2 + ThreadTileA: 24 + ThreadTileB: 2 + ThreadTileMXSA: 24 + ThreadTileMXSB: 2 + TransposeLDS: 1 + TransposeLDSMetadata: -1 + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMXSA: true + UnrollMajorLDSMXSB: true + Use64bShadowLimit: false + Use64bShadowLimitMX: false + UseCustomMainLoopSchedule: 0 + UseDirect32XEmulation: false + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: false + UseGeneralizedNLCOneA: false + UseGeneralizedNLCOneB: false + UseGeneralizedNLCOneMXSA: false + UseGeneralizedNLCOneMXSB: false + UseGeneralizedNLCOneMetadata: false + UseInstOffsetForGRO: 0 + UseMFMAF32XEmulation: false + UsePLRPack: 0 + UseSgprForGRO: -1 + UseSubtileImpl: true + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 1 + 
VectorWidthMXSA: 1 + VectorWidthMXSB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMXSA: 0 + WaveSeparateGlobalReadMXSB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 16 + WorkGroupMappingXCC: 2 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 256 + _DepthUA: 256 + _DepthUB: 256 + _DepthUMXSA: 8 + _DepthUMXSB: 8 + _DepthUMetadata: 256 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + enableLDSTrMXSA: false + enableLDSTrMXSB: false + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + tailLoopOptMXSA: false + tailLoopOptMXSB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AdaptiveGemm: 0 + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1DivByMT1LowbitGT1: 0 + AssertFree1ElementMultiple: 1 + AssertKRingShiftTailWrapOnly: 0 + AssertSummationElementMultiple: 256 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BAddrInterleave: false + BaseName: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT32pDTSO9Ei8iprYUZVwZ8MVK8XTu8S8ugrlkQJj7C1FJQ= + BufferLoad: true + BufferStore: 1 + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 256 + DirectToLds: 1 + DirectToLdsA: true + DirectToLdsB: true + DirectToLdsMXSA: true + DirectToLdsMXSB: true + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprMXSA: false + DirectToVgprMXSB: false + DirectToVgprSparseMetadata: false + DtlPlusLdsBuf: 0 
+ EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: false + ExpertSchedulingMode: 0 + ExtraLatencyForLR: 0 + ExtraMiLatencyLeft: -1 + ForceDisableShadowInit: false + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 32 + GlobalReadVectorWidthB: 32 + GlobalReadVectorWidthMXSA: 1 + GlobalReadVectorWidthMXSB: 4 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + KRingShift: false + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT32x256x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT2_4_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_16_1 + LDSTrInst: false + LSCA: 256 + LSCB: 256 + LSCMXSA: 8 + LSCMXSB: 8 + LSPA: 32 + LSPB: 32 + LSPMXSA: 32 + LSPMXSB: 128 + LVCA: 8 + LVCB: 8 + LVCMXSA: 8 + LVCMXSB: 2 + LVPA: 1 + LVPB: 1 + LVPMXSA: 32 + LVPMXSB: 32 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsBlockSizePerPadMXSA: 0 + LdsBlockSizePerPadMXSB: 0 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 104704 + LdsInitCVgprs: false + LdsNumBytes: 104704 + LdsNumElementsAlignedA: 4096 + 
LdsNumElementsAlignedB: 32768 + LdsNumElementsAlignedMXSA: 256 + LdsNumElementsAlignedMXSB: 2048 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 6400 + LdsOffsetB_Blk: 71936 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMXSA: 4096 + LdsOffsetMXSA_Blk: 69632 + LdsOffsetMXSB: 4352 + LdsOffsetMXSB_Blk: 69888 + LdsOffsetMetadata: 6400 + LdsOffsetMetadata_Blk: 71936 + LdsPadA: 0 + LdsPadB: 0 + LdsPadMXSA: 0 + LdsPadMXSB: 0 + LdsPadMetadata: 0 + LocalReadVectorWidth: -1 + LocalReadVectorWidthA: 32 + LocalReadVectorWidthB: 32 + LocalReadVectorWidthMXSA: 1 + LocalReadVectorWidthMXSB: 1 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LocalWriteUseSgprMXSA: true + LocalWriteUseSgprMXSB: true + LoopIters: 2 + LoopUnroll: 256 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMXSA: 1 + MIInputPerThreadMXSB: 1 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [1, 4] + MIWaveTile: [2, 4] + MIWaveTileA: 2 + MIWaveTileB: 4 + MIWaveTileMXSA: 2 + MIWaveTileMXSB: 4 + MIWaveTileMetadata: 0 + MacroTile0: 32 + MacroTile1: 256 + MacroTileA: 32 + MacroTileB: 256 + MacroTileMXSA: 32 + MacroTileMXSB: 256 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + MinGRIncPerMfma: 3 + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonDTLTailLoopMXSA: true + NonDTLTailLoopMXSB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 4 + NonTemporalE: 0 + NonTemporalMXSA: 0 + 
NonTemporalMXSB: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 32 + NumLdsBlk: 2 + NumLoadsA: 1 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsCoalescedMXSA: 1 + NumLoadsCoalescedMXSB: 1 + NumLoadsMXSA: 1 + NumLoadsMXSB: 2 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 8 + NumLoadsPerpendicularMXSA: 1 + NumLoadsPerpendicularMXSB: 2 + NumThreads: 256 + NumTotalPackedLoadsA: -1 + NumTotalPackedLoadsB: -1 + NumTotalPackedLoadsMXSA: -1 + NumTotalPackedLoadsMXSB: -1 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGROverBarrier: 0 + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 45 + SolutionNameMin: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT32x256x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT2_4_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM16_WGMXCC2_WGMXCCGn1 + SourceSwap: false + SpaceFillingAlgo: [] + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 4 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 4 + SubGroup1: 64 + SubGroupA: 4 + SubGroupB: 64 + SubGroupMXSA: 4 + SubGroupMXSB: 64 + 
SuppressNoLoadLoop: false + SwapGlobalReadOrder: 0 + TailloopInNll: false + ThreadTile: [1, 1] + ThreadTile0: 8 + ThreadTile1: 4 + ThreadTileA: 8 + ThreadTileB: 4 + ThreadTileMXSA: 8 + ThreadTileMXSB: 4 + TransposeLDS: 1 + TransposeLDSMetadata: -1 + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMXSA: true + UnrollMajorLDSMXSB: true + Use64bShadowLimit: false + Use64bShadowLimitMX: false + UseCustomMainLoopSchedule: 0 + UseDirect32XEmulation: false + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: false + UseGeneralizedNLCOneA: false + UseGeneralizedNLCOneB: false + UseGeneralizedNLCOneMXSA: false + UseGeneralizedNLCOneMXSB: false + UseGeneralizedNLCOneMetadata: false + UseInstOffsetForGRO: 0 + UseMFMAF32XEmulation: false + UsePLRPack: 0 + UseSgprForGRO: -1 + UseSubtileImpl: true + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 1 + VectorWidthMXSA: 1 + VectorWidthMXSB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMXSA: 0 + WaveSeparateGlobalReadMXSB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 16 + WorkGroupMappingXCC: 2 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 256 + _DepthUA: 256 + _DepthUB: 256 + _DepthUMXSA: 8 + _DepthUMXSB: 8 + _DepthUMetadata: 256 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + enableLDSTrMXSA: false + enableLDSTrMXSB: false + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + tailLoopOptMXSA: false + tailLoopOptMXSB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: false + 
ActivationFused: true + AdaptiveGemm: 0 + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1DivByMT1LowbitGT1: 0 + AssertFree1ElementMultiple: 1 + AssertKRingShiftTailWrapOnly: 0 + AssertSummationElementMultiple: 256 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BAddrInterleave: false + BaseName: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT96Cj2Z80w1PkWDv8Q4NtOmErGoRYnlYoT1xjNw40Qn6y8= + BufferLoad: true + BufferStore: 1 + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 256 + DirectToLds: 1 + DirectToLdsA: true + DirectToLdsB: true + DirectToLdsMXSA: true + DirectToLdsMXSB: true + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprMXSA: false + DirectToVgprMXSB: false + DirectToVgprSparseMetadata: false + DtlPlusLdsBuf: 0 + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: false + ExpertSchedulingMode: 0 + ExtraLatencyForLR: 0 + ExtraMiLatencyLeft: -1 + ForceDisableShadowInit: false + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 32 + GlobalReadVectorWidthB: 32 + GlobalReadVectorWidthMXSA: 3 + GlobalReadVectorWidthMXSB: 4 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + KRingShift: false + Kernel: true + KernelLanguage: Assembly + KernelNameMin: 
Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT96x256x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT6_4_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_16_1 + LDSTrInst: false + LSCA: 256 + LSCB: 256 + LSCMXSA: 8 + LSCMXSB: 8 + LSPA: 32 + LSPB: 32 + LSPMXSA: 96 + LSPMXSB: 128 + LVCA: 8 + LVCB: 8 + LVCMXSA: 3 + LVCMXSB: 2 + LVPA: 1 + LVPB: 1 + LVPMXSA: 32 + LVPMXSB: 32 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsBlockSizePerPadMXSA: 0 + LdsBlockSizePerPadMXSB: 0 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 113408 + LdsInitCVgprs: false + LdsNumBytes: 113408 + LdsNumElementsAlignedA: 12288 + LdsNumElementsAlignedB: 32768 + LdsNumElementsAlignedMXSA: 768 + LdsNumElementsAlignedMXSB: 2048 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 15104 + LdsOffsetB_Blk: 80640 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMXSA: 12288 + LdsOffsetMXSA_Blk: 77824 + LdsOffsetMXSB: 13056 + LdsOffsetMXSB_Blk: 78592 + LdsOffsetMetadata: 15104 + LdsOffsetMetadata_Blk: 80640 + LdsPadA: 0 + LdsPadB: 0 + LdsPadMXSA: 0 + LdsPadMXSB: 0 + LdsPadMetadata: 0 + LocalReadVectorWidth: -1 + LocalReadVectorWidthA: 32 + LocalReadVectorWidthB: 32 + LocalReadVectorWidthMXSA: 1 + LocalReadVectorWidthMXSB: 1 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LocalWriteUseSgprMXSA: true + LocalWriteUseSgprMXSB: true + LoopIters: 2 + LoopUnroll: 256 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + 
MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMXSA: 1 + MIInputPerThreadMXSB: 1 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [1, 4] + MIWaveTile: [6, 4] + MIWaveTileA: 6 + MIWaveTileB: 4 + MIWaveTileMXSA: 6 + MIWaveTileMXSB: 4 + MIWaveTileMetadata: 0 + MacroTile0: 96 + MacroTile1: 256 + MacroTileA: 96 + MacroTileB: 256 + MacroTileMXSA: 96 + MacroTileMXSB: 256 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + MinGRIncPerMfma: 3 + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonDTLTailLoopMXSA: true + NonDTLTailLoopMXSB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 4 + NonTemporalE: 0 + NonTemporalMXSA: 0 + NonTemporalMXSB: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 96 + NumGlobalWriteVectorsPerThread: 96 + NumLdsBlk: 2 + NumLoadsA: 3 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsCoalescedMXSA: 1 + NumLoadsCoalescedMXSB: 1 + NumLoadsMXSA: 1 + NumLoadsMXSB: 2 + NumLoadsPerpendicularA: 3 + NumLoadsPerpendicularB: 8 + NumLoadsPerpendicularMXSA: 1 + NumLoadsPerpendicularMXSB: 2 + NumThreads: 256 + NumTotalPackedLoadsA: -1 + NumTotalPackedLoadsB: -1 + NumTotalPackedLoadsMXSA: -1 + NumTotalPackedLoadsMXSB: -1 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGROverBarrier: 0 + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 46 + SolutionNameMin: 
Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT96x256x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT6_4_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM16_WGMXCC2_WGMXCCGn1 + SourceSwap: false + SpaceFillingAlgo: [] + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 4 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 4 + SubGroup1: 64 + SubGroupA: 4 + SubGroupB: 64 + SubGroupMXSA: 4 + SubGroupMXSB: 64 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: 0 + TailloopInNll: false + ThreadTile: [1, 1] + ThreadTile0: 24 + ThreadTile1: 4 + ThreadTileA: 24 + ThreadTileB: 4 + ThreadTileMXSA: 24 + ThreadTileMXSB: 4 + TransposeLDS: 1 + TransposeLDSMetadata: -1 + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMXSA: true + UnrollMajorLDSMXSB: true + Use64bShadowLimit: false + Use64bShadowLimitMX: false + UseCustomMainLoopSchedule: 0 + UseDirect32XEmulation: false + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: false + UseGeneralizedNLCOneA: false + UseGeneralizedNLCOneB: false + UseGeneralizedNLCOneMXSA: false + UseGeneralizedNLCOneMXSB: false + UseGeneralizedNLCOneMetadata: false + UseInstOffsetForGRO: 0 + UseMFMAF32XEmulation: false + UsePLRPack: 0 + UseSgprForGRO: -1 + UseSubtileImpl: true + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 1 + 
VectorWidthMXSA: 1 + VectorWidthMXSB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMXSA: 0 + WaveSeparateGlobalReadMXSB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 16 + WorkGroupMappingXCC: 2 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 256 + _DepthUA: 256 + _DepthUB: 256 + _DepthUMXSA: 8 + _DepthUMXSB: 8 + _DepthUMetadata: 256 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + enableLDSTrMXSA: false + enableLDSTrMXSB: false + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + tailLoopOptMXSA: false + tailLoopOptMXSB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AdaptiveGemm: 0 + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1DivByMT1LowbitGT1: 0 + AssertFree1ElementMultiple: 1 + AssertKRingShiftTailWrapOnly: 0 + AssertSummationElementMultiple: 256 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BAddrInterleave: false + BaseName: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT16rkdIx0xJ9AdmxSLTSuPL_HwvXURN3d9JBsT0i2N2r4A= + BufferLoad: true + BufferStore: 1 + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 256 + DirectToLds: 1 + DirectToLdsA: true + DirectToLdsB: true + DirectToLdsMXSA: true + DirectToLdsMXSB: true + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprMXSA: false + DirectToVgprMXSB: false + DirectToVgprSparseMetadata: false + DtlPlusLdsBuf: 0 
+ EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: false + ExpertSchedulingMode: 0 + ExtraLatencyForLR: 0 + ExtraMiLatencyLeft: -1 + ForceDisableShadowInit: false + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 32 + GlobalReadVectorWidthB: 32 + GlobalReadVectorWidthMXSA: 5 + GlobalReadVectorWidthMXSB: 4 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + KRingShift: false + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT160x256x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT10_4_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_16_1 + LDSTrInst: false + LSCA: 256 + LSCB: 256 + LSCMXSA: 8 + LSCMXSB: 8 + LSPA: 32 + LSPB: 32 + LSPMXSA: 160 + LSPMXSB: 128 + LVCA: 8 + LVCB: 8 + LVCMXSA: 2 + LVCMXSB: 2 + LVPA: 1 + LVPB: 1 + LVPMXSA: 32 + LVPMXSB: 32 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsBlockSizePerPadMXSA: 0 + LdsBlockSizePerPadMXSB: 0 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 122112 + LdsInitCVgprs: false + LdsNumBytes: 122112 + LdsNumElementsAlignedA: 20480 
+ LdsNumElementsAlignedB: 32768 + LdsNumElementsAlignedMXSA: 1280 + LdsNumElementsAlignedMXSB: 2048 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 23808 + LdsOffsetB_Blk: 89344 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMXSA: 20480 + LdsOffsetMXSA_Blk: 86016 + LdsOffsetMXSB: 21760 + LdsOffsetMXSB_Blk: 87296 + LdsOffsetMetadata: 23808 + LdsOffsetMetadata_Blk: 89344 + LdsPadA: 0 + LdsPadB: 0 + LdsPadMXSA: 0 + LdsPadMXSB: 0 + LdsPadMetadata: 0 + LocalReadVectorWidth: -1 + LocalReadVectorWidthA: 32 + LocalReadVectorWidthB: 32 + LocalReadVectorWidthMXSA: 1 + LocalReadVectorWidthMXSB: 1 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LocalWriteUseSgprMXSA: true + LocalWriteUseSgprMXSB: true + LoopIters: 2 + LoopUnroll: 256 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMXSA: 1 + MIInputPerThreadMXSB: 1 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [1, 4] + MIWaveTile: [10, 4] + MIWaveTileA: 10 + MIWaveTileB: 4 + MIWaveTileMXSA: 10 + MIWaveTileMXSB: 4 + MIWaveTileMetadata: 0 + MacroTile0: 160 + MacroTile1: 256 + MacroTileA: 160 + MacroTileB: 256 + MacroTileMXSA: 160 + MacroTileMXSB: 256 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + MinGRIncPerMfma: 3 + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonDTLTailLoopMXSA: true + NonDTLTailLoopMXSB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 4 + NonTemporalE: 0 + NonTemporalMXSA: 0 + 
NonTemporalMXSB: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 160 + NumGlobalWriteVectorsPerThread: 160 + NumLdsBlk: 2 + NumLoadsA: 5 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsCoalescedMXSA: 1 + NumLoadsCoalescedMXSB: 1 + NumLoadsMXSA: 1 + NumLoadsMXSB: 2 + NumLoadsPerpendicularA: 5 + NumLoadsPerpendicularB: 8 + NumLoadsPerpendicularMXSA: 1 + NumLoadsPerpendicularMXSB: 2 + NumThreads: 256 + NumTotalPackedLoadsA: -1 + NumTotalPackedLoadsB: -1 + NumTotalPackedLoadsMXSA: -1 + NumTotalPackedLoadsMXSB: -1 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGROverBarrier: 0 + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 47 + SolutionNameMin: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT160x256x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT10_4_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM16_WGMXCC2_WGMXCCGn1 + SourceSwap: false + SpaceFillingAlgo: [] + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 4 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 4 + SubGroup1: 64 + SubGroupA: 4 + SubGroupB: 64 + SubGroupMXSA: 4 + SubGroupMXSB: 
64 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: 0 + TailloopInNll: false + ThreadTile: [1, 1] + ThreadTile0: 40 + ThreadTile1: 4 + ThreadTileA: 40 + ThreadTileB: 4 + ThreadTileMXSA: 40 + ThreadTileMXSB: 4 + TransposeLDS: 1 + TransposeLDSMetadata: -1 + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMXSA: true + UnrollMajorLDSMXSB: true + Use64bShadowLimit: false + Use64bShadowLimitMX: false + UseCustomMainLoopSchedule: 0 + UseDirect32XEmulation: false + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: false + UseGeneralizedNLCOneA: false + UseGeneralizedNLCOneB: false + UseGeneralizedNLCOneMXSA: false + UseGeneralizedNLCOneMXSB: false + UseGeneralizedNLCOneMetadata: false + UseInstOffsetForGRO: 0 + UseMFMAF32XEmulation: false + UsePLRPack: 0 + UseSgprForGRO: -1 + UseSubtileImpl: true + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 1 + VectorWidthMXSA: 1 + VectorWidthMXSB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMXSA: 0 + WaveSeparateGlobalReadMXSB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 16 + WorkGroupMappingXCC: 2 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 256 + _DepthUA: 256 + _DepthUB: 256 + _DepthUMXSA: 8 + _DepthUMXSB: 8 + _DepthUMetadata: 256 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + enableLDSTrMXSA: false + enableLDSTrMXSB: false + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + tailLoopOptMXSA: false + tailLoopOptMXSB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: 
false + ActivationFused: true + AdaptiveGemm: 0 + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1DivByMT1LowbitGT1: 0 + AssertFree1ElementMultiple: 1 + AssertKRingShiftTailWrapOnly: 0 + AssertSummationElementMultiple: 256 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BAddrInterleave: false + BaseName: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT28HX9AX04H5gx2Hnt9CFkwE4Io5BNr8Him7sxPkQ6FxSI= + BufferLoad: true + BufferStore: 1 + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 256 + DirectToLds: 1 + DirectToLdsA: true + DirectToLdsB: true + DirectToLdsMXSA: true + DirectToLdsMXSB: true + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprMXSA: false + DirectToVgprMXSB: false + DirectToVgprSparseMetadata: false + DtlPlusLdsBuf: 0 + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: false + ExpertSchedulingMode: 0 + ExtraLatencyForLR: 0 + ExtraMiLatencyLeft: -1 + ForceDisableShadowInit: false + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 32 + GlobalReadVectorWidthB: 32 + GlobalReadVectorWidthMXSA: 9 + GlobalReadVectorWidthMXSB: 4 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + KRingShift: false + Kernel: true + KernelLanguage: Assembly + KernelNameMin: 
Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT288x256x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT18_4_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_16_1 + LDSTrInst: false + LSCA: 256 + LSCB: 256 + LSCMXSA: 8 + LSCMXSB: 8 + LSPA: 32 + LSPB: 32 + LSPMXSA: 288 + LSPMXSB: 128 + LVCA: 8 + LVCB: 8 + LVCMXSA: 1 + LVCMXSB: 2 + LVPA: 1 + LVPB: 1 + LVPMXSA: 32 + LVPMXSB: 32 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsBlockSizePerPadMXSA: 0 + LdsBlockSizePerPadMXSB: 0 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 147968 + LdsInitCVgprs: false + LdsNumBytes: 147968 + LdsNumElementsAlignedA: 36864 + LdsNumElementsAlignedB: 32768 + LdsNumElementsAlignedMXSA: 2304 + LdsNumElementsAlignedMXSB: 2048 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 73984 + LdsOffsetB: 41216 + LdsOffsetB_Blk: 115200 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMXSA: 36864 + LdsOffsetMXSA_Blk: 110848 + LdsOffsetMXSB: 39168 + LdsOffsetMXSB_Blk: 113152 + LdsOffsetMetadata: 41216 + LdsOffsetMetadata_Blk: 115200 + LdsPadA: 0 + LdsPadB: 0 + LdsPadMXSA: 0 + LdsPadMXSB: 0 + LdsPadMetadata: 0 + LocalReadVectorWidth: -1 + LocalReadVectorWidthA: 32 + LocalReadVectorWidthB: 32 + LocalReadVectorWidthMXSA: 1 + LocalReadVectorWidthMXSB: 1 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LocalWriteUseSgprMXSA: true + LocalWriteUseSgprMXSB: true + LoopIters: 2 + LoopUnroll: 256 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + 
MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMXSA: 1 + MIInputPerThreadMXSB: 1 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [1, 4] + MIWaveTile: [18, 4] + MIWaveTileA: 18 + MIWaveTileB: 4 + MIWaveTileMXSA: 18 + MIWaveTileMXSB: 4 + MIWaveTileMetadata: 0 + MacroTile0: 288 + MacroTile1: 256 + MacroTileA: 288 + MacroTileB: 256 + MacroTileMXSA: 288 + MacroTileMXSB: 256 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + MinGRIncPerMfma: 3 + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonDTLTailLoopMXSA: true + NonDTLTailLoopMXSB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 4 + NonTemporalE: 0 + NonTemporalMXSA: 0 + NonTemporalMXSB: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 288 + NumGlobalWriteVectorsPerThread: 288 + NumLdsBlk: 2 + NumLoadsA: 9 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsCoalescedMXSA: 1 + NumLoadsCoalescedMXSB: 1 + NumLoadsMXSA: 1 + NumLoadsMXSB: 2 + NumLoadsPerpendicularA: 9 + NumLoadsPerpendicularB: 8 + NumLoadsPerpendicularMXSA: 1 + NumLoadsPerpendicularMXSB: 2 + NumThreads: 256 + NumTotalPackedLoadsA: -1 + NumTotalPackedLoadsB: -1 + NumTotalPackedLoadsMXSA: -1 + NumTotalPackedLoadsMXSB: -1 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGROverBarrier: 0 + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 48 + 
SolutionNameMin: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT288x256x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT18_4_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM16_WGMXCC2_WGMXCCGn1 + SourceSwap: false + SpaceFillingAlgo: [] + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: true + StoreSyncOpt: 0 + StoreVectorWidth: 4 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 4 + SubGroup1: 64 + SubGroupA: 4 + SubGroupB: 64 + SubGroupMXSA: 4 + SubGroupMXSB: 64 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: 0 + TailloopInNll: false + ThreadTile: [1, 1] + ThreadTile0: 72 + ThreadTile1: 4 + ThreadTileA: 72 + ThreadTileB: 4 + ThreadTileMXSA: 72 + ThreadTileMXSB: 4 + TransposeLDS: 1 + TransposeLDSMetadata: -1 + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMXSA: true + UnrollMajorLDSMXSB: true + Use64bShadowLimit: false + Use64bShadowLimitMX: false + UseCustomMainLoopSchedule: 0 + UseDirect32XEmulation: false + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: false + UseGeneralizedNLCOneA: false + UseGeneralizedNLCOneB: false + UseGeneralizedNLCOneMXSA: false + UseGeneralizedNLCOneMXSB: false + UseGeneralizedNLCOneMetadata: false + UseInstOffsetForGRO: 0 + UseMFMAF32XEmulation: false + UsePLRPack: 0 + UseSgprForGRO: -1 + UseSubtileImpl: true + Valid: true + VectorStore: -1 + VectorWidthA: 1 + 
VectorWidthB: 1 + VectorWidthMXSA: 1 + VectorWidthMXSB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMXSA: 0 + WaveSeparateGlobalReadMXSB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 16 + WorkGroupMappingXCC: 2 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 256 + _DepthUA: 256 + _DepthUB: 256 + _DepthUMXSA: 8 + _DepthUMXSB: 8 + _DepthUMetadata: 256 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + enableLDSTrMXSA: false + enableLDSTrMXSB: false + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + tailLoopOptMXSA: false + tailLoopOptMXSB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AdaptiveGemm: 0 + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1DivByMT1LowbitGT1: 0 + AssertFree1ElementMultiple: 1 + AssertKRingShiftTailWrapOnly: 0 + AssertSummationElementMultiple: 256 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BAddrInterleave: false + BaseName: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT323cjBEjqTj-Mu9ZF1RsqJ73WqkJOeA3kpk9e2rgOyJEQ= + BufferLoad: true + BufferStore: 1 + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 256 + DirectToLds: 1 + DirectToLdsA: true + DirectToLdsB: true + DirectToLdsMXSA: true + DirectToLdsMXSB: true + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprMXSA: false + DirectToVgprMXSB: false + DirectToVgprSparseMetadata: false 
+ DtlPlusLdsBuf: 0 + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: false + ExpertSchedulingMode: 0 + ExtraLatencyForLR: 0 + ExtraMiLatencyLeft: -1 + ForceDisableShadowInit: false + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 32 + GlobalReadVectorWidthB: 32 + GlobalReadVectorWidthMXSA: 1 + GlobalReadVectorWidthMXSB: 12 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + KRingShift: false + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT32x384x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT2_6_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_16_1 + LDSTrInst: false + LSCA: 256 + LSCB: 256 + LSCMXSA: 8 + LSCMXSB: 8 + LSPA: 32 + LSPB: 32 + LSPMXSA: 32 + LSPMXSB: 384 + LVCA: 8 + LVCB: 8 + LVCMXSA: 8 + LVCMXSB: 1 + LVPA: 1 + LVPB: 1 + LVPMXSA: 32 + LVPMXSB: 32 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsBlockSizePerPadMXSA: 0 + LdsBlockSizePerPadMXSB: 0 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 122112 + LdsInitCVgprs: false + LdsNumBytes: 122112 + 
LdsNumElementsAlignedA: 4096 + LdsNumElementsAlignedB: 49152 + LdsNumElementsAlignedMXSA: 256 + LdsNumElementsAlignedMXSB: 3072 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 7424 + LdsOffsetB_Blk: 72960 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMXSA: 4096 + LdsOffsetMXSA_Blk: 69632 + LdsOffsetMXSB: 4352 + LdsOffsetMXSB_Blk: 69888 + LdsOffsetMetadata: 7424 + LdsOffsetMetadata_Blk: 72960 + LdsPadA: 0 + LdsPadB: 0 + LdsPadMXSA: 0 + LdsPadMXSB: 0 + LdsPadMetadata: 0 + LocalReadVectorWidth: -1 + LocalReadVectorWidthA: 32 + LocalReadVectorWidthB: 32 + LocalReadVectorWidthMXSA: 1 + LocalReadVectorWidthMXSB: 1 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LocalWriteUseSgprMXSA: true + LocalWriteUseSgprMXSB: true + LoopIters: 2 + LoopUnroll: 256 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMXSA: 1 + MIInputPerThreadMXSB: 1 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [1, 4] + MIWaveTile: [2, 6] + MIWaveTileA: 2 + MIWaveTileB: 6 + MIWaveTileMXSA: 2 + MIWaveTileMXSB: 6 + MIWaveTileMetadata: 0 + MacroTile0: 32 + MacroTile1: 384 + MacroTileA: 32 + MacroTileB: 384 + MacroTileMXSA: 32 + MacroTileMXSB: 384 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + MinGRIncPerMfma: 3 + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonDTLTailLoopMXSA: true + NonDTLTailLoopMXSB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 4 + NonTemporalE: 0 + 
NonTemporalMXSA: 0 + NonTemporalMXSB: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 48 + NumGlobalWriteVectorsPerThread: 48 + NumLdsBlk: 2 + NumLoadsA: 1 + NumLoadsB: 12 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsCoalescedMXSA: 1 + NumLoadsCoalescedMXSB: 1 + NumLoadsMXSA: 1 + NumLoadsMXSB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 12 + NumLoadsPerpendicularMXSA: 1 + NumLoadsPerpendicularMXSB: 1 + NumThreads: 256 + NumTotalPackedLoadsA: -1 + NumTotalPackedLoadsB: -1 + NumTotalPackedLoadsMXSA: -1 + NumTotalPackedLoadsMXSB: -1 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGROverBarrier: 0 + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 49 + SolutionNameMin: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT32x384x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT2_6_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM16_WGMXCC2_WGMXCCGn1 + SourceSwap: false + SpaceFillingAlgo: [] + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 4 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 4 + SubGroup1: 64 + SubGroupA: 4 + SubGroupB: 64 + SubGroupMXSA: 
4 + SubGroupMXSB: 64 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: 0 + TailloopInNll: false + ThreadTile: [1, 1] + ThreadTile0: 8 + ThreadTile1: 6 + ThreadTileA: 8 + ThreadTileB: 6 + ThreadTileMXSA: 8 + ThreadTileMXSB: 6 + TransposeLDS: 1 + TransposeLDSMetadata: -1 + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMXSA: true + UnrollMajorLDSMXSB: true + Use64bShadowLimit: false + Use64bShadowLimitMX: false + UseCustomMainLoopSchedule: 0 + UseDirect32XEmulation: false + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: false + UseGeneralizedNLCOneA: false + UseGeneralizedNLCOneB: false + UseGeneralizedNLCOneMXSA: false + UseGeneralizedNLCOneMXSB: false + UseGeneralizedNLCOneMetadata: false + UseInstOffsetForGRO: 0 + UseMFMAF32XEmulation: false + UsePLRPack: 0 + UseSgprForGRO: -1 + UseSubtileImpl: true + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 1 + VectorWidthMXSA: 1 + VectorWidthMXSB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMXSA: 0 + WaveSeparateGlobalReadMXSB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 16 + WorkGroupMappingXCC: 2 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 256 + _DepthUA: 256 + _DepthUB: 256 + _DepthUMXSA: 8 + _DepthUMXSB: 8 + _DepthUMetadata: 256 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + enableLDSTrMXSA: false + enableLDSTrMXSB: false + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + tailLoopOptMXSA: false + tailLoopOptMXSB: false + - 1LDSBuffer: 0 + ActivationAlt: false + 
ActivationFuncCall: false + ActivationFused: true + AdaptiveGemm: 0 + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1DivByMT1LowbitGT1: 0 + AssertFree1ElementMultiple: 1 + AssertKRingShiftTailWrapOnly: 0 + AssertSummationElementMultiple: 256 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BAddrInterleave: false + BaseName: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT32k1L8EqrYflL4kvotLP49gssYPXLs5aDdnaskyCeVRw4= + BufferLoad: true + BufferStore: 1 + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 256 + DirectToLds: 1 + DirectToLdsA: true + DirectToLdsB: true + DirectToLdsMXSA: true + DirectToLdsMXSB: true + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprMXSA: false + DirectToVgprMXSB: false + DirectToVgprSparseMetadata: false + DtlPlusLdsBuf: 0 + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: false + ExpertSchedulingMode: 0 + ExtraLatencyForLR: 0 + ExtraMiLatencyLeft: -1 + ForceDisableShadowInit: false + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 32 + GlobalReadVectorWidthB: 32 + GlobalReadVectorWidthMXSA: 1 + GlobalReadVectorWidthMXSB: 16 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + KRingShift: false + Kernel: true + KernelLanguage: Assembly + KernelNameMin: 
Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT32x512x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT2_8_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_16_1 + LDSTrInst: false + LSCA: 256 + LSCB: 256 + LSCMXSA: 8 + LSCMXSB: 8 + LSPA: 32 + LSPB: 32 + LSPMXSA: 32 + LSPMXSB: 512 + LVCA: 8 + LVCB: 8 + LVCMXSA: 8 + LVCMXSB: 1 + LVPA: 1 + LVPB: 1 + LVPMXSA: 32 + LVPMXSB: 32 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsBlockSizePerPadMXSA: 0 + LdsBlockSizePerPadMXSB: 0 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 147968 + LdsInitCVgprs: false + LdsNumBytes: 147968 + LdsNumElementsAlignedA: 4096 + LdsNumElementsAlignedB: 65536 + LdsNumElementsAlignedMXSA: 256 + LdsNumElementsAlignedMXSB: 4096 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 73984 + LdsOffsetB: 8448 + LdsOffsetB_Blk: 82432 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMXSA: 4096 + LdsOffsetMXSA_Blk: 78080 + LdsOffsetMXSB: 4352 + LdsOffsetMXSB_Blk: 78336 + LdsOffsetMetadata: 8448 + LdsOffsetMetadata_Blk: 82432 + LdsPadA: 0 + LdsPadB: 0 + LdsPadMXSA: 0 + LdsPadMXSB: 0 + LdsPadMetadata: 0 + LocalReadVectorWidth: -1 + LocalReadVectorWidthA: 32 + LocalReadVectorWidthB: 32 + LocalReadVectorWidthMXSA: 1 + LocalReadVectorWidthMXSB: 1 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LocalWriteUseSgprMXSA: true + LocalWriteUseSgprMXSB: true + LoopIters: 2 + LoopUnroll: 256 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + 
MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMXSA: 1 + MIInputPerThreadMXSB: 1 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [1, 4] + MIWaveTile: [2, 8] + MIWaveTileA: 2 + MIWaveTileB: 8 + MIWaveTileMXSA: 2 + MIWaveTileMXSB: 8 + MIWaveTileMetadata: 0 + MacroTile0: 32 + MacroTile1: 512 + MacroTileA: 32 + MacroTileB: 512 + MacroTileMXSA: 32 + MacroTileMXSB: 512 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + MinGRIncPerMfma: 3 + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonDTLTailLoopMXSA: true + NonDTLTailLoopMXSB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 4 + NonTemporalE: 0 + NonTemporalMXSA: 0 + NonTemporalMXSB: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 64 + NumLdsBlk: 2 + NumLoadsA: 1 + NumLoadsB: 16 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsCoalescedMXSA: 1 + NumLoadsCoalescedMXSB: 1 + NumLoadsMXSA: 1 + NumLoadsMXSB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 16 + NumLoadsPerpendicularMXSA: 1 + NumLoadsPerpendicularMXSB: 1 + NumThreads: 256 + NumTotalPackedLoadsA: -1 + NumTotalPackedLoadsB: -1 + NumTotalPackedLoadsMXSA: -1 + NumTotalPackedLoadsMXSB: -1 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGROverBarrier: 0 + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 50 + SolutionNameMin: 
Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT32x512x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT2_8_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM16_WGMXCC2_WGMXCCGn1 + SourceSwap: false + SpaceFillingAlgo: [] + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: true + StoreSyncOpt: 0 + StoreVectorWidth: 4 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 4 + SubGroup1: 64 + SubGroupA: 4 + SubGroupB: 64 + SubGroupMXSA: 4 + SubGroupMXSB: 64 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: 0 + TailloopInNll: false + ThreadTile: [1, 1] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + ThreadTileMXSA: 8 + ThreadTileMXSB: 8 + TransposeLDS: 1 + TransposeLDSMetadata: -1 + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMXSA: true + UnrollMajorLDSMXSB: true + Use64bShadowLimit: false + Use64bShadowLimitMX: false + UseCustomMainLoopSchedule: 0 + UseDirect32XEmulation: false + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: false + UseGeneralizedNLCOneA: false + UseGeneralizedNLCOneB: false + UseGeneralizedNLCOneMXSA: false + UseGeneralizedNLCOneMXSB: false + UseGeneralizedNLCOneMetadata: false + UseInstOffsetForGRO: 0 + UseMFMAF32XEmulation: false + UsePLRPack: 0 + UseSgprForGRO: -1 + UseSubtileImpl: true + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 1 + 
VectorWidthMXSA: 1 + VectorWidthMXSB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMXSA: 0 + WaveSeparateGlobalReadMXSB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 16 + WorkGroupMappingXCC: 2 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 256 + _DepthUA: 256 + _DepthUB: 256 + _DepthUMXSA: 8 + _DepthUMXSB: 8 + _DepthUMetadata: 256 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + enableLDSTrMXSA: false + enableLDSTrMXSB: false + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + tailLoopOptMXSA: false + tailLoopOptMXSB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AdaptiveGemm: 0 + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1DivByMT1LowbitGT1: 0 + AssertFree1ElementMultiple: 1 + AssertKRingShiftTailWrapOnly: 0 + AssertSummationElementMultiple: 256 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BAddrInterleave: false + BaseName: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT9680-vsketBKOwATFxdutsK5i_NcnvLOonACpwOS25BUY= + BufferLoad: true + BufferStore: 1 + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 256 + DirectToLds: 1 + DirectToLdsA: true + DirectToLdsB: true + DirectToLdsMXSA: true + DirectToLdsMXSB: true + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprMXSA: false + DirectToVgprMXSB: false + DirectToVgprSparseMetadata: false + DtlPlusLdsBuf: 0 
+ EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: false + ExpertSchedulingMode: 0 + ExtraLatencyForLR: 0 + ExtraMiLatencyLeft: -1 + ForceDisableShadowInit: false + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 32 + GlobalReadVectorWidthB: 32 + GlobalReadVectorWidthMXSA: 3 + GlobalReadVectorWidthMXSB: 12 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + KRingShift: false + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT96x384x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT6_6_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_16_1 + LDSTrInst: false + LSCA: 256 + LSCB: 256 + LSCMXSA: 8 + LSCMXSB: 8 + LSPA: 32 + LSPB: 32 + LSPMXSA: 96 + LSPMXSB: 384 + LVCA: 8 + LVCB: 8 + LVCMXSA: 3 + LVCMXSB: 1 + LVPA: 1 + LVPB: 1 + LVPMXSA: 32 + LVPMXSB: 32 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsBlockSizePerPadMXSA: 0 + LdsBlockSizePerPadMXSB: 0 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 130816 + LdsInitCVgprs: false + LdsNumBytes: 130816 + LdsNumElementsAlignedA: 12288 + 
LdsNumElementsAlignedB: 49152 + LdsNumElementsAlignedMXSA: 768 + LdsNumElementsAlignedMXSB: 3072 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 16128 + LdsOffsetB_Blk: 81664 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMXSA: 12288 + LdsOffsetMXSA_Blk: 77824 + LdsOffsetMXSB: 13056 + LdsOffsetMXSB_Blk: 78592 + LdsOffsetMetadata: 16128 + LdsOffsetMetadata_Blk: 81664 + LdsPadA: 0 + LdsPadB: 0 + LdsPadMXSA: 0 + LdsPadMXSB: 0 + LdsPadMetadata: 0 + LocalReadVectorWidth: -1 + LocalReadVectorWidthA: 32 + LocalReadVectorWidthB: 32 + LocalReadVectorWidthMXSA: 1 + LocalReadVectorWidthMXSB: 1 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LocalWriteUseSgprMXSA: true + LocalWriteUseSgprMXSB: true + LoopIters: 2 + LoopUnroll: 256 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMXSA: 1 + MIInputPerThreadMXSB: 1 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [1, 4] + MIWaveTile: [6, 6] + MIWaveTileA: 6 + MIWaveTileB: 6 + MIWaveTileMXSA: 6 + MIWaveTileMXSB: 6 + MIWaveTileMetadata: 0 + MacroTile0: 96 + MacroTile1: 384 + MacroTileA: 96 + MacroTileB: 384 + MacroTileMXSA: 96 + MacroTileMXSB: 384 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + MinGRIncPerMfma: 3 + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonDTLTailLoopMXSA: true + NonDTLTailLoopMXSB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 4 + NonTemporalE: 0 + NonTemporalMXSA: 0 + 
NonTemporalMXSB: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 144 + NumGlobalWriteVectorsPerThread: 144 + NumLdsBlk: 2 + NumLoadsA: 3 + NumLoadsB: 12 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsCoalescedMXSA: 1 + NumLoadsCoalescedMXSB: 1 + NumLoadsMXSA: 1 + NumLoadsMXSB: 1 + NumLoadsPerpendicularA: 3 + NumLoadsPerpendicularB: 12 + NumLoadsPerpendicularMXSA: 1 + NumLoadsPerpendicularMXSB: 1 + NumThreads: 256 + NumTotalPackedLoadsA: -1 + NumTotalPackedLoadsB: -1 + NumTotalPackedLoadsMXSA: -1 + NumTotalPackedLoadsMXSB: -1 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGROverBarrier: 0 + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 51 + SolutionNameMin: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT96x384x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT6_6_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM16_WGMXCC2_WGMXCCGn1 + SourceSwap: false + SpaceFillingAlgo: [] + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 4 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 4 + SubGroup1: 64 + SubGroupA: 4 + SubGroupB: 64 + SubGroupMXSA: 4 + SubGroupMXSB: 
64 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: 0 + TailloopInNll: false + ThreadTile: [1, 1] + ThreadTile0: 24 + ThreadTile1: 6 + ThreadTileA: 24 + ThreadTileB: 6 + ThreadTileMXSA: 24 + ThreadTileMXSB: 6 + TransposeLDS: 1 + TransposeLDSMetadata: -1 + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMXSA: true + UnrollMajorLDSMXSB: true + Use64bShadowLimit: false + Use64bShadowLimitMX: false + UseCustomMainLoopSchedule: 0 + UseDirect32XEmulation: false + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: false + UseGeneralizedNLCOneA: false + UseGeneralizedNLCOneB: false + UseGeneralizedNLCOneMXSA: false + UseGeneralizedNLCOneMXSB: false + UseGeneralizedNLCOneMetadata: false + UseInstOffsetForGRO: 0 + UseMFMAF32XEmulation: false + UsePLRPack: 0 + UseSgprForGRO: -1 + UseSubtileImpl: true + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 1 + VectorWidthMXSA: 1 + VectorWidthMXSB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMXSA: 0 + WaveSeparateGlobalReadMXSB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 16 + WorkGroupMappingXCC: 2 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 256 + _DepthUA: 256 + _DepthUB: 256 + _DepthUMXSA: 8 + _DepthUMXSB: 8 + _DepthUMetadata: 256 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + enableLDSTrMXSA: false + enableLDSTrMXSB: false + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + tailLoopOptMXSA: false + tailLoopOptMXSB: false +- null +- null +- null +- null +- DeviceEfficiency +- 
Prediction diff --git a/projects/hipblaslt/library/src/amd_detail/rocblaslt/src/Tensile/Logic/asm_full/gfx950/gfx950/Origami/gfx950_Cijk_Alik_Bljk_F4SS_MXA32_MXB32_BH_BiasS_HAS_SAB_SAV_UserArgs.yaml b/projects/hipblaslt/library/src/amd_detail/rocblaslt/src/Tensile/Logic/asm_full/gfx950/gfx950/Origami/gfx950_Cijk_Alik_Bljk_F4SS_MXA32_MXB32_BH_BiasS_HAS_SAB_SAV_UserArgs.yaml new file mode 100644 index 00000000000..3fe998e560c --- /dev/null +++ b/projects/hipblaslt/library/src/amd_detail/rocblaslt/src/Tensile/Logic/asm_full/gfx950/gfx950/Origami/gfx950_Cijk_Alik_Bljk_F4SS_MXA32_MXB32_BH_BiasS_HAS_SAB_SAV_UserArgs.yaml @@ -0,0 +1,16950 @@ +- {MinimumRequiredVersion: 5.0.0} +- gfx950 +- gfx950 +- [Device 75a0] +- Activation: true + ActivationComputeDataType: 0 + ActivationNoGuard: false + ActivationType: hipblaslt_all + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + BetaOnlyUseBias: false + BiasDataTypeList: [0] + BiasSrc: D + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 21 + DataTypeA: 21 + DataTypeAmaxD: 0 + DataTypeB: 21 + DataTypeE: 0 + DataTypeMXSA: 22 + DataTypeMXSB: 22 + DestDataType: 0 + F32XdlMathOp: 0 + Gradient: false + GroupedGemm: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index01MXSA: 0 + Index01MXSB: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexAssignmentsMXSA: [3, 0, 2] + IndexAssignmentsMXSB: [3, 1, 2] + IndexAssignmentsMetadata: [3, 0, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndexUnrollM: 0 + IndexUnrollMXSA: 0 + IndexUnrollMXSB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MXBlockA: 32 + MXBlockB: 32 + MacDataTypeA: 21 + MacDataTypeB: 21 + MirrorDimsA: [] + MirrorDimsB: [] + MirrorDimsMXSA: [] + MirrorDimsMXSB: [] + MirrorDimsMetadata: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + 
NumIndicesSummation: 1 + OperationType: GEMM + OutputAmaxD: false + SetConstStrideA: [] + SetConstStrideB: [] + SetConstStrideBias: [] + SilentHighPrecisionAccumulate: false + Sparse: 0 + StochasticRounding: false + StridedBatched: true + SupportUserArgs: true + SwizzleTensorA: false + SwizzleTensorB: false + TLUA: false + TLUB: false + TLUMXSA: false + TLUMXSB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: 1 + TransposeB: 0 + UseBeta: true + UseBias: 1 + UseE: false + UseInitialStridesAB: false + UseInitialStridesCD: false + UseScaleAB: Scalar + UseScaleAlphaVec: 1 + UseScaleCD: false +- - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AdaptiveGemm: 0 + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1DivByMT1LowbitGT1: 0 + AssertFree1ElementMultiple: 1 + AssertKRingShiftTailWrapOnly: 0 + AssertSummationElementMultiple: 256 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BAddrInterleave: false + BaseName: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT64hIsnhFT2L5uw_T8ryWf3mri6PtKpwJ1E97MC3rNtxpw= + BufferLoad: true + BufferStore: 1 + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 256 + DirectToLds: 1 + DirectToLdsA: true + DirectToLdsB: true + DirectToLdsMXSA: true + DirectToLdsMXSB: true + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprMXSA: false + DirectToVgprMXSB: false + DirectToVgprSparseMetadata: false + DtlPlusLdsBuf: 0 + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: false + ExpertSchedulingMode: 0 + ExtraLatencyForLR: 0 + ExtraMiLatencyLeft: -1 + ForceDisableShadowInit: false + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 32 + 
GlobalReadVectorWidthB: 32 + GlobalReadVectorWidthMXSA: 2 + GlobalReadVectorWidthMXSB: 2 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + KRingShift: false + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT64x64x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT2_2_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 256 + LSCB: 256 + LSCMXSA: 8 + LSCMXSB: 8 + LSPA: 32 + LSPB: 32 + LSPMXSA: 64 + LSPMXSB: 64 + LVCA: 8 + LVCB: 8 + LVCMXSA: 4 + LVCMXSB: 4 + LVPA: 1 + LVPB: 1 + LVPMXSA: 32 + LVPMXSB: 32 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsBlockSizePerPadMXSA: 0 + LdsBlockSizePerPadMXSB: 0 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 50176 + LdsInitCVgprs: false + LdsNumBytes: 50176 + LdsNumElementsAlignedA: 8192 + LdsNumElementsAlignedB: 8192 + LdsNumElementsAlignedMXSA: 512 + LdsNumElementsAlignedMXSB: 512 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 32768 + LdsOffsetB: 9216 + LdsOffsetB_Blk: 41984 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMXSA: 8192 + 
LdsOffsetMXSA_Blk: 40960 + LdsOffsetMXSB: 8704 + LdsOffsetMXSB_Blk: 41472 + LdsOffsetMetadata: 9216 + LdsOffsetMetadata_Blk: 41984 + LdsPadA: 0 + LdsPadB: 0 + LdsPadMXSA: 0 + LdsPadMXSB: 0 + LdsPadMetadata: 0 + LocalReadVectorWidth: -1 + LocalReadVectorWidthA: 32 + LocalReadVectorWidthB: 32 + LocalReadVectorWidthMXSA: 1 + LocalReadVectorWidthMXSB: 1 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LocalWriteUseSgprMXSA: true + LocalWriteUseSgprMXSB: true + LoopIters: 2 + LoopUnroll: 256 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMXSA: 1 + MIInputPerThreadMXSB: 1 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [2, 2] + MIWaveTileA: 2 + MIWaveTileB: 2 + MIWaveTileMXSA: 2 + MIWaveTileMXSB: 2 + MIWaveTileMetadata: 0 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileMXSA: 64 + MacroTileMXSB: 64 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + MinGRIncPerMfma: 3 + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonDTLTailLoopMXSA: true + NonDTLTailLoopMXSB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 4 + NonTemporalE: 0 + NonTemporalMXSA: 0 + NonTemporalMXSB: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 16 + NumLdsBlk: 2 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsCoalescedMXSA: 1 + NumLoadsCoalescedMXSB: 1 + NumLoadsMXSA: 1 + 
NumLoadsMXSB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularMXSA: 1 + NumLoadsPerpendicularMXSB: 1 + NumThreads: 256 + NumTotalPackedLoadsA: -1 + NumTotalPackedLoadsB: -1 + NumTotalPackedLoadsMXSA: -1 + NumTotalPackedLoadsMXSB: -1 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGROverBarrier: 0 + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 0 + SolutionNameMin: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT64x64x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT2_2_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM16_WGMXCC2_WGMXCCGn1 + SourceSwap: false + SpaceFillingAlgo: [] + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 4 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SubGroupMXSA: 8 + SubGroupMXSB: 32 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: 0 + TailloopInNll: false + ThreadTile: [1, 1] + ThreadTile0: 8 + ThreadTile1: 2 + ThreadTileA: 8 + ThreadTileB: 2 + ThreadTileMXSA: 8 + ThreadTileMXSB: 2 + TransposeLDS: 1 + TransposeLDSMetadata: -1 + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: 
true + UnrollMajorLDSB: true + UnrollMajorLDSMXSA: true + UnrollMajorLDSMXSB: true + Use64bShadowLimit: false + Use64bShadowLimitMX: false + UseCustomMainLoopSchedule: 0 + UseDirect32XEmulation: false + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: false + UseGeneralizedNLCOneA: false + UseGeneralizedNLCOneB: false + UseGeneralizedNLCOneMXSA: false + UseGeneralizedNLCOneMXSB: false + UseGeneralizedNLCOneMetadata: false + UseInstOffsetForGRO: 0 + UseMFMAF32XEmulation: false + UsePLRPack: 0 + UseSgprForGRO: -1 + UseSubtileImpl: true + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 1 + VectorWidthMXSA: 1 + VectorWidthMXSB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMXSA: 0 + WaveSeparateGlobalReadMXSB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 16 + WorkGroupMappingXCC: 2 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 256 + _DepthUA: 256 + _DepthUB: 256 + _DepthUMXSA: 8 + _DepthUMXSB: 8 + _DepthUMetadata: 256 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + enableLDSTrMXSA: false + enableLDSTrMXSB: false + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + tailLoopOptMXSA: false + tailLoopOptMXSB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AdaptiveGemm: 0 + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1DivByMT1LowbitGT1: 0 + AssertFree1ElementMultiple: 1 + AssertKRingShiftTailWrapOnly: 0 + AssertSummationElementMultiple: 256 + AssignedDerivedParameters: true + 
AssignedProblemIndependentDerivedParameters: true + BAddrInterleave: false + BaseName: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT128AMjoHHGnVSJ1fjvuIJeKPVtejro7lpRL6TixtWnDac= + BufferLoad: true + BufferStore: 1 + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 256 + DirectToLds: 1 + DirectToLdsA: true + DirectToLdsB: true + DirectToLdsMXSA: true + DirectToLdsMXSB: true + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprMXSA: false + DirectToVgprMXSB: false + DirectToVgprSparseMetadata: false + DtlPlusLdsBuf: 0 + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: false + ExpertSchedulingMode: 0 + ExtraLatencyForLR: 0 + ExtraMiLatencyLeft: -1 + ForceDisableShadowInit: false + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 32 + GlobalReadVectorWidthB: 32 + GlobalReadVectorWidthMXSA: 4 + GlobalReadVectorWidthMXSB: 4 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + KRingShift: false + Kernel: true + KernelLanguage: Assembly + KernelNameMin: 
Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT128x128x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT4_4_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 256 + LSCB: 256 + LSCMXSA: 8 + LSCMXSB: 8 + LSPA: 32 + LSPB: 32 + LSPMXSA: 128 + LSPMXSB: 128 + LVCA: 8 + LVCB: 8 + LVCMXSA: 2 + LVCMXSB: 2 + LVPA: 1 + LVPB: 1 + LVPMXSA: 32 + LVPMXSB: 32 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsBlockSizePerPadMXSA: 0 + LdsBlockSizePerPadMXSB: 0 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 100352 + LdsInitCVgprs: false + LdsNumBytes: 100352 + LdsNumElementsAlignedA: 16384 + LdsNumElementsAlignedB: 16384 + LdsNumElementsAlignedMXSA: 1024 + LdsNumElementsAlignedMXSB: 1024 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 18432 + LdsOffsetB_Blk: 83968 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMXSA: 16384 + LdsOffsetMXSA_Blk: 81920 + LdsOffsetMXSB: 17408 + LdsOffsetMXSB_Blk: 82944 + LdsOffsetMetadata: 18432 + LdsOffsetMetadata_Blk: 83968 + LdsPadA: 0 + LdsPadB: 0 + LdsPadMXSA: 0 + LdsPadMXSB: 0 + LdsPadMetadata: 0 + LocalReadVectorWidth: -1 + LocalReadVectorWidthA: 32 + LocalReadVectorWidthB: 32 + LocalReadVectorWidthMXSA: 1 + LocalReadVectorWidthMXSB: 1 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LocalWriteUseSgprMXSA: true + LocalWriteUseSgprMXSB: true + LoopIters: 2 + LoopUnroll: 256 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + 
MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMXSA: 1 + MIInputPerThreadMXSB: 1 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MIWaveTileMXSA: 4 + MIWaveTileMXSB: 4 + MIWaveTileMetadata: 0 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileMXSA: 128 + MacroTileMXSB: 128 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + MinGRIncPerMfma: 3 + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonDTLTailLoopMXSA: true + NonDTLTailLoopMXSB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 4 + NonTemporalE: 0 + NonTemporalMXSA: 0 + NonTemporalMXSB: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 64 + NumLdsBlk: 2 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsCoalescedMXSA: 1 + NumLoadsCoalescedMXSB: 1 + NumLoadsMXSA: 1 + NumLoadsMXSB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumLoadsPerpendicularMXSA: 1 + NumLoadsPerpendicularMXSB: 1 + NumThreads: 256 + NumTotalPackedLoadsA: -1 + NumTotalPackedLoadsB: -1 + NumTotalPackedLoadsMXSA: -1 + NumTotalPackedLoadsMXSB: -1 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGROverBarrier: 0 + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 1 + SolutionNameMin: 
Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT128x128x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT4_4_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM16_WGMXCC2_WGMXCCGn1 + SourceSwap: false + SpaceFillingAlgo: [] + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 4 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SubGroupMXSA: 8 + SubGroupMXSB: 32 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: 0 + TailloopInNll: false + ThreadTile: [1, 1] + ThreadTile0: 16 + ThreadTile1: 4 + ThreadTileA: 16 + ThreadTileB: 4 + ThreadTileMXSA: 16 + ThreadTileMXSB: 4 + TransposeLDS: 1 + TransposeLDSMetadata: -1 + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMXSA: true + UnrollMajorLDSMXSB: true + Use64bShadowLimit: false + Use64bShadowLimitMX: false + UseCustomMainLoopSchedule: 0 + UseDirect32XEmulation: false + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: false + UseGeneralizedNLCOneA: false + UseGeneralizedNLCOneB: false + UseGeneralizedNLCOneMXSA: false + UseGeneralizedNLCOneMXSB: false + UseGeneralizedNLCOneMetadata: false + UseInstOffsetForGRO: 0 + UseMFMAF32XEmulation: false + UsePLRPack: 0 + UseSgprForGRO: -1 + UseSubtileImpl: true + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 1 + 
VectorWidthMXSA: 1 + VectorWidthMXSB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMXSA: 0 + WaveSeparateGlobalReadMXSB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 16 + WorkGroupMappingXCC: 2 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 256 + _DepthUA: 256 + _DepthUB: 256 + _DepthUMXSA: 8 + _DepthUMXSB: 8 + _DepthUMetadata: 256 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + enableLDSTrMXSA: false + enableLDSTrMXSB: false + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + tailLoopOptMXSA: false + tailLoopOptMXSB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AdaptiveGemm: 0 + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1DivByMT1LowbitGT1: 0 + AssertFree1ElementMultiple: 1 + AssertKRingShiftTailWrapOnly: 0 + AssertSummationElementMultiple: 256 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BAddrInterleave: false + BaseName: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT19tVFE7Mochg2Q71HzQS4tIGctt0Td3p2SmmxHjjWSYTc= + BufferLoad: true + BufferStore: 1 + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 256 + DirectToLds: 1 + DirectToLdsA: true + DirectToLdsB: true + DirectToLdsMXSA: true + DirectToLdsMXSB: true + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprMXSA: false + DirectToVgprMXSB: false + DirectToVgprSparseMetadata: false + DtlPlusLdsBuf: 0 
+ EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: false + ExpertSchedulingMode: 0 + ExtraLatencyForLR: 0 + ExtraMiLatencyLeft: -1 + ForceDisableShadowInit: false + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 32 + GlobalReadVectorWidthB: 32 + GlobalReadVectorWidthMXSA: 6 + GlobalReadVectorWidthMXSB: 6 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + KRingShift: false + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT192x192x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT6_6_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 256 + LSCB: 256 + LSCMXSA: 8 + LSCMXSB: 8 + LSPA: 32 + LSPB: 32 + LSPMXSA: 192 + LSPMXSB: 192 + LVCA: 8 + LVCB: 8 + LVCMXSA: 2 + LVCMXSB: 2 + LVPA: 1 + LVPB: 1 + LVPMXSA: 32 + LVPMXSB: 32 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsBlockSizePerPadMXSA: 0 + LdsBlockSizePerPadMXSB: 0 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 117760 + LdsInitCVgprs: false + LdsNumBytes: 117760 + LdsNumElementsAlignedA: 24576 + 
LdsNumElementsAlignedB: 24576 + LdsNumElementsAlignedMXSA: 1536 + LdsNumElementsAlignedMXSB: 1536 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 27648 + LdsOffsetB_Blk: 93184 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMXSA: 24576 + LdsOffsetMXSA_Blk: 90112 + LdsOffsetMXSB: 26112 + LdsOffsetMXSB_Blk: 91648 + LdsOffsetMetadata: 27648 + LdsOffsetMetadata_Blk: 93184 + LdsPadA: 0 + LdsPadB: 0 + LdsPadMXSA: 0 + LdsPadMXSB: 0 + LdsPadMetadata: 0 + LocalReadVectorWidth: -1 + LocalReadVectorWidthA: 32 + LocalReadVectorWidthB: 32 + LocalReadVectorWidthMXSA: 1 + LocalReadVectorWidthMXSB: 1 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LocalWriteUseSgprMXSA: true + LocalWriteUseSgprMXSB: true + LoopIters: 2 + LoopUnroll: 256 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMXSA: 1 + MIInputPerThreadMXSB: 1 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [6, 6] + MIWaveTileA: 6 + MIWaveTileB: 6 + MIWaveTileMXSA: 6 + MIWaveTileMXSB: 6 + MIWaveTileMetadata: 0 + MacroTile0: 192 + MacroTile1: 192 + MacroTileA: 192 + MacroTileB: 192 + MacroTileMXSA: 192 + MacroTileMXSB: 192 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + MinGRIncPerMfma: 3 + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonDTLTailLoopMXSA: true + NonDTLTailLoopMXSB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 4 + NonTemporalE: 0 + NonTemporalMXSA: 0 + 
NonTemporalMXSB: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 144 + NumGlobalWriteVectorsPerThread: 144 + NumLdsBlk: 2 + NumLoadsA: 6 + NumLoadsB: 6 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsCoalescedMXSA: 1 + NumLoadsCoalescedMXSB: 1 + NumLoadsMXSA: 1 + NumLoadsMXSB: 1 + NumLoadsPerpendicularA: 6 + NumLoadsPerpendicularB: 6 + NumLoadsPerpendicularMXSA: 1 + NumLoadsPerpendicularMXSB: 1 + NumThreads: 256 + NumTotalPackedLoadsA: -1 + NumTotalPackedLoadsB: -1 + NumTotalPackedLoadsMXSA: -1 + NumTotalPackedLoadsMXSB: -1 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGROverBarrier: 0 + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 2 + SolutionNameMin: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT192x192x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT6_6_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM16_WGMXCC2_WGMXCCGn1 + SourceSwap: false + SpaceFillingAlgo: [] + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 4 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SubGroupMXSA: 8 + SubGroupMXSB: 32 
+ SuppressNoLoadLoop: false + SwapGlobalReadOrder: 0 + TailloopInNll: false + ThreadTile: [1, 1] + ThreadTile0: 24 + ThreadTile1: 6 + ThreadTileA: 24 + ThreadTileB: 6 + ThreadTileMXSA: 24 + ThreadTileMXSB: 6 + TransposeLDS: 1 + TransposeLDSMetadata: -1 + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMXSA: true + UnrollMajorLDSMXSB: true + Use64bShadowLimit: false + Use64bShadowLimitMX: false + UseCustomMainLoopSchedule: 0 + UseDirect32XEmulation: false + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: false + UseGeneralizedNLCOneA: false + UseGeneralizedNLCOneB: false + UseGeneralizedNLCOneMXSA: false + UseGeneralizedNLCOneMXSB: false + UseGeneralizedNLCOneMetadata: false + UseInstOffsetForGRO: 0 + UseMFMAF32XEmulation: false + UsePLRPack: 0 + UseSgprForGRO: -1 + UseSubtileImpl: true + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 1 + VectorWidthMXSA: 1 + VectorWidthMXSB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMXSA: 0 + WaveSeparateGlobalReadMXSB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 16 + WorkGroupMappingXCC: 2 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 256 + _DepthUA: 256 + _DepthUB: 256 + _DepthUMXSA: 8 + _DepthUMXSB: 8 + _DepthUMetadata: 256 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + enableLDSTrMXSA: false + enableLDSTrMXSB: false + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + tailLoopOptMXSA: false + tailLoopOptMXSB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: 
false + ActivationFused: true + AdaptiveGemm: 0 + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1DivByMT1LowbitGT1: 0 + AssertFree1ElementMultiple: 1 + AssertKRingShiftTailWrapOnly: 0 + AssertSummationElementMultiple: 256 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BAddrInterleave: false + BaseName: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT25YgPR4XPaMfFjkFVdEJNxrCtJ2m9IQRE3ggDN5hyNKfg= + BufferLoad: true + BufferStore: 1 + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 256 + DirectToLds: 1 + DirectToLdsA: true + DirectToLdsB: true + DirectToLdsMXSA: true + DirectToLdsMXSB: true + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprMXSA: false + DirectToVgprMXSB: false + DirectToVgprSparseMetadata: false + DtlPlusLdsBuf: 0 + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: false + ExpertSchedulingMode: 0 + ExtraLatencyForLR: 0 + ExtraMiLatencyLeft: -1 + ForceDisableShadowInit: false + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 32 + GlobalReadVectorWidthB: 32 + GlobalReadVectorWidthMXSA: 4 + GlobalReadVectorWidthMXSB: 4 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + KRingShift: false + Kernel: true + KernelLanguage: Assembly + KernelNameMin: 
Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT256x256x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT8_8_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 256 + LSCB: 256 + LSCMXSA: 8 + LSCMXSB: 8 + LSPA: 32 + LSPB: 32 + LSPMXSA: 128 + LSPMXSB: 128 + LVCA: 8 + LVCB: 8 + LVCMXSA: 2 + LVCMXSB: 2 + LVPA: 1 + LVPB: 1 + LVPMXSA: 32 + LVPMXSB: 32 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsBlockSizePerPadMXSA: 0 + LdsBlockSizePerPadMXSB: 0 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 139264 + LdsInitCVgprs: false + LdsNumBytes: 139264 + LdsNumElementsAlignedA: 32768 + LdsNumElementsAlignedB: 32768 + LdsNumElementsAlignedMXSA: 2048 + LdsNumElementsAlignedMXSB: 2048 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 69632 + LdsOffsetB: 36864 + LdsOffsetB_Blk: 106496 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMXSA: 32768 + LdsOffsetMXSA_Blk: 102400 + LdsOffsetMXSB: 34816 + LdsOffsetMXSB_Blk: 104448 + LdsOffsetMetadata: 36864 + LdsOffsetMetadata_Blk: 106496 + LdsPadA: 0 + LdsPadB: 0 + LdsPadMXSA: 0 + LdsPadMXSB: 0 + LdsPadMetadata: 0 + LocalReadVectorWidth: -1 + LocalReadVectorWidthA: 32 + LocalReadVectorWidthB: 32 + LocalReadVectorWidthMXSA: 1 + LocalReadVectorWidthMXSB: 1 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LocalWriteUseSgprMXSA: true + LocalWriteUseSgprMXSB: true + LoopIters: 2 + LoopUnroll: 256 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + 
MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMXSA: 1 + MIInputPerThreadMXSB: 1 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [8, 8] + MIWaveTileA: 8 + MIWaveTileB: 8 + MIWaveTileMXSA: 8 + MIWaveTileMXSB: 8 + MIWaveTileMetadata: 0 + MacroTile0: 256 + MacroTile1: 256 + MacroTileA: 256 + MacroTileB: 256 + MacroTileMXSA: 256 + MacroTileMXSB: 256 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + MinGRIncPerMfma: 3 + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonDTLTailLoopMXSA: true + NonDTLTailLoopMXSB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 4 + NonTemporalE: 0 + NonTemporalMXSA: 0 + NonTemporalMXSB: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 256 + NumGlobalWriteVectorsPerThread: 256 + NumLdsBlk: 2 + NumLoadsA: 8 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsCoalescedMXSA: 1 + NumLoadsCoalescedMXSB: 1 + NumLoadsMXSA: 2 + NumLoadsMXSB: 2 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 8 + NumLoadsPerpendicularMXSA: 2 + NumLoadsPerpendicularMXSB: 2 + NumThreads: 256 + NumTotalPackedLoadsA: -1 + NumTotalPackedLoadsB: -1 + NumTotalPackedLoadsMXSA: -1 + NumTotalPackedLoadsMXSB: -1 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGROverBarrier: 0 + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 3 + 
SolutionNameMin: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT256x256x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT8_8_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM16_WGMXCC2_WGMXCCGn1 + SourceSwap: false + SpaceFillingAlgo: [] + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: true + StoreSyncOpt: 0 + StoreVectorWidth: 4 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SubGroupMXSA: 8 + SubGroupMXSB: 32 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: 0 + TailloopInNll: false + ThreadTile: [1, 1] + ThreadTile0: 32 + ThreadTile1: 8 + ThreadTileA: 32 + ThreadTileB: 8 + ThreadTileMXSA: 32 + ThreadTileMXSB: 8 + TransposeLDS: 1 + TransposeLDSMetadata: -1 + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMXSA: true + UnrollMajorLDSMXSB: true + Use64bShadowLimit: false + Use64bShadowLimitMX: false + UseCustomMainLoopSchedule: 0 + UseDirect32XEmulation: false + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: false + UseGeneralizedNLCOneA: false + UseGeneralizedNLCOneB: false + UseGeneralizedNLCOneMXSA: false + UseGeneralizedNLCOneMXSB: false + UseGeneralizedNLCOneMetadata: false + UseInstOffsetForGRO: 0 + UseMFMAF32XEmulation: false + UsePLRPack: 0 + UseSgprForGRO: -1 + UseSubtileImpl: true + Valid: true + VectorStore: -1 + VectorWidthA: 1 + 
VectorWidthB: 1 + VectorWidthMXSA: 1 + VectorWidthMXSB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMXSA: 0 + WaveSeparateGlobalReadMXSB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 16 + WorkGroupMappingXCC: 2 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 256 + _DepthUA: 256 + _DepthUB: 256 + _DepthUMXSA: 8 + _DepthUMXSB: 8 + _DepthUMetadata: 256 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + enableLDSTrMXSA: false + enableLDSTrMXSB: false + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + tailLoopOptMXSA: false + tailLoopOptMXSB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AdaptiveGemm: 0 + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1DivByMT1LowbitGT1: 0 + AssertFree1ElementMultiple: 1 + AssertKRingShiftTailWrapOnly: 0 + AssertSummationElementMultiple: 256 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BAddrInterleave: false + BaseName: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT64yR3-UwlktRfU59ACgTfmpcg_Yk6f2Lb1knA4QU-BsnY= + BufferLoad: true + BufferStore: 1 + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 256 + DirectToLds: 1 + DirectToLdsA: true + DirectToLdsB: true + DirectToLdsMXSA: true + DirectToLdsMXSB: true + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprMXSA: false + DirectToVgprMXSB: false + DirectToVgprSparseMetadata: false + 
DtlPlusLdsBuf: 0 + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: false + ExpertSchedulingMode: 0 + ExtraLatencyForLR: 0 + ExtraMiLatencyLeft: -1 + ForceDisableShadowInit: false + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 32 + GlobalReadVectorWidthB: 32 + GlobalReadVectorWidthMXSA: 2 + GlobalReadVectorWidthMXSB: 4 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + KRingShift: false + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT64x128x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT2_4_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 256 + LSCB: 256 + LSCMXSA: 8 + LSCMXSB: 8 + LSPA: 32 + LSPB: 32 + LSPMXSA: 64 + LSPMXSB: 128 + LVCA: 8 + LVCB: 8 + LVCMXSA: 4 + LVCMXSB: 2 + LVPA: 1 + LVPB: 1 + LVPMXSA: 32 + LVPMXSB: 32 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsBlockSizePerPadMXSA: 0 + LdsBlockSizePerPadMXSB: 0 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 58880 + LdsInitCVgprs: false + LdsNumBytes: 58880 + 
LdsNumElementsAlignedA: 8192 + LdsNumElementsAlignedB: 16384 + LdsNumElementsAlignedMXSA: 512 + LdsNumElementsAlignedMXSB: 1024 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 32768 + LdsOffsetB: 9728 + LdsOffsetB_Blk: 42496 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMXSA: 8192 + LdsOffsetMXSA_Blk: 40960 + LdsOffsetMXSB: 8704 + LdsOffsetMXSB_Blk: 41472 + LdsOffsetMetadata: 9728 + LdsOffsetMetadata_Blk: 42496 + LdsPadA: 0 + LdsPadB: 0 + LdsPadMXSA: 0 + LdsPadMXSB: 0 + LdsPadMetadata: 0 + LocalReadVectorWidth: -1 + LocalReadVectorWidthA: 32 + LocalReadVectorWidthB: 32 + LocalReadVectorWidthMXSA: 1 + LocalReadVectorWidthMXSB: 1 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LocalWriteUseSgprMXSA: true + LocalWriteUseSgprMXSB: true + LoopIters: 2 + LoopUnroll: 256 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMXSA: 1 + MIInputPerThreadMXSB: 1 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [2, 4] + MIWaveTileA: 2 + MIWaveTileB: 4 + MIWaveTileMXSA: 2 + MIWaveTileMXSB: 4 + MIWaveTileMetadata: 0 + MacroTile0: 64 + MacroTile1: 128 + MacroTileA: 64 + MacroTileB: 128 + MacroTileMXSA: 64 + MacroTileMXSB: 128 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + MinGRIncPerMfma: 3 + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonDTLTailLoopMXSA: true + NonDTLTailLoopMXSB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 4 + NonTemporalE: 0 + 
NonTemporalMXSA: 0 + NonTemporalMXSB: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 32 + NumLdsBlk: 2 + NumLoadsA: 2 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsCoalescedMXSA: 1 + NumLoadsCoalescedMXSB: 1 + NumLoadsMXSA: 1 + NumLoadsMXSB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 4 + NumLoadsPerpendicularMXSA: 1 + NumLoadsPerpendicularMXSB: 1 + NumThreads: 256 + NumTotalPackedLoadsA: -1 + NumTotalPackedLoadsB: -1 + NumTotalPackedLoadsMXSA: -1 + NumTotalPackedLoadsMXSB: -1 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGROverBarrier: 0 + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 4 + SolutionNameMin: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT64x128x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT2_4_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM16_WGMXCC2_WGMXCCGn1 + SourceSwap: false + SpaceFillingAlgo: [] + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 4 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SubGroupMXSA: 8 + 
SubGroupMXSB: 32 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: 0 + TailloopInNll: false + ThreadTile: [1, 1] + ThreadTile0: 8 + ThreadTile1: 4 + ThreadTileA: 8 + ThreadTileB: 4 + ThreadTileMXSA: 8 + ThreadTileMXSB: 4 + TransposeLDS: 1 + TransposeLDSMetadata: -1 + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMXSA: true + UnrollMajorLDSMXSB: true + Use64bShadowLimit: false + Use64bShadowLimitMX: false + UseCustomMainLoopSchedule: 0 + UseDirect32XEmulation: false + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: false + UseGeneralizedNLCOneA: false + UseGeneralizedNLCOneB: false + UseGeneralizedNLCOneMXSA: false + UseGeneralizedNLCOneMXSB: false + UseGeneralizedNLCOneMetadata: false + UseInstOffsetForGRO: 0 + UseMFMAF32XEmulation: false + UsePLRPack: 0 + UseSgprForGRO: -1 + UseSubtileImpl: true + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 1 + VectorWidthMXSA: 1 + VectorWidthMXSB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMXSA: 0 + WaveSeparateGlobalReadMXSB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 16 + WorkGroupMappingXCC: 2 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 256 + _DepthUA: 256 + _DepthUB: 256 + _DepthUMXSA: 8 + _DepthUMXSB: 8 + _DepthUMetadata: 256 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + enableLDSTrMXSA: false + enableLDSTrMXSB: false + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + tailLoopOptMXSA: false + tailLoopOptMXSB: false + - 1LDSBuffer: 0 + ActivationAlt: false + 
ActivationFuncCall: false + ActivationFused: true + AdaptiveGemm: 0 + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1DivByMT1LowbitGT1: 0 + AssertFree1ElementMultiple: 1 + AssertKRingShiftTailWrapOnly: 0 + AssertSummationElementMultiple: 256 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BAddrInterleave: false + BaseName: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT12ZclQEHdRRO1sBJL3TwidM33yIt-WYnDtDgY9P9JYRX4= + BufferLoad: true + BufferStore: 1 + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 256 + DirectToLds: 1 + DirectToLdsA: true + DirectToLdsB: true + DirectToLdsMXSA: true + DirectToLdsMXSB: true + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprMXSA: false + DirectToVgprMXSB: false + DirectToVgprSparseMetadata: false + DtlPlusLdsBuf: 0 + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: false + ExpertSchedulingMode: 0 + ExtraLatencyForLR: 0 + ExtraMiLatencyLeft: -1 + ForceDisableShadowInit: false + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 32 + GlobalReadVectorWidthB: 32 + GlobalReadVectorWidthMXSA: 4 + GlobalReadVectorWidthMXSB: 2 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + KRingShift: false + Kernel: true + KernelLanguage: Assembly + KernelNameMin: 
Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT128x64x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT4_2_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 256 + LSCB: 256 + LSCMXSA: 8 + LSCMXSB: 8 + LSPA: 32 + LSPB: 32 + LSPMXSA: 128 + LSPMXSB: 64 + LVCA: 8 + LVCB: 8 + LVCMXSA: 2 + LVCMXSB: 4 + LVPA: 1 + LVPB: 1 + LVPMXSA: 32 + LVPMXSB: 32 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsBlockSizePerPadMXSA: 0 + LdsBlockSizePerPadMXSB: 0 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 58880 + LdsInitCVgprs: false + LdsNumBytes: 58880 + LdsNumElementsAlignedA: 16384 + LdsNumElementsAlignedB: 8192 + LdsNumElementsAlignedMXSA: 1024 + LdsNumElementsAlignedMXSB: 512 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 32768 + LdsOffsetB: 17920 + LdsOffsetB_Blk: 50688 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMXSA: 16384 + LdsOffsetMXSA_Blk: 49152 + LdsOffsetMXSB: 17408 + LdsOffsetMXSB_Blk: 50176 + LdsOffsetMetadata: 17920 + LdsOffsetMetadata_Blk: 50688 + LdsPadA: 0 + LdsPadB: 0 + LdsPadMXSA: 0 + LdsPadMXSB: 0 + LdsPadMetadata: 0 + LocalReadVectorWidth: -1 + LocalReadVectorWidthA: 32 + LocalReadVectorWidthB: 32 + LocalReadVectorWidthMXSA: 1 + LocalReadVectorWidthMXSB: 1 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LocalWriteUseSgprMXSA: true + LocalWriteUseSgprMXSB: true + LoopIters: 2 + LoopUnroll: 256 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + 
MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMXSA: 1 + MIInputPerThreadMXSB: 1 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 2] + MIWaveTileA: 4 + MIWaveTileB: 2 + MIWaveTileMXSA: 4 + MIWaveTileMXSB: 2 + MIWaveTileMetadata: 0 + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileMXSA: 128 + MacroTileMXSB: 64 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + MinGRIncPerMfma: 3 + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonDTLTailLoopMXSA: true + NonDTLTailLoopMXSB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 4 + NonTemporalE: 0 + NonTemporalMXSA: 0 + NonTemporalMXSB: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 32 + NumLdsBlk: 2 + NumLoadsA: 4 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsCoalescedMXSA: 1 + NumLoadsCoalescedMXSB: 1 + NumLoadsMXSA: 1 + NumLoadsMXSB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularMXSA: 1 + NumLoadsPerpendicularMXSB: 1 + NumThreads: 256 + NumTotalPackedLoadsA: -1 + NumTotalPackedLoadsB: -1 + NumTotalPackedLoadsMXSA: -1 + NumTotalPackedLoadsMXSB: -1 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGROverBarrier: 0 + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 5 + SolutionNameMin: 
Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT128x64x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT4_2_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM16_WGMXCC2_WGMXCCGn1 + SourceSwap: false + SpaceFillingAlgo: [] + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 4 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SubGroupMXSA: 8 + SubGroupMXSB: 32 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: 0 + TailloopInNll: false + ThreadTile: [1, 1] + ThreadTile0: 16 + ThreadTile1: 2 + ThreadTileA: 16 + ThreadTileB: 2 + ThreadTileMXSA: 16 + ThreadTileMXSB: 2 + TransposeLDS: 1 + TransposeLDSMetadata: -1 + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMXSA: true + UnrollMajorLDSMXSB: true + Use64bShadowLimit: false + Use64bShadowLimitMX: false + UseCustomMainLoopSchedule: 0 + UseDirect32XEmulation: false + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: false + UseGeneralizedNLCOneA: false + UseGeneralizedNLCOneB: false + UseGeneralizedNLCOneMXSA: false + UseGeneralizedNLCOneMXSB: false + UseGeneralizedNLCOneMetadata: false + UseInstOffsetForGRO: 0 + UseMFMAF32XEmulation: false + UsePLRPack: 0 + UseSgprForGRO: -1 + UseSubtileImpl: true + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 1 + 
VectorWidthMXSA: 1 + VectorWidthMXSB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMXSA: 0 + WaveSeparateGlobalReadMXSB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 16 + WorkGroupMappingXCC: 2 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 256 + _DepthUA: 256 + _DepthUB: 256 + _DepthUMXSA: 8 + _DepthUMXSB: 8 + _DepthUMetadata: 256 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + enableLDSTrMXSA: false + enableLDSTrMXSB: false + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + tailLoopOptMXSA: false + tailLoopOptMXSB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AdaptiveGemm: 0 + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1DivByMT1LowbitGT1: 0 + AssertFree1ElementMultiple: 1 + AssertKRingShiftTailWrapOnly: 0 + AssertSummationElementMultiple: 256 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BAddrInterleave: false + BaseName: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT64hJRTC6QzyytWkHaKB_tyuSMxvHfrHLHmDLGhNVVcILE= + BufferLoad: true + BufferStore: 1 + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 256 + DirectToLds: 1 + DirectToLdsA: true + DirectToLdsB: true + DirectToLdsMXSA: true + DirectToLdsMXSB: true + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprMXSA: false + DirectToVgprMXSB: false + DirectToVgprSparseMetadata: false + DtlPlusLdsBuf: 0 
+ EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: false + ExpertSchedulingMode: 0 + ExtraLatencyForLR: 0 + ExtraMiLatencyLeft: -1 + ForceDisableShadowInit: false + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 32 + GlobalReadVectorWidthB: 32 + GlobalReadVectorWidthMXSA: 2 + GlobalReadVectorWidthMXSB: 6 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + KRingShift: false + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT64x192x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT2_6_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 256 + LSCB: 256 + LSCMXSA: 8 + LSCMXSB: 8 + LSPA: 32 + LSPB: 32 + LSPMXSA: 64 + LSPMXSB: 192 + LVCA: 8 + LVCB: 8 + LVCMXSA: 4 + LVCMXSB: 2 + LVPA: 1 + LVPB: 1 + LVPMXSA: 32 + LVPMXSB: 32 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsBlockSizePerPadMXSA: 0 + LdsBlockSizePerPadMXSB: 0 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 100352 + LdsInitCVgprs: false + LdsNumBytes: 100352 + LdsNumElementsAlignedA: 8192 + 
LdsNumElementsAlignedB: 24576 + LdsNumElementsAlignedMXSA: 512 + LdsNumElementsAlignedMXSB: 1536 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 10240 + LdsOffsetB_Blk: 75776 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMXSA: 8192 + LdsOffsetMXSA_Blk: 73728 + LdsOffsetMXSB: 8704 + LdsOffsetMXSB_Blk: 74240 + LdsOffsetMetadata: 10240 + LdsOffsetMetadata_Blk: 75776 + LdsPadA: 0 + LdsPadB: 0 + LdsPadMXSA: 0 + LdsPadMXSB: 0 + LdsPadMetadata: 0 + LocalReadVectorWidth: -1 + LocalReadVectorWidthA: 32 + LocalReadVectorWidthB: 32 + LocalReadVectorWidthMXSA: 1 + LocalReadVectorWidthMXSB: 1 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LocalWriteUseSgprMXSA: true + LocalWriteUseSgprMXSB: true + LoopIters: 2 + LoopUnroll: 256 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMXSA: 1 + MIInputPerThreadMXSB: 1 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [2, 6] + MIWaveTileA: 2 + MIWaveTileB: 6 + MIWaveTileMXSA: 2 + MIWaveTileMXSB: 6 + MIWaveTileMetadata: 0 + MacroTile0: 64 + MacroTile1: 192 + MacroTileA: 64 + MacroTileB: 192 + MacroTileMXSA: 64 + MacroTileMXSB: 192 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + MinGRIncPerMfma: 3 + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonDTLTailLoopMXSA: true + NonDTLTailLoopMXSB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 4 + NonTemporalE: 0 + NonTemporalMXSA: 0 + 
NonTemporalMXSB: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 48 + NumGlobalWriteVectorsPerThread: 48 + NumLdsBlk: 2 + NumLoadsA: 2 + NumLoadsB: 6 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsCoalescedMXSA: 1 + NumLoadsCoalescedMXSB: 1 + NumLoadsMXSA: 1 + NumLoadsMXSB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 6 + NumLoadsPerpendicularMXSA: 1 + NumLoadsPerpendicularMXSB: 1 + NumThreads: 256 + NumTotalPackedLoadsA: -1 + NumTotalPackedLoadsB: -1 + NumTotalPackedLoadsMXSA: -1 + NumTotalPackedLoadsMXSB: -1 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGROverBarrier: 0 + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 6 + SolutionNameMin: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT64x192x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT2_6_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM16_WGMXCC2_WGMXCCGn1 + SourceSwap: false + SpaceFillingAlgo: [] + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 4 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SubGroupMXSA: 8 + SubGroupMXSB: 32 + 
SuppressNoLoadLoop: false + SwapGlobalReadOrder: 0 + TailloopInNll: false + ThreadTile: [1, 1] + ThreadTile0: 8 + ThreadTile1: 6 + ThreadTileA: 8 + ThreadTileB: 6 + ThreadTileMXSA: 8 + ThreadTileMXSB: 6 + TransposeLDS: 1 + TransposeLDSMetadata: -1 + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMXSA: true + UnrollMajorLDSMXSB: true + Use64bShadowLimit: false + Use64bShadowLimitMX: false + UseCustomMainLoopSchedule: 0 + UseDirect32XEmulation: false + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: false + UseGeneralizedNLCOneA: false + UseGeneralizedNLCOneB: false + UseGeneralizedNLCOneMXSA: false + UseGeneralizedNLCOneMXSB: false + UseGeneralizedNLCOneMetadata: false + UseInstOffsetForGRO: 0 + UseMFMAF32XEmulation: false + UsePLRPack: 0 + UseSgprForGRO: -1 + UseSubtileImpl: true + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 1 + VectorWidthMXSA: 1 + VectorWidthMXSB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMXSA: 0 + WaveSeparateGlobalReadMXSB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 16 + WorkGroupMappingXCC: 2 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 256 + _DepthUA: 256 + _DepthUB: 256 + _DepthUMXSA: 8 + _DepthUMXSB: 8 + _DepthUMetadata: 256 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + enableLDSTrMXSA: false + enableLDSTrMXSB: false + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + tailLoopOptMXSA: false + tailLoopOptMXSB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: false + 
ActivationFused: true + AdaptiveGemm: 0 + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1DivByMT1LowbitGT1: 0 + AssertFree1ElementMultiple: 1 + AssertKRingShiftTailWrapOnly: 0 + AssertSummationElementMultiple: 256 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BAddrInterleave: false + BaseName: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT19UCUD2vak2YLHvH_Ztg5zXPDJUlX3NzBJWRuGhvKVzu8= + BufferLoad: true + BufferStore: 1 + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 256 + DirectToLds: 1 + DirectToLdsA: true + DirectToLdsB: true + DirectToLdsMXSA: true + DirectToLdsMXSB: true + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprMXSA: false + DirectToVgprMXSB: false + DirectToVgprSparseMetadata: false + DtlPlusLdsBuf: 0 + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: false + ExpertSchedulingMode: 0 + ExtraLatencyForLR: 0 + ExtraMiLatencyLeft: -1 + ForceDisableShadowInit: false + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 32 + GlobalReadVectorWidthB: 32 + GlobalReadVectorWidthMXSA: 6 + GlobalReadVectorWidthMXSB: 2 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + KRingShift: false + Kernel: true + KernelLanguage: Assembly + KernelNameMin: 
Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT192x64x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT6_2_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 256 + LSCB: 256 + LSCMXSA: 8 + LSCMXSB: 8 + LSPA: 32 + LSPB: 32 + LSPMXSA: 192 + LSPMXSB: 64 + LVCA: 8 + LVCB: 8 + LVCMXSA: 2 + LVCMXSB: 4 + LVPA: 1 + LVPB: 1 + LVPMXSA: 32 + LVPMXSB: 32 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsBlockSizePerPadMXSA: 0 + LdsBlockSizePerPadMXSB: 0 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 100352 + LdsInitCVgprs: false + LdsNumBytes: 100352 + LdsNumElementsAlignedA: 24576 + LdsNumElementsAlignedB: 8192 + LdsNumElementsAlignedMXSA: 1536 + LdsNumElementsAlignedMXSB: 512 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 26624 + LdsOffsetB_Blk: 92160 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMXSA: 24576 + LdsOffsetMXSA_Blk: 90112 + LdsOffsetMXSB: 26112 + LdsOffsetMXSB_Blk: 91648 + LdsOffsetMetadata: 26624 + LdsOffsetMetadata_Blk: 92160 + LdsPadA: 0 + LdsPadB: 0 + LdsPadMXSA: 0 + LdsPadMXSB: 0 + LdsPadMetadata: 0 + LocalReadVectorWidth: -1 + LocalReadVectorWidthA: 32 + LocalReadVectorWidthB: 32 + LocalReadVectorWidthMXSA: 1 + LocalReadVectorWidthMXSB: 1 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LocalWriteUseSgprMXSA: true + LocalWriteUseSgprMXSB: true + LoopIters: 2 + LoopUnroll: 256 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + 
MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMXSA: 1 + MIInputPerThreadMXSB: 1 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [6, 2] + MIWaveTileA: 6 + MIWaveTileB: 2 + MIWaveTileMXSA: 6 + MIWaveTileMXSB: 2 + MIWaveTileMetadata: 0 + MacroTile0: 192 + MacroTile1: 64 + MacroTileA: 192 + MacroTileB: 64 + MacroTileMXSA: 192 + MacroTileMXSB: 64 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + MinGRIncPerMfma: 3 + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonDTLTailLoopMXSA: true + NonDTLTailLoopMXSB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 4 + NonTemporalE: 0 + NonTemporalMXSA: 0 + NonTemporalMXSB: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 48 + NumGlobalWriteVectorsPerThread: 48 + NumLdsBlk: 2 + NumLoadsA: 6 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsCoalescedMXSA: 1 + NumLoadsCoalescedMXSB: 1 + NumLoadsMXSA: 1 + NumLoadsMXSB: 1 + NumLoadsPerpendicularA: 6 + NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularMXSA: 1 + NumLoadsPerpendicularMXSB: 1 + NumThreads: 256 + NumTotalPackedLoadsA: -1 + NumTotalPackedLoadsB: -1 + NumTotalPackedLoadsMXSA: -1 + NumTotalPackedLoadsMXSB: -1 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGROverBarrier: 0 + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 7 + SolutionNameMin: 
Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT192x64x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT6_2_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM16_WGMXCC2_WGMXCCGn1 + SourceSwap: false + SpaceFillingAlgo: [] + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 4 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SubGroupMXSA: 8 + SubGroupMXSB: 32 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: 0 + TailloopInNll: false + ThreadTile: [1, 1] + ThreadTile0: 24 + ThreadTile1: 2 + ThreadTileA: 24 + ThreadTileB: 2 + ThreadTileMXSA: 24 + ThreadTileMXSB: 2 + TransposeLDS: 1 + TransposeLDSMetadata: -1 + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMXSA: true + UnrollMajorLDSMXSB: true + Use64bShadowLimit: false + Use64bShadowLimitMX: false + UseCustomMainLoopSchedule: 0 + UseDirect32XEmulation: false + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: false + UseGeneralizedNLCOneA: false + UseGeneralizedNLCOneB: false + UseGeneralizedNLCOneMXSA: false + UseGeneralizedNLCOneMXSB: false + UseGeneralizedNLCOneMetadata: false + UseInstOffsetForGRO: 0 + UseMFMAF32XEmulation: false + UsePLRPack: 0 + UseSgprForGRO: -1 + UseSubtileImpl: true + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 1 + 
VectorWidthMXSA: 1 + VectorWidthMXSB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMXSA: 0 + WaveSeparateGlobalReadMXSB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 16 + WorkGroupMappingXCC: 2 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 256 + _DepthUA: 256 + _DepthUB: 256 + _DepthUMXSA: 8 + _DepthUMXSB: 8 + _DepthUMetadata: 256 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + enableLDSTrMXSA: false + enableLDSTrMXSB: false + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + tailLoopOptMXSA: false + tailLoopOptMXSB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AdaptiveGemm: 0 + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1DivByMT1LowbitGT1: 0 + AssertFree1ElementMultiple: 1 + AssertKRingShiftTailWrapOnly: 0 + AssertSummationElementMultiple: 256 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BAddrInterleave: false + BaseName: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT64sCK37FdA-O3yvoTtVtg3QFjEpjbAv0RNmVuI_Ry968A= + BufferLoad: true + BufferStore: 1 + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 256 + DirectToLds: 1 + DirectToLdsA: true + DirectToLdsB: true + DirectToLdsMXSA: true + DirectToLdsMXSB: true + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprMXSA: false + DirectToVgprMXSB: false + DirectToVgprSparseMetadata: false + DtlPlusLdsBuf: 0 
+ EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: false + ExpertSchedulingMode: 0 + ExtraLatencyForLR: 0 + ExtraMiLatencyLeft: -1 + ForceDisableShadowInit: false + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 32 + GlobalReadVectorWidthB: 32 + GlobalReadVectorWidthMXSA: 2 + GlobalReadVectorWidthMXSB: 4 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + KRingShift: false + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT64x256x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT2_8_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 256 + LSCB: 256 + LSCMXSA: 8 + LSCMXSB: 8 + LSPA: 32 + LSPB: 32 + LSPMXSA: 64 + LSPMXSB: 128 + LVCA: 8 + LVCB: 8 + LVCMXSA: 4 + LVCMXSB: 2 + LVPA: 1 + LVPB: 1 + LVPMXSA: 32 + LVPMXSB: 32 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsBlockSizePerPadMXSA: 0 + LdsBlockSizePerPadMXSB: 0 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 109056 + LdsInitCVgprs: false + LdsNumBytes: 109056 + LdsNumElementsAlignedA: 8192 + 
LdsNumElementsAlignedB: 32768 + LdsNumElementsAlignedMXSA: 512 + LdsNumElementsAlignedMXSB: 2048 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 10752 + LdsOffsetB_Blk: 76288 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMXSA: 8192 + LdsOffsetMXSA_Blk: 73728 + LdsOffsetMXSB: 8704 + LdsOffsetMXSB_Blk: 74240 + LdsOffsetMetadata: 10752 + LdsOffsetMetadata_Blk: 76288 + LdsPadA: 0 + LdsPadB: 0 + LdsPadMXSA: 0 + LdsPadMXSB: 0 + LdsPadMetadata: 0 + LocalReadVectorWidth: -1 + LocalReadVectorWidthA: 32 + LocalReadVectorWidthB: 32 + LocalReadVectorWidthMXSA: 1 + LocalReadVectorWidthMXSB: 1 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LocalWriteUseSgprMXSA: true + LocalWriteUseSgprMXSB: true + LoopIters: 2 + LoopUnroll: 256 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMXSA: 1 + MIInputPerThreadMXSB: 1 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [2, 8] + MIWaveTileA: 2 + MIWaveTileB: 8 + MIWaveTileMXSA: 2 + MIWaveTileMXSB: 8 + MIWaveTileMetadata: 0 + MacroTile0: 64 + MacroTile1: 256 + MacroTileA: 64 + MacroTileB: 256 + MacroTileMXSA: 64 + MacroTileMXSB: 256 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + MinGRIncPerMfma: 3 + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonDTLTailLoopMXSA: true + NonDTLTailLoopMXSB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 4 + NonTemporalE: 0 + NonTemporalMXSA: 0 + 
NonTemporalMXSB: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 64 + NumLdsBlk: 2 + NumLoadsA: 2 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsCoalescedMXSA: 1 + NumLoadsCoalescedMXSB: 1 + NumLoadsMXSA: 1 + NumLoadsMXSB: 2 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 8 + NumLoadsPerpendicularMXSA: 1 + NumLoadsPerpendicularMXSB: 2 + NumThreads: 256 + NumTotalPackedLoadsA: -1 + NumTotalPackedLoadsB: -1 + NumTotalPackedLoadsMXSA: -1 + NumTotalPackedLoadsMXSB: -1 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGROverBarrier: 0 + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 8 + SolutionNameMin: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT64x256x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT2_8_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM16_WGMXCC2_WGMXCCGn1 + SourceSwap: false + SpaceFillingAlgo: [] + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 4 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SubGroupMXSA: 8 + SubGroupMXSB: 32 + 
SuppressNoLoadLoop: false + SwapGlobalReadOrder: 0 + TailloopInNll: false + ThreadTile: [1, 1] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + ThreadTileMXSA: 8 + ThreadTileMXSB: 8 + TransposeLDS: 1 + TransposeLDSMetadata: -1 + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMXSA: true + UnrollMajorLDSMXSB: true + Use64bShadowLimit: false + Use64bShadowLimitMX: false + UseCustomMainLoopSchedule: 0 + UseDirect32XEmulation: false + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: false + UseGeneralizedNLCOneA: false + UseGeneralizedNLCOneB: false + UseGeneralizedNLCOneMXSA: false + UseGeneralizedNLCOneMXSB: false + UseGeneralizedNLCOneMetadata: false + UseInstOffsetForGRO: 0 + UseMFMAF32XEmulation: false + UsePLRPack: 0 + UseSgprForGRO: -1 + UseSubtileImpl: true + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 1 + VectorWidthMXSA: 1 + VectorWidthMXSB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMXSA: 0 + WaveSeparateGlobalReadMXSB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 16 + WorkGroupMappingXCC: 2 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 256 + _DepthUA: 256 + _DepthUB: 256 + _DepthUMXSA: 8 + _DepthUMXSB: 8 + _DepthUMetadata: 256 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + enableLDSTrMXSA: false + enableLDSTrMXSB: false + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + tailLoopOptMXSA: false + tailLoopOptMXSB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: false + 
ActivationFused: true + AdaptiveGemm: 0 + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1DivByMT1LowbitGT1: 0 + AssertFree1ElementMultiple: 1 + AssertKRingShiftTailWrapOnly: 0 + AssertSummationElementMultiple: 256 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BAddrInterleave: false + BaseName: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT25Tmym63Cp1OVwmuklAuBYLzHtSt0oUezVbK-tmV8hqYs= + BufferLoad: true + BufferStore: 1 + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 256 + DirectToLds: 1 + DirectToLdsA: true + DirectToLdsB: true + DirectToLdsMXSA: true + DirectToLdsMXSB: true + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprMXSA: false + DirectToVgprMXSB: false + DirectToVgprSparseMetadata: false + DtlPlusLdsBuf: 0 + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: false + ExpertSchedulingMode: 0 + ExtraLatencyForLR: 0 + ExtraMiLatencyLeft: -1 + ForceDisableShadowInit: false + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 32 + GlobalReadVectorWidthB: 32 + GlobalReadVectorWidthMXSA: 4 + GlobalReadVectorWidthMXSB: 2 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + KRingShift: false + Kernel: true + KernelLanguage: Assembly + KernelNameMin: 
Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT256x64x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT8_2_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 256 + LSCB: 256 + LSCMXSA: 8 + LSCMXSB: 8 + LSPA: 32 + LSPB: 32 + LSPMXSA: 128 + LSPMXSB: 64 + LVCA: 8 + LVCB: 8 + LVCMXSA: 2 + LVCMXSB: 4 + LVPA: 1 + LVPB: 1 + LVPMXSA: 32 + LVPMXSB: 32 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsBlockSizePerPadMXSA: 0 + LdsBlockSizePerPadMXSB: 0 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 109056 + LdsInitCVgprs: false + LdsNumBytes: 109056 + LdsNumElementsAlignedA: 32768 + LdsNumElementsAlignedB: 8192 + LdsNumElementsAlignedMXSA: 2048 + LdsNumElementsAlignedMXSB: 512 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 35328 + LdsOffsetB_Blk: 100864 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMXSA: 32768 + LdsOffsetMXSA_Blk: 98304 + LdsOffsetMXSB: 34816 + LdsOffsetMXSB_Blk: 100352 + LdsOffsetMetadata: 35328 + LdsOffsetMetadata_Blk: 100864 + LdsPadA: 0 + LdsPadB: 0 + LdsPadMXSA: 0 + LdsPadMXSB: 0 + LdsPadMetadata: 0 + LocalReadVectorWidth: -1 + LocalReadVectorWidthA: 32 + LocalReadVectorWidthB: 32 + LocalReadVectorWidthMXSA: 1 + LocalReadVectorWidthMXSB: 1 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LocalWriteUseSgprMXSA: true + LocalWriteUseSgprMXSB: true + LoopIters: 2 + LoopUnroll: 256 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + 
MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMXSA: 1 + MIInputPerThreadMXSB: 1 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [8, 2] + MIWaveTileA: 8 + MIWaveTileB: 2 + MIWaveTileMXSA: 8 + MIWaveTileMXSB: 2 + MIWaveTileMetadata: 0 + MacroTile0: 256 + MacroTile1: 64 + MacroTileA: 256 + MacroTileB: 64 + MacroTileMXSA: 256 + MacroTileMXSB: 64 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + MinGRIncPerMfma: 3 + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonDTLTailLoopMXSA: true + NonDTLTailLoopMXSB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 4 + NonTemporalE: 0 + NonTemporalMXSA: 0 + NonTemporalMXSB: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 64 + NumLdsBlk: 2 + NumLoadsA: 8 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsCoalescedMXSA: 1 + NumLoadsCoalescedMXSB: 1 + NumLoadsMXSA: 2 + NumLoadsMXSB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularMXSA: 2 + NumLoadsPerpendicularMXSB: 1 + NumThreads: 256 + NumTotalPackedLoadsA: -1 + NumTotalPackedLoadsB: -1 + NumTotalPackedLoadsMXSA: -1 + NumTotalPackedLoadsMXSB: -1 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGROverBarrier: 0 + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 9 + SolutionNameMin: 
Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT256x64x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT8_2_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM16_WGMXCC2_WGMXCCGn1 + SourceSwap: false + SpaceFillingAlgo: [] + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 4 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SubGroupMXSA: 8 + SubGroupMXSB: 32 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: 0 + TailloopInNll: false + ThreadTile: [1, 1] + ThreadTile0: 32 + ThreadTile1: 2 + ThreadTileA: 32 + ThreadTileB: 2 + ThreadTileMXSA: 32 + ThreadTileMXSB: 2 + TransposeLDS: 1 + TransposeLDSMetadata: -1 + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMXSA: true + UnrollMajorLDSMXSB: true + Use64bShadowLimit: false + Use64bShadowLimitMX: false + UseCustomMainLoopSchedule: 0 + UseDirect32XEmulation: false + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: false + UseGeneralizedNLCOneA: false + UseGeneralizedNLCOneB: false + UseGeneralizedNLCOneMXSA: false + UseGeneralizedNLCOneMXSB: false + UseGeneralizedNLCOneMetadata: false + UseInstOffsetForGRO: 0 + UseMFMAF32XEmulation: false + UsePLRPack: 0 + UseSgprForGRO: -1 + UseSubtileImpl: true + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 1 + 
VectorWidthMXSA: 1 + VectorWidthMXSB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMXSA: 0 + WaveSeparateGlobalReadMXSB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 16 + WorkGroupMappingXCC: 2 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 256 + _DepthUA: 256 + _DepthUB: 256 + _DepthUMXSA: 8 + _DepthUMXSB: 8 + _DepthUMetadata: 256 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + enableLDSTrMXSA: false + enableLDSTrMXSB: false + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + tailLoopOptMXSA: false + tailLoopOptMXSB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AdaptiveGemm: 0 + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1DivByMT1LowbitGT1: 0 + AssertFree1ElementMultiple: 1 + AssertKRingShiftTailWrapOnly: 0 + AssertSummationElementMultiple: 256 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BAddrInterleave: false + BaseName: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT12Qs31lTSyFaUbDqXQHGpUViUgkCHLEWWfNifVlN3xPcA= + BufferLoad: true + BufferStore: 1 + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 256 + DirectToLds: 1 + DirectToLdsA: true + DirectToLdsB: true + DirectToLdsMXSA: true + DirectToLdsMXSB: true + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprMXSA: false + DirectToVgprMXSB: false + DirectToVgprSparseMetadata: false + DtlPlusLdsBuf: 0 
+ EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: false + ExpertSchedulingMode: 0 + ExtraLatencyForLR: 0 + ExtraMiLatencyLeft: -1 + ForceDisableShadowInit: false + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 32 + GlobalReadVectorWidthB: 32 + GlobalReadVectorWidthMXSA: 4 + GlobalReadVectorWidthMXSB: 6 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + KRingShift: false + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT128x192x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT4_6_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 256 + LSCB: 256 + LSCMXSA: 8 + LSCMXSB: 8 + LSPA: 32 + LSPB: 32 + LSPMXSA: 128 + LSPMXSB: 192 + LVCA: 8 + LVCB: 8 + LVCMXSA: 2 + LVCMXSB: 2 + LVPA: 1 + LVPB: 1 + LVPMXSA: 32 + LVPMXSB: 32 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsBlockSizePerPadMXSA: 0 + LdsBlockSizePerPadMXSB: 0 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 109056 + LdsInitCVgprs: false + LdsNumBytes: 109056 + LdsNumElementsAlignedA: 16384 + 
LdsNumElementsAlignedB: 24576 + LdsNumElementsAlignedMXSA: 1024 + LdsNumElementsAlignedMXSB: 1536 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 18944 + LdsOffsetB_Blk: 84480 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMXSA: 16384 + LdsOffsetMXSA_Blk: 81920 + LdsOffsetMXSB: 17408 + LdsOffsetMXSB_Blk: 82944 + LdsOffsetMetadata: 18944 + LdsOffsetMetadata_Blk: 84480 + LdsPadA: 0 + LdsPadB: 0 + LdsPadMXSA: 0 + LdsPadMXSB: 0 + LdsPadMetadata: 0 + LocalReadVectorWidth: -1 + LocalReadVectorWidthA: 32 + LocalReadVectorWidthB: 32 + LocalReadVectorWidthMXSA: 1 + LocalReadVectorWidthMXSB: 1 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LocalWriteUseSgprMXSA: true + LocalWriteUseSgprMXSB: true + LoopIters: 2 + LoopUnroll: 256 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMXSA: 1 + MIInputPerThreadMXSB: 1 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 6] + MIWaveTileA: 4 + MIWaveTileB: 6 + MIWaveTileMXSA: 4 + MIWaveTileMXSB: 6 + MIWaveTileMetadata: 0 + MacroTile0: 128 + MacroTile1: 192 + MacroTileA: 128 + MacroTileB: 192 + MacroTileMXSA: 128 + MacroTileMXSB: 192 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + MinGRIncPerMfma: 3 + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonDTLTailLoopMXSA: true + NonDTLTailLoopMXSB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 4 + NonTemporalE: 0 + NonTemporalMXSA: 0 + 
NonTemporalMXSB: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 96 + NumGlobalWriteVectorsPerThread: 96 + NumLdsBlk: 2 + NumLoadsA: 4 + NumLoadsB: 6 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsCoalescedMXSA: 1 + NumLoadsCoalescedMXSB: 1 + NumLoadsMXSA: 1 + NumLoadsMXSB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 6 + NumLoadsPerpendicularMXSA: 1 + NumLoadsPerpendicularMXSB: 1 + NumThreads: 256 + NumTotalPackedLoadsA: -1 + NumTotalPackedLoadsB: -1 + NumTotalPackedLoadsMXSA: -1 + NumTotalPackedLoadsMXSB: -1 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGROverBarrier: 0 + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 10 + SolutionNameMin: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT128x192x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT4_6_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM16_WGMXCC2_WGMXCCGn1 + SourceSwap: false + SpaceFillingAlgo: [] + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 4 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SubGroupMXSA: 8 + SubGroupMXSB: 32 + 
SuppressNoLoadLoop: false + SwapGlobalReadOrder: 0 + TailloopInNll: false + ThreadTile: [1, 1] + ThreadTile0: 16 + ThreadTile1: 6 + ThreadTileA: 16 + ThreadTileB: 6 + ThreadTileMXSA: 16 + ThreadTileMXSB: 6 + TransposeLDS: 1 + TransposeLDSMetadata: -1 + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMXSA: true + UnrollMajorLDSMXSB: true + Use64bShadowLimit: false + Use64bShadowLimitMX: false + UseCustomMainLoopSchedule: 0 + UseDirect32XEmulation: false + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: false + UseGeneralizedNLCOneA: false + UseGeneralizedNLCOneB: false + UseGeneralizedNLCOneMXSA: false + UseGeneralizedNLCOneMXSB: false + UseGeneralizedNLCOneMetadata: false + UseInstOffsetForGRO: 0 + UseMFMAF32XEmulation: false + UsePLRPack: 0 + UseSgprForGRO: -1 + UseSubtileImpl: true + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 1 + VectorWidthMXSA: 1 + VectorWidthMXSB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMXSA: 0 + WaveSeparateGlobalReadMXSB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 16 + WorkGroupMappingXCC: 2 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 256 + _DepthUA: 256 + _DepthUB: 256 + _DepthUMXSA: 8 + _DepthUMXSB: 8 + _DepthUMetadata: 256 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + enableLDSTrMXSA: false + enableLDSTrMXSB: false + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + tailLoopOptMXSA: false + tailLoopOptMXSB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: false 
+ ActivationFused: true + AdaptiveGemm: 0 + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1DivByMT1LowbitGT1: 0 + AssertFree1ElementMultiple: 1 + AssertKRingShiftTailWrapOnly: 0 + AssertSummationElementMultiple: 256 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BAddrInterleave: false + BaseName: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT191Q3CWl85SU-4agRfSbxDDRYJaTIDmYF_adG8G4wdfNo= + BufferLoad: true + BufferStore: 1 + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 256 + DirectToLds: 1 + DirectToLdsA: true + DirectToLdsB: true + DirectToLdsMXSA: true + DirectToLdsMXSB: true + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprMXSA: false + DirectToVgprMXSB: false + DirectToVgprSparseMetadata: false + DtlPlusLdsBuf: 0 + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: false + ExpertSchedulingMode: 0 + ExtraLatencyForLR: 0 + ExtraMiLatencyLeft: -1 + ForceDisableShadowInit: false + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 32 + GlobalReadVectorWidthB: 32 + GlobalReadVectorWidthMXSA: 6 + GlobalReadVectorWidthMXSB: 4 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + KRingShift: false + Kernel: true + KernelLanguage: Assembly + KernelNameMin: 
Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT192x128x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT6_4_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 256 + LSCB: 256 + LSCMXSA: 8 + LSCMXSB: 8 + LSPA: 32 + LSPB: 32 + LSPMXSA: 192 + LSPMXSB: 128 + LVCA: 8 + LVCB: 8 + LVCMXSA: 2 + LVCMXSB: 2 + LVPA: 1 + LVPB: 1 + LVPMXSA: 32 + LVPMXSB: 32 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsBlockSizePerPadMXSA: 0 + LdsBlockSizePerPadMXSB: 0 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 109056 + LdsInitCVgprs: false + LdsNumBytes: 109056 + LdsNumElementsAlignedA: 24576 + LdsNumElementsAlignedB: 16384 + LdsNumElementsAlignedMXSA: 1536 + LdsNumElementsAlignedMXSB: 1024 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 27136 + LdsOffsetB_Blk: 92672 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMXSA: 24576 + LdsOffsetMXSA_Blk: 90112 + LdsOffsetMXSB: 26112 + LdsOffsetMXSB_Blk: 91648 + LdsOffsetMetadata: 27136 + LdsOffsetMetadata_Blk: 92672 + LdsPadA: 0 + LdsPadB: 0 + LdsPadMXSA: 0 + LdsPadMXSB: 0 + LdsPadMetadata: 0 + LocalReadVectorWidth: -1 + LocalReadVectorWidthA: 32 + LocalReadVectorWidthB: 32 + LocalReadVectorWidthMXSA: 1 + LocalReadVectorWidthMXSB: 1 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LocalWriteUseSgprMXSA: true + LocalWriteUseSgprMXSB: true + LoopIters: 2 + LoopUnroll: 256 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + 
MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMXSA: 1 + MIInputPerThreadMXSB: 1 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [6, 4] + MIWaveTileA: 6 + MIWaveTileB: 4 + MIWaveTileMXSA: 6 + MIWaveTileMXSB: 4 + MIWaveTileMetadata: 0 + MacroTile0: 192 + MacroTile1: 128 + MacroTileA: 192 + MacroTileB: 128 + MacroTileMXSA: 192 + MacroTileMXSB: 128 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + MinGRIncPerMfma: 3 + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonDTLTailLoopMXSA: true + NonDTLTailLoopMXSB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 4 + NonTemporalE: 0 + NonTemporalMXSA: 0 + NonTemporalMXSB: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 96 + NumGlobalWriteVectorsPerThread: 96 + NumLdsBlk: 2 + NumLoadsA: 6 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsCoalescedMXSA: 1 + NumLoadsCoalescedMXSB: 1 + NumLoadsMXSA: 1 + NumLoadsMXSB: 1 + NumLoadsPerpendicularA: 6 + NumLoadsPerpendicularB: 4 + NumLoadsPerpendicularMXSA: 1 + NumLoadsPerpendicularMXSB: 1 + NumThreads: 256 + NumTotalPackedLoadsA: -1 + NumTotalPackedLoadsB: -1 + NumTotalPackedLoadsMXSA: -1 + NumTotalPackedLoadsMXSB: -1 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGROverBarrier: 0 + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 11 + 
SolutionNameMin: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT192x128x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT6_4_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM16_WGMXCC2_WGMXCCGn1 + SourceSwap: false + SpaceFillingAlgo: [] + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 4 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SubGroupMXSA: 8 + SubGroupMXSB: 32 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: 0 + TailloopInNll: false + ThreadTile: [1, 1] + ThreadTile0: 24 + ThreadTile1: 4 + ThreadTileA: 24 + ThreadTileB: 4 + ThreadTileMXSA: 24 + ThreadTileMXSB: 4 + TransposeLDS: 1 + TransposeLDSMetadata: -1 + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMXSA: true + UnrollMajorLDSMXSB: true + Use64bShadowLimit: false + Use64bShadowLimitMX: false + UseCustomMainLoopSchedule: 0 + UseDirect32XEmulation: false + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: false + UseGeneralizedNLCOneA: false + UseGeneralizedNLCOneB: false + UseGeneralizedNLCOneMXSA: false + UseGeneralizedNLCOneMXSB: false + UseGeneralizedNLCOneMetadata: false + UseInstOffsetForGRO: 0 + UseMFMAF32XEmulation: false + UsePLRPack: 0 + UseSgprForGRO: -1 + UseSubtileImpl: true + Valid: true + VectorStore: -1 + VectorWidthA: 1 + 
VectorWidthB: 1 + VectorWidthMXSA: 1 + VectorWidthMXSB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMXSA: 0 + WaveSeparateGlobalReadMXSB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 16 + WorkGroupMappingXCC: 2 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 256 + _DepthUA: 256 + _DepthUB: 256 + _DepthUMXSA: 8 + _DepthUMXSB: 8 + _DepthUMetadata: 256 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + enableLDSTrMXSA: false + enableLDSTrMXSB: false + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + tailLoopOptMXSA: false + tailLoopOptMXSB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AdaptiveGemm: 0 + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1DivByMT1LowbitGT1: 0 + AssertFree1ElementMultiple: 1 + AssertKRingShiftTailWrapOnly: 0 + AssertSummationElementMultiple: 256 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BAddrInterleave: false + BaseName: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT12kwdCHpnp9d41ufLMEwnWbNNXbmlKzUrCttuc0YSlgSY= + BufferLoad: true + BufferStore: 1 + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 256 + DirectToLds: 1 + DirectToLdsA: true + DirectToLdsB: true + DirectToLdsMXSA: true + DirectToLdsMXSB: true + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprMXSA: false + DirectToVgprMXSB: false + DirectToVgprSparseMetadata: false + 
DtlPlusLdsBuf: 0 + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: false + ExpertSchedulingMode: 0 + ExtraLatencyForLR: 0 + ExtraMiLatencyLeft: -1 + ForceDisableShadowInit: false + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 32 + GlobalReadVectorWidthB: 32 + GlobalReadVectorWidthMXSA: 4 + GlobalReadVectorWidthMXSB: 4 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + KRingShift: false + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT128x256x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT4_8_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 256 + LSCB: 256 + LSCMXSA: 8 + LSCMXSB: 8 + LSPA: 32 + LSPB: 32 + LSPMXSA: 128 + LSPMXSB: 128 + LVCA: 8 + LVCB: 8 + LVCMXSA: 2 + LVCMXSB: 2 + LVPA: 1 + LVPB: 1 + LVPMXSA: 32 + LVPMXSB: 32 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsBlockSizePerPadMXSA: 0 + LdsBlockSizePerPadMXSB: 0 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 117760 + LdsInitCVgprs: false + LdsNumBytes: 117760 + 
LdsNumElementsAlignedA: 16384 + LdsNumElementsAlignedB: 32768 + LdsNumElementsAlignedMXSA: 1024 + LdsNumElementsAlignedMXSB: 2048 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 19456 + LdsOffsetB_Blk: 84992 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMXSA: 16384 + LdsOffsetMXSA_Blk: 81920 + LdsOffsetMXSB: 17408 + LdsOffsetMXSB_Blk: 82944 + LdsOffsetMetadata: 19456 + LdsOffsetMetadata_Blk: 84992 + LdsPadA: 0 + LdsPadB: 0 + LdsPadMXSA: 0 + LdsPadMXSB: 0 + LdsPadMetadata: 0 + LocalReadVectorWidth: -1 + LocalReadVectorWidthA: 32 + LocalReadVectorWidthB: 32 + LocalReadVectorWidthMXSA: 1 + LocalReadVectorWidthMXSB: 1 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LocalWriteUseSgprMXSA: true + LocalWriteUseSgprMXSB: true + LoopIters: 2 + LoopUnroll: 256 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMXSA: 1 + MIInputPerThreadMXSB: 1 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 8] + MIWaveTileA: 4 + MIWaveTileB: 8 + MIWaveTileMXSA: 4 + MIWaveTileMXSB: 8 + MIWaveTileMetadata: 0 + MacroTile0: 128 + MacroTile1: 256 + MacroTileA: 128 + MacroTileB: 256 + MacroTileMXSA: 128 + MacroTileMXSB: 256 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + MinGRIncPerMfma: 3 + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonDTLTailLoopMXSA: true + NonDTLTailLoopMXSB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 4 + NonTemporalE: 
0 + NonTemporalMXSA: 0 + NonTemporalMXSB: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 128 + NumLdsBlk: 2 + NumLoadsA: 4 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsCoalescedMXSA: 1 + NumLoadsCoalescedMXSB: 1 + NumLoadsMXSA: 1 + NumLoadsMXSB: 2 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 8 + NumLoadsPerpendicularMXSA: 1 + NumLoadsPerpendicularMXSB: 2 + NumThreads: 256 + NumTotalPackedLoadsA: -1 + NumTotalPackedLoadsB: -1 + NumTotalPackedLoadsMXSA: -1 + NumTotalPackedLoadsMXSB: -1 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGROverBarrier: 0 + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 12 + SolutionNameMin: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT128x256x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT4_8_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM16_WGMXCC2_WGMXCCGn1 + SourceSwap: false + SpaceFillingAlgo: [] + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 4 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + 
SubGroupMXSA: 8 + SubGroupMXSB: 32 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: 0 + TailloopInNll: false + ThreadTile: [1, 1] + ThreadTile0: 16 + ThreadTile1: 8 + ThreadTileA: 16 + ThreadTileB: 8 + ThreadTileMXSA: 16 + ThreadTileMXSB: 8 + TransposeLDS: 1 + TransposeLDSMetadata: -1 + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMXSA: true + UnrollMajorLDSMXSB: true + Use64bShadowLimit: false + Use64bShadowLimitMX: false + UseCustomMainLoopSchedule: 0 + UseDirect32XEmulation: false + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: false + UseGeneralizedNLCOneA: false + UseGeneralizedNLCOneB: false + UseGeneralizedNLCOneMXSA: false + UseGeneralizedNLCOneMXSB: false + UseGeneralizedNLCOneMetadata: false + UseInstOffsetForGRO: 0 + UseMFMAF32XEmulation: false + UsePLRPack: 0 + UseSgprForGRO: -1 + UseSubtileImpl: true + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 1 + VectorWidthMXSA: 1 + VectorWidthMXSB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMXSA: 0 + WaveSeparateGlobalReadMXSB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 16 + WorkGroupMappingXCC: 2 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 256 + _DepthUA: 256 + _DepthUB: 256 + _DepthUMXSA: 8 + _DepthUMXSB: 8 + _DepthUMetadata: 256 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + enableLDSTrMXSA: false + enableLDSTrMXSB: false + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + tailLoopOptMXSA: false + tailLoopOptMXSB: false + - 1LDSBuffer: 0 + 
ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AdaptiveGemm: 0 + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1DivByMT1LowbitGT1: 0 + AssertFree1ElementMultiple: 1 + AssertKRingShiftTailWrapOnly: 0 + AssertSummationElementMultiple: 256 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BAddrInterleave: false + BaseName: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT25s2SqPqjqQTrutnl4Fjsck3utFxYLnrt2GxRGoMabbq0= + BufferLoad: true + BufferStore: 1 + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 256 + DirectToLds: 1 + DirectToLdsA: true + DirectToLdsB: true + DirectToLdsMXSA: true + DirectToLdsMXSB: true + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprMXSA: false + DirectToVgprMXSB: false + DirectToVgprSparseMetadata: false + DtlPlusLdsBuf: 0 + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: false + ExpertSchedulingMode: 0 + ExtraLatencyForLR: 0 + ExtraMiLatencyLeft: -1 + ForceDisableShadowInit: false + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 32 + GlobalReadVectorWidthB: 32 + GlobalReadVectorWidthMXSA: 4 + GlobalReadVectorWidthMXSB: 4 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + KRingShift: false + Kernel: true + KernelLanguage: Assembly + KernelNameMin: 
Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT256x128x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT8_4_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 256 + LSCB: 256 + LSCMXSA: 8 + LSCMXSB: 8 + LSPA: 32 + LSPB: 32 + LSPMXSA: 128 + LSPMXSB: 128 + LVCA: 8 + LVCB: 8 + LVCMXSA: 2 + LVCMXSB: 2 + LVPA: 1 + LVPB: 1 + LVPMXSA: 32 + LVPMXSB: 32 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsBlockSizePerPadMXSA: 0 + LdsBlockSizePerPadMXSB: 0 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 117760 + LdsInitCVgprs: false + LdsNumBytes: 117760 + LdsNumElementsAlignedA: 32768 + LdsNumElementsAlignedB: 16384 + LdsNumElementsAlignedMXSA: 2048 + LdsNumElementsAlignedMXSB: 1024 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 35840 + LdsOffsetB_Blk: 101376 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMXSA: 32768 + LdsOffsetMXSA_Blk: 98304 + LdsOffsetMXSB: 34816 + LdsOffsetMXSB_Blk: 100352 + LdsOffsetMetadata: 35840 + LdsOffsetMetadata_Blk: 101376 + LdsPadA: 0 + LdsPadB: 0 + LdsPadMXSA: 0 + LdsPadMXSB: 0 + LdsPadMetadata: 0 + LocalReadVectorWidth: -1 + LocalReadVectorWidthA: 32 + LocalReadVectorWidthB: 32 + LocalReadVectorWidthMXSA: 1 + LocalReadVectorWidthMXSB: 1 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LocalWriteUseSgprMXSA: true + LocalWriteUseSgprMXSB: true + LoopIters: 2 + LoopUnroll: 256 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + 
MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMXSA: 1 + MIInputPerThreadMXSB: 1 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [8, 4] + MIWaveTileA: 8 + MIWaveTileB: 4 + MIWaveTileMXSA: 8 + MIWaveTileMXSB: 4 + MIWaveTileMetadata: 0 + MacroTile0: 256 + MacroTile1: 128 + MacroTileA: 256 + MacroTileB: 128 + MacroTileMXSA: 256 + MacroTileMXSB: 128 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + MinGRIncPerMfma: 3 + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonDTLTailLoopMXSA: true + NonDTLTailLoopMXSB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 4 + NonTemporalE: 0 + NonTemporalMXSA: 0 + NonTemporalMXSB: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 128 + NumLdsBlk: 2 + NumLoadsA: 8 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsCoalescedMXSA: 1 + NumLoadsCoalescedMXSB: 1 + NumLoadsMXSA: 2 + NumLoadsMXSB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 4 + NumLoadsPerpendicularMXSA: 2 + NumLoadsPerpendicularMXSB: 1 + NumThreads: 256 + NumTotalPackedLoadsA: -1 + NumTotalPackedLoadsB: -1 + NumTotalPackedLoadsMXSA: -1 + NumTotalPackedLoadsMXSB: -1 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGROverBarrier: 0 + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 13 + 
SolutionNameMin: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT256x128x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT8_4_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM16_WGMXCC2_WGMXCCGn1 + SourceSwap: false + SpaceFillingAlgo: [] + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 4 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SubGroupMXSA: 8 + SubGroupMXSB: 32 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: 0 + TailloopInNll: false + ThreadTile: [1, 1] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + ThreadTileMXSA: 32 + ThreadTileMXSB: 4 + TransposeLDS: 1 + TransposeLDSMetadata: -1 + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMXSA: true + UnrollMajorLDSMXSB: true + Use64bShadowLimit: false + Use64bShadowLimitMX: false + UseCustomMainLoopSchedule: 0 + UseDirect32XEmulation: false + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: false + UseGeneralizedNLCOneA: false + UseGeneralizedNLCOneB: false + UseGeneralizedNLCOneMXSA: false + UseGeneralizedNLCOneMXSB: false + UseGeneralizedNLCOneMetadata: false + UseInstOffsetForGRO: 0 + UseMFMAF32XEmulation: false + UsePLRPack: 0 + UseSgprForGRO: -1 + UseSubtileImpl: true + Valid: true + VectorStore: -1 + VectorWidthA: 1 + 
VectorWidthB: 1 + VectorWidthMXSA: 1 + VectorWidthMXSB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMXSA: 0 + WaveSeparateGlobalReadMXSB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 16 + WorkGroupMappingXCC: 2 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 256 + _DepthUA: 256 + _DepthUB: 256 + _DepthUMXSA: 8 + _DepthUMXSB: 8 + _DepthUMetadata: 256 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + enableLDSTrMXSA: false + enableLDSTrMXSB: false + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + tailLoopOptMXSA: false + tailLoopOptMXSB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AdaptiveGemm: 0 + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1DivByMT1LowbitGT1: 0 + AssertFree1ElementMultiple: 1 + AssertKRingShiftTailWrapOnly: 0 + AssertSummationElementMultiple: 256 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BAddrInterleave: false + BaseName: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT19--ULatdGkaEyMxugPoO0m7KB92W8g0H0S_IE47lq-mE= + BufferLoad: true + BufferStore: 1 + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 256 + DirectToLds: 1 + DirectToLdsA: true + DirectToLdsB: true + DirectToLdsMXSA: true + DirectToLdsMXSB: true + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprMXSA: false + DirectToVgprMXSB: false + DirectToVgprSparseMetadata: false + 
DtlPlusLdsBuf: 0 + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: false + ExpertSchedulingMode: 0 + ExtraLatencyForLR: 0 + ExtraMiLatencyLeft: -1 + ForceDisableShadowInit: false + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 32 + GlobalReadVectorWidthB: 32 + GlobalReadVectorWidthMXSA: 6 + GlobalReadVectorWidthMXSB: 4 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + KRingShift: false + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT192x256x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT6_8_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 256 + LSCB: 256 + LSCMXSA: 8 + LSCMXSB: 8 + LSPA: 32 + LSPB: 32 + LSPMXSA: 192 + LSPMXSB: 128 + LVCA: 8 + LVCB: 8 + LVCMXSA: 2 + LVCMXSB: 2 + LVPA: 1 + LVPB: 1 + LVPMXSA: 32 + LVPMXSB: 32 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsBlockSizePerPadMXSA: 0 + LdsBlockSizePerPadMXSB: 0 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 126464 + LdsInitCVgprs: false + LdsNumBytes: 126464 + 
LdsNumElementsAlignedA: 24576 + LdsNumElementsAlignedB: 32768 + LdsNumElementsAlignedMXSA: 1536 + LdsNumElementsAlignedMXSB: 2048 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 28160 + LdsOffsetB_Blk: 93696 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMXSA: 24576 + LdsOffsetMXSA_Blk: 90112 + LdsOffsetMXSB: 26112 + LdsOffsetMXSB_Blk: 91648 + LdsOffsetMetadata: 28160 + LdsOffsetMetadata_Blk: 93696 + LdsPadA: 0 + LdsPadB: 0 + LdsPadMXSA: 0 + LdsPadMXSB: 0 + LdsPadMetadata: 0 + LocalReadVectorWidth: -1 + LocalReadVectorWidthA: 32 + LocalReadVectorWidthB: 32 + LocalReadVectorWidthMXSA: 1 + LocalReadVectorWidthMXSB: 1 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LocalWriteUseSgprMXSA: true + LocalWriteUseSgprMXSB: true + LoopIters: 2 + LoopUnroll: 256 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMXSA: 1 + MIInputPerThreadMXSB: 1 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [6, 8] + MIWaveTileA: 6 + MIWaveTileB: 8 + MIWaveTileMXSA: 6 + MIWaveTileMXSB: 8 + MIWaveTileMetadata: 0 + MacroTile0: 192 + MacroTile1: 256 + MacroTileA: 192 + MacroTileB: 256 + MacroTileMXSA: 192 + MacroTileMXSB: 256 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + MinGRIncPerMfma: 3 + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonDTLTailLoopMXSA: true + NonDTLTailLoopMXSB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 4 + NonTemporalE: 
0 + NonTemporalMXSA: 0 + NonTemporalMXSB: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 192 + NumGlobalWriteVectorsPerThread: 192 + NumLdsBlk: 2 + NumLoadsA: 6 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsCoalescedMXSA: 1 + NumLoadsCoalescedMXSB: 1 + NumLoadsMXSA: 1 + NumLoadsMXSB: 2 + NumLoadsPerpendicularA: 6 + NumLoadsPerpendicularB: 8 + NumLoadsPerpendicularMXSA: 1 + NumLoadsPerpendicularMXSB: 2 + NumThreads: 256 + NumTotalPackedLoadsA: -1 + NumTotalPackedLoadsB: -1 + NumTotalPackedLoadsMXSA: -1 + NumTotalPackedLoadsMXSB: -1 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGROverBarrier: 0 + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 14 + SolutionNameMin: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT192x256x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT6_8_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM16_WGMXCC2_WGMXCCGn1 + SourceSwap: false + SpaceFillingAlgo: [] + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 4 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + 
SubGroupMXSA: 8 + SubGroupMXSB: 32 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: 0 + TailloopInNll: false + ThreadTile: [1, 1] + ThreadTile0: 24 + ThreadTile1: 8 + ThreadTileA: 24 + ThreadTileB: 8 + ThreadTileMXSA: 24 + ThreadTileMXSB: 8 + TransposeLDS: 1 + TransposeLDSMetadata: -1 + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMXSA: true + UnrollMajorLDSMXSB: true + Use64bShadowLimit: false + Use64bShadowLimitMX: false + UseCustomMainLoopSchedule: 0 + UseDirect32XEmulation: false + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: false + UseGeneralizedNLCOneA: false + UseGeneralizedNLCOneB: false + UseGeneralizedNLCOneMXSA: false + UseGeneralizedNLCOneMXSB: false + UseGeneralizedNLCOneMetadata: false + UseInstOffsetForGRO: 0 + UseMFMAF32XEmulation: false + UsePLRPack: 0 + UseSgprForGRO: -1 + UseSubtileImpl: true + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 1 + VectorWidthMXSA: 1 + VectorWidthMXSB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMXSA: 0 + WaveSeparateGlobalReadMXSB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 16 + WorkGroupMappingXCC: 2 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 256 + _DepthUA: 256 + _DepthUB: 256 + _DepthUMXSA: 8 + _DepthUMXSB: 8 + _DepthUMetadata: 256 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + enableLDSTrMXSA: false + enableLDSTrMXSB: false + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + tailLoopOptMXSA: false + tailLoopOptMXSB: false + - 1LDSBuffer: 0 + 
ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AdaptiveGemm: 0 + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1DivByMT1LowbitGT1: 0 + AssertFree1ElementMultiple: 1 + AssertKRingShiftTailWrapOnly: 0 + AssertSummationElementMultiple: 256 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BAddrInterleave: false + BaseName: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT25H-6yQcCGxSKwn5gmGJAHA93oK-ekIRJm5JqDOya5ASk= + BufferLoad: true + BufferStore: 1 + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 256 + DirectToLds: 1 + DirectToLdsA: true + DirectToLdsB: true + DirectToLdsMXSA: true + DirectToLdsMXSB: true + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprMXSA: false + DirectToVgprMXSB: false + DirectToVgprSparseMetadata: false + DtlPlusLdsBuf: 0 + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: false + ExpertSchedulingMode: 0 + ExtraLatencyForLR: 0 + ExtraMiLatencyLeft: -1 + ForceDisableShadowInit: false + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 32 + GlobalReadVectorWidthB: 32 + GlobalReadVectorWidthMXSA: 4 + GlobalReadVectorWidthMXSB: 6 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + KRingShift: false + Kernel: true + KernelLanguage: Assembly + KernelNameMin: 
Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT256x192x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT8_6_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 256 + LSCB: 256 + LSCMXSA: 8 + LSCMXSB: 8 + LSPA: 32 + LSPB: 32 + LSPMXSA: 128 + LSPMXSB: 192 + LVCA: 8 + LVCB: 8 + LVCMXSA: 2 + LVCMXSB: 2 + LVPA: 1 + LVPB: 1 + LVPMXSA: 32 + LVPMXSB: 32 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsBlockSizePerPadMXSA: 0 + LdsBlockSizePerPadMXSB: 0 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 126464 + LdsInitCVgprs: false + LdsNumBytes: 126464 + LdsNumElementsAlignedA: 32768 + LdsNumElementsAlignedB: 24576 + LdsNumElementsAlignedMXSA: 2048 + LdsNumElementsAlignedMXSB: 1536 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 36352 + LdsOffsetB_Blk: 101888 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMXSA: 32768 + LdsOffsetMXSA_Blk: 98304 + LdsOffsetMXSB: 34816 + LdsOffsetMXSB_Blk: 100352 + LdsOffsetMetadata: 36352 + LdsOffsetMetadata_Blk: 101888 + LdsPadA: 0 + LdsPadB: 0 + LdsPadMXSA: 0 + LdsPadMXSB: 0 + LdsPadMetadata: 0 + LocalReadVectorWidth: -1 + LocalReadVectorWidthA: 32 + LocalReadVectorWidthB: 32 + LocalReadVectorWidthMXSA: 1 + LocalReadVectorWidthMXSB: 1 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LocalWriteUseSgprMXSA: true + LocalWriteUseSgprMXSB: true + LoopIters: 2 + LoopUnroll: 256 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + 
MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMXSA: 1 + MIInputPerThreadMXSB: 1 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [8, 6] + MIWaveTileA: 8 + MIWaveTileB: 6 + MIWaveTileMXSA: 8 + MIWaveTileMXSB: 6 + MIWaveTileMetadata: 0 + MacroTile0: 256 + MacroTile1: 192 + MacroTileA: 256 + MacroTileB: 192 + MacroTileMXSA: 256 + MacroTileMXSB: 192 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + MinGRIncPerMfma: 3 + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonDTLTailLoopMXSA: true + NonDTLTailLoopMXSB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 4 + NonTemporalE: 0 + NonTemporalMXSA: 0 + NonTemporalMXSB: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 192 + NumGlobalWriteVectorsPerThread: 192 + NumLdsBlk: 2 + NumLoadsA: 8 + NumLoadsB: 6 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsCoalescedMXSA: 1 + NumLoadsCoalescedMXSB: 1 + NumLoadsMXSA: 2 + NumLoadsMXSB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 6 + NumLoadsPerpendicularMXSA: 2 + NumLoadsPerpendicularMXSB: 1 + NumThreads: 256 + NumTotalPackedLoadsA: -1 + NumTotalPackedLoadsB: -1 + NumTotalPackedLoadsMXSA: -1 + NumTotalPackedLoadsMXSB: -1 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGROverBarrier: 0 + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 15 + 
SolutionNameMin: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT256x192x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT8_6_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM16_WGMXCC2_WGMXCCGn1 + SourceSwap: false + SpaceFillingAlgo: [] + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 4 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SubGroupMXSA: 8 + SubGroupMXSB: 32 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: 0 + TailloopInNll: false + ThreadTile: [1, 1] + ThreadTile0: 32 + ThreadTile1: 6 + ThreadTileA: 32 + ThreadTileB: 6 + ThreadTileMXSA: 32 + ThreadTileMXSB: 6 + TransposeLDS: 1 + TransposeLDSMetadata: -1 + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMXSA: true + UnrollMajorLDSMXSB: true + Use64bShadowLimit: false + Use64bShadowLimitMX: false + UseCustomMainLoopSchedule: 0 + UseDirect32XEmulation: false + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: false + UseGeneralizedNLCOneA: false + UseGeneralizedNLCOneB: false + UseGeneralizedNLCOneMXSA: false + UseGeneralizedNLCOneMXSB: false + UseGeneralizedNLCOneMetadata: false + UseInstOffsetForGRO: 0 + UseMFMAF32XEmulation: false + UsePLRPack: 0 + UseSgprForGRO: -1 + UseSubtileImpl: true + Valid: true + VectorStore: -1 + VectorWidthA: 1 + 
VectorWidthB: 1 + VectorWidthMXSA: 1 + VectorWidthMXSB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMXSA: 0 + WaveSeparateGlobalReadMXSB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 16 + WorkGroupMappingXCC: 2 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 256 + _DepthUA: 256 + _DepthUB: 256 + _DepthUMXSA: 8 + _DepthUMXSB: 8 + _DepthUMetadata: 256 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + enableLDSTrMXSA: false + enableLDSTrMXSB: false + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + tailLoopOptMXSA: false + tailLoopOptMXSB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AdaptiveGemm: 0 + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1DivByMT1LowbitGT1: 0 + AssertFree1ElementMultiple: 1 + AssertKRingShiftTailWrapOnly: 0 + AssertSummationElementMultiple: 256 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BAddrInterleave: false + BaseName: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT649Xx58paO0aNamHRyhpgRWxjDG0kBNJ5sAnNiMy2USyI= + BufferLoad: true + BufferStore: 1 + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 256 + DirectToLds: 1 + DirectToLdsA: true + DirectToLdsB: true + DirectToLdsMXSA: true + DirectToLdsMXSB: true + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprMXSA: false + DirectToVgprMXSB: false + DirectToVgprSparseMetadata: false + 
DtlPlusLdsBuf: 0 + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: false + ExpertSchedulingMode: 0 + ExtraLatencyForLR: 0 + ExtraMiLatencyLeft: -1 + ForceDisableShadowInit: false + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 32 + GlobalReadVectorWidthB: 32 + GlobalReadVectorWidthMXSA: 2 + GlobalReadVectorWidthMXSB: 10 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + KRingShift: false + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT64x320x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT2_10_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 256 + LSCB: 256 + LSCMXSA: 8 + LSCMXSB: 8 + LSPA: 32 + LSPB: 32 + LSPMXSA: 64 + LSPMXSB: 320 + LVCA: 8 + LVCB: 8 + LVCMXSA: 4 + LVCMXSB: 1 + LVPA: 1 + LVPB: 1 + LVPMXSA: 32 + LVPMXSB: 32 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsBlockSizePerPadMXSA: 0 + LdsBlockSizePerPadMXSB: 0 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 117760 + LdsInitCVgprs: false + LdsNumBytes: 117760 + 
LdsNumElementsAlignedA: 8192 + LdsNumElementsAlignedB: 40960 + LdsNumElementsAlignedMXSA: 512 + LdsNumElementsAlignedMXSB: 2560 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 11264 + LdsOffsetB_Blk: 76800 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMXSA: 8192 + LdsOffsetMXSA_Blk: 73728 + LdsOffsetMXSB: 8704 + LdsOffsetMXSB_Blk: 74240 + LdsOffsetMetadata: 11264 + LdsOffsetMetadata_Blk: 76800 + LdsPadA: 0 + LdsPadB: 0 + LdsPadMXSA: 0 + LdsPadMXSB: 0 + LdsPadMetadata: 0 + LocalReadVectorWidth: -1 + LocalReadVectorWidthA: 32 + LocalReadVectorWidthB: 32 + LocalReadVectorWidthMXSA: 1 + LocalReadVectorWidthMXSB: 1 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LocalWriteUseSgprMXSA: true + LocalWriteUseSgprMXSB: true + LoopIters: 2 + LoopUnroll: 256 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMXSA: 1 + MIInputPerThreadMXSB: 1 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [2, 10] + MIWaveTileA: 2 + MIWaveTileB: 10 + MIWaveTileMXSA: 2 + MIWaveTileMXSB: 10 + MIWaveTileMetadata: 0 + MacroTile0: 64 + MacroTile1: 320 + MacroTileA: 64 + MacroTileB: 320 + MacroTileMXSA: 64 + MacroTileMXSB: 320 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + MinGRIncPerMfma: 3 + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonDTLTailLoopMXSA: true + NonDTLTailLoopMXSB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 4 + NonTemporalE: 0 + 
NonTemporalMXSA: 0 + NonTemporalMXSB: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 80 + NumGlobalWriteVectorsPerThread: 80 + NumLdsBlk: 2 + NumLoadsA: 2 + NumLoadsB: 10 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsCoalescedMXSA: 1 + NumLoadsCoalescedMXSB: 1 + NumLoadsMXSA: 1 + NumLoadsMXSB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 10 + NumLoadsPerpendicularMXSA: 1 + NumLoadsPerpendicularMXSB: 1 + NumThreads: 256 + NumTotalPackedLoadsA: -1 + NumTotalPackedLoadsB: -1 + NumTotalPackedLoadsMXSA: -1 + NumTotalPackedLoadsMXSB: -1 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGROverBarrier: 0 + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 16 + SolutionNameMin: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT64x320x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT2_10_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM16_WGMXCC2_WGMXCCGn1 + SourceSwap: false + SpaceFillingAlgo: [] + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 4 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SubGroupMXSA: 
8 + SubGroupMXSB: 32 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: 0 + TailloopInNll: false + ThreadTile: [1, 1] + ThreadTile0: 8 + ThreadTile1: 10 + ThreadTileA: 8 + ThreadTileB: 10 + ThreadTileMXSA: 8 + ThreadTileMXSB: 10 + TransposeLDS: 1 + TransposeLDSMetadata: -1 + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMXSA: true + UnrollMajorLDSMXSB: true + Use64bShadowLimit: false + Use64bShadowLimitMX: false + UseCustomMainLoopSchedule: 0 + UseDirect32XEmulation: false + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: false + UseGeneralizedNLCOneA: false + UseGeneralizedNLCOneB: false + UseGeneralizedNLCOneMXSA: false + UseGeneralizedNLCOneMXSB: false + UseGeneralizedNLCOneMetadata: false + UseInstOffsetForGRO: 0 + UseMFMAF32XEmulation: false + UsePLRPack: 0 + UseSgprForGRO: -1 + UseSubtileImpl: true + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 1 + VectorWidthMXSA: 1 + VectorWidthMXSB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMXSA: 0 + WaveSeparateGlobalReadMXSB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 16 + WorkGroupMappingXCC: 2 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 256 + _DepthUA: 256 + _DepthUB: 256 + _DepthUMXSA: 8 + _DepthUMXSB: 8 + _DepthUMetadata: 256 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + enableLDSTrMXSA: false + enableLDSTrMXSB: false + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + tailLoopOptMXSA: false + tailLoopOptMXSB: false + - 1LDSBuffer: 0 + ActivationAlt: false + 
ActivationFuncCall: false + ActivationFused: true + AdaptiveGemm: 0 + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1DivByMT1LowbitGT1: 0 + AssertFree1ElementMultiple: 1 + AssertKRingShiftTailWrapOnly: 0 + AssertSummationElementMultiple: 256 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BAddrInterleave: false + BaseName: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT32yn2lSZhgGG39-O0tN9XB-c6Nr5kcbdLKPNrKxqSEhVE= + BufferLoad: true + BufferStore: 1 + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 256 + DirectToLds: 1 + DirectToLdsA: true + DirectToLdsB: true + DirectToLdsMXSA: true + DirectToLdsMXSB: true + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprMXSA: false + DirectToVgprMXSB: false + DirectToVgprSparseMetadata: false + DtlPlusLdsBuf: 0 + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: false + ExpertSchedulingMode: 0 + ExtraLatencyForLR: 0 + ExtraMiLatencyLeft: -1 + ForceDisableShadowInit: false + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 32 + GlobalReadVectorWidthB: 32 + GlobalReadVectorWidthMXSA: 10 + GlobalReadVectorWidthMXSB: 2 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + KRingShift: false + Kernel: true + KernelLanguage: Assembly + KernelNameMin: 
Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT320x64x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT10_2_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 256 + LSCB: 256 + LSCMXSA: 8 + LSCMXSB: 8 + LSPA: 32 + LSPB: 32 + LSPMXSA: 320 + LSPMXSB: 64 + LVCA: 8 + LVCB: 8 + LVCMXSA: 1 + LVCMXSB: 4 + LVPA: 1 + LVPB: 1 + LVPMXSA: 32 + LVPMXSB: 32 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsBlockSizePerPadMXSA: 0 + LdsBlockSizePerPadMXSB: 0 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 117760 + LdsInitCVgprs: false + LdsNumBytes: 117760 + LdsNumElementsAlignedA: 40960 + LdsNumElementsAlignedB: 8192 + LdsNumElementsAlignedMXSA: 2560 + LdsNumElementsAlignedMXSB: 512 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 44032 + LdsOffsetB_Blk: 109568 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMXSA: 40960 + LdsOffsetMXSA_Blk: 106496 + LdsOffsetMXSB: 43520 + LdsOffsetMXSB_Blk: 109056 + LdsOffsetMetadata: 44032 + LdsOffsetMetadata_Blk: 109568 + LdsPadA: 0 + LdsPadB: 0 + LdsPadMXSA: 0 + LdsPadMXSB: 0 + LdsPadMetadata: 0 + LocalReadVectorWidth: -1 + LocalReadVectorWidthA: 32 + LocalReadVectorWidthB: 32 + LocalReadVectorWidthMXSA: 1 + LocalReadVectorWidthMXSB: 1 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LocalWriteUseSgprMXSA: true + LocalWriteUseSgprMXSB: true + LoopIters: 2 + LoopUnroll: 256 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + 
MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMXSA: 1 + MIInputPerThreadMXSB: 1 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [10, 2] + MIWaveTileA: 10 + MIWaveTileB: 2 + MIWaveTileMXSA: 10 + MIWaveTileMXSB: 2 + MIWaveTileMetadata: 0 + MacroTile0: 320 + MacroTile1: 64 + MacroTileA: 320 + MacroTileB: 64 + MacroTileMXSA: 320 + MacroTileMXSB: 64 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + MinGRIncPerMfma: 3 + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonDTLTailLoopMXSA: true + NonDTLTailLoopMXSB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 4 + NonTemporalE: 0 + NonTemporalMXSA: 0 + NonTemporalMXSB: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 80 + NumGlobalWriteVectorsPerThread: 80 + NumLdsBlk: 2 + NumLoadsA: 10 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsCoalescedMXSA: 1 + NumLoadsCoalescedMXSB: 1 + NumLoadsMXSA: 1 + NumLoadsMXSB: 1 + NumLoadsPerpendicularA: 10 + NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularMXSA: 1 + NumLoadsPerpendicularMXSB: 1 + NumThreads: 256 + NumTotalPackedLoadsA: -1 + NumTotalPackedLoadsB: -1 + NumTotalPackedLoadsMXSA: -1 + NumTotalPackedLoadsMXSB: -1 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGROverBarrier: 0 + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 17 + 
SolutionNameMin: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT320x64x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT10_2_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM16_WGMXCC2_WGMXCCGn1 + SourceSwap: false + SpaceFillingAlgo: [] + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 4 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SubGroupMXSA: 8 + SubGroupMXSB: 32 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: 0 + TailloopInNll: false + ThreadTile: [1, 1] + ThreadTile0: 40 + ThreadTile1: 2 + ThreadTileA: 40 + ThreadTileB: 2 + ThreadTileMXSA: 40 + ThreadTileMXSB: 2 + TransposeLDS: 1 + TransposeLDSMetadata: -1 + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMXSA: true + UnrollMajorLDSMXSB: true + Use64bShadowLimit: false + Use64bShadowLimitMX: false + UseCustomMainLoopSchedule: 0 + UseDirect32XEmulation: false + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: false + UseGeneralizedNLCOneA: false + UseGeneralizedNLCOneB: false + UseGeneralizedNLCOneMXSA: false + UseGeneralizedNLCOneMXSB: false + UseGeneralizedNLCOneMetadata: false + UseInstOffsetForGRO: 0 + UseMFMAF32XEmulation: false + UsePLRPack: 0 + UseSgprForGRO: -1 + UseSubtileImpl: true + Valid: true + VectorStore: -1 + VectorWidthA: 1 + 
VectorWidthB: 1 + VectorWidthMXSA: 1 + VectorWidthMXSB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMXSA: 0 + WaveSeparateGlobalReadMXSB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 16 + WorkGroupMappingXCC: 2 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 256 + _DepthUA: 256 + _DepthUB: 256 + _DepthUMXSA: 8 + _DepthUMXSB: 8 + _DepthUMetadata: 256 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + enableLDSTrMXSA: false + enableLDSTrMXSB: false + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + tailLoopOptMXSA: false + tailLoopOptMXSB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AdaptiveGemm: 0 + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1DivByMT1LowbitGT1: 0 + AssertFree1ElementMultiple: 1 + AssertKRingShiftTailWrapOnly: 0 + AssertSummationElementMultiple: 256 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BAddrInterleave: false + BaseName: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT64cx8jpYzIAVvV24Gqg4hoX8QdixxbIDmUlvaleHLuajs= + BufferLoad: true + BufferStore: 1 + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 256 + DirectToLds: 1 + DirectToLdsA: true + DirectToLdsB: true + DirectToLdsMXSA: true + DirectToLdsMXSB: true + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprMXSA: false + DirectToVgprMXSB: false + DirectToVgprSparseMetadata: false + 
DtlPlusLdsBuf: 0 + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: false + ExpertSchedulingMode: 0 + ExtraLatencyForLR: 0 + ExtraMiLatencyLeft: -1 + ForceDisableShadowInit: false + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 32 + GlobalReadVectorWidthB: 32 + GlobalReadVectorWidthMXSA: 2 + GlobalReadVectorWidthMXSB: 12 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + KRingShift: false + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT64x384x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT2_12_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 256 + LSCB: 256 + LSCMXSA: 8 + LSCMXSB: 8 + LSPA: 32 + LSPB: 32 + LSPMXSA: 64 + LSPMXSB: 384 + LVCA: 8 + LVCB: 8 + LVCMXSA: 4 + LVCMXSB: 1 + LVPA: 1 + LVPB: 1 + LVPMXSA: 32 + LVPMXSB: 32 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsBlockSizePerPadMXSA: 0 + LdsBlockSizePerPadMXSB: 0 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 126464 + LdsInitCVgprs: false + LdsNumBytes: 126464 + 
LdsNumElementsAlignedA: 8192 + LdsNumElementsAlignedB: 49152 + LdsNumElementsAlignedMXSA: 512 + LdsNumElementsAlignedMXSB: 3072 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 11776 + LdsOffsetB_Blk: 77312 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMXSA: 8192 + LdsOffsetMXSA_Blk: 73728 + LdsOffsetMXSB: 8704 + LdsOffsetMXSB_Blk: 74240 + LdsOffsetMetadata: 11776 + LdsOffsetMetadata_Blk: 77312 + LdsPadA: 0 + LdsPadB: 0 + LdsPadMXSA: 0 + LdsPadMXSB: 0 + LdsPadMetadata: 0 + LocalReadVectorWidth: -1 + LocalReadVectorWidthA: 32 + LocalReadVectorWidthB: 32 + LocalReadVectorWidthMXSA: 1 + LocalReadVectorWidthMXSB: 1 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LocalWriteUseSgprMXSA: true + LocalWriteUseSgprMXSB: true + LoopIters: 2 + LoopUnroll: 256 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMXSA: 1 + MIInputPerThreadMXSB: 1 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [2, 12] + MIWaveTileA: 2 + MIWaveTileB: 12 + MIWaveTileMXSA: 2 + MIWaveTileMXSB: 12 + MIWaveTileMetadata: 0 + MacroTile0: 64 + MacroTile1: 384 + MacroTileA: 64 + MacroTileB: 384 + MacroTileMXSA: 64 + MacroTileMXSB: 384 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + MinGRIncPerMfma: 3 + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonDTLTailLoopMXSA: true + NonDTLTailLoopMXSB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 4 + NonTemporalE: 0 + 
NonTemporalMXSA: 0 + NonTemporalMXSB: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 96 + NumGlobalWriteVectorsPerThread: 96 + NumLdsBlk: 2 + NumLoadsA: 2 + NumLoadsB: 12 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsCoalescedMXSA: 1 + NumLoadsCoalescedMXSB: 1 + NumLoadsMXSA: 1 + NumLoadsMXSB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 12 + NumLoadsPerpendicularMXSA: 1 + NumLoadsPerpendicularMXSB: 1 + NumThreads: 256 + NumTotalPackedLoadsA: -1 + NumTotalPackedLoadsB: -1 + NumTotalPackedLoadsMXSA: -1 + NumTotalPackedLoadsMXSB: -1 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGROverBarrier: 0 + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 18 + SolutionNameMin: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT64x384x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT2_12_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM16_WGMXCC2_WGMXCCGn1 + SourceSwap: false + SpaceFillingAlgo: [] + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 4 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SubGroupMXSA: 
8 + SubGroupMXSB: 32 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: 0 + TailloopInNll: false + ThreadTile: [1, 1] + ThreadTile0: 8 + ThreadTile1: 12 + ThreadTileA: 8 + ThreadTileB: 12 + ThreadTileMXSA: 8 + ThreadTileMXSB: 12 + TransposeLDS: 1 + TransposeLDSMetadata: -1 + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMXSA: true + UnrollMajorLDSMXSB: true + Use64bShadowLimit: false + Use64bShadowLimitMX: false + UseCustomMainLoopSchedule: 0 + UseDirect32XEmulation: false + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: false + UseGeneralizedNLCOneA: false + UseGeneralizedNLCOneB: false + UseGeneralizedNLCOneMXSA: false + UseGeneralizedNLCOneMXSB: false + UseGeneralizedNLCOneMetadata: false + UseInstOffsetForGRO: 0 + UseMFMAF32XEmulation: false + UsePLRPack: 0 + UseSgprForGRO: -1 + UseSubtileImpl: true + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 1 + VectorWidthMXSA: 1 + VectorWidthMXSB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMXSA: 0 + WaveSeparateGlobalReadMXSB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 16 + WorkGroupMappingXCC: 2 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 256 + _DepthUA: 256 + _DepthUB: 256 + _DepthUMXSA: 8 + _DepthUMXSB: 8 + _DepthUMetadata: 256 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + enableLDSTrMXSA: false + enableLDSTrMXSB: false + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + tailLoopOptMXSA: false + tailLoopOptMXSB: false + - 1LDSBuffer: 0 + ActivationAlt: false + 
ActivationFuncCall: false + ActivationFused: true + AdaptiveGemm: 0 + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1DivByMT1LowbitGT1: 0 + AssertFree1ElementMultiple: 1 + AssertKRingShiftTailWrapOnly: 0 + AssertSummationElementMultiple: 256 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BAddrInterleave: false + BaseName: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT38m0Bm4hH2y_fb6-eo4XZRKb5dkkBhjv6q0Rd-CAzTq-Q= + BufferLoad: true + BufferStore: 1 + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 256 + DirectToLds: 1 + DirectToLdsA: true + DirectToLdsB: true + DirectToLdsMXSA: true + DirectToLdsMXSB: true + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprMXSA: false + DirectToVgprMXSB: false + DirectToVgprSparseMetadata: false + DtlPlusLdsBuf: 0 + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: false + ExpertSchedulingMode: 0 + ExtraLatencyForLR: 0 + ExtraMiLatencyLeft: -1 + ForceDisableShadowInit: false + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 32 + GlobalReadVectorWidthB: 32 + GlobalReadVectorWidthMXSA: 12 + GlobalReadVectorWidthMXSB: 2 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + KRingShift: false + Kernel: true + KernelLanguage: Assembly + KernelNameMin: 
Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT384x64x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT12_2_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 256 + LSCB: 256 + LSCMXSA: 8 + LSCMXSB: 8 + LSPA: 32 + LSPB: 32 + LSPMXSA: 384 + LSPMXSB: 64 + LVCA: 8 + LVCB: 8 + LVCMXSA: 1 + LVCMXSB: 4 + LVPA: 1 + LVPB: 1 + LVPMXSA: 32 + LVPMXSB: 32 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsBlockSizePerPadMXSA: 0 + LdsBlockSizePerPadMXSB: 0 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 126464 + LdsInitCVgprs: false + LdsNumBytes: 126464 + LdsNumElementsAlignedA: 49152 + LdsNumElementsAlignedB: 8192 + LdsNumElementsAlignedMXSA: 3072 + LdsNumElementsAlignedMXSB: 512 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 52736 + LdsOffsetB_Blk: 118272 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMXSA: 49152 + LdsOffsetMXSA_Blk: 114688 + LdsOffsetMXSB: 52224 + LdsOffsetMXSB_Blk: 117760 + LdsOffsetMetadata: 52736 + LdsOffsetMetadata_Blk: 118272 + LdsPadA: 0 + LdsPadB: 0 + LdsPadMXSA: 0 + LdsPadMXSB: 0 + LdsPadMetadata: 0 + LocalReadVectorWidth: -1 + LocalReadVectorWidthA: 32 + LocalReadVectorWidthB: 32 + LocalReadVectorWidthMXSA: 1 + LocalReadVectorWidthMXSB: 1 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LocalWriteUseSgprMXSA: true + LocalWriteUseSgprMXSB: true + LoopIters: 2 + LoopUnroll: 256 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + 
MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMXSA: 1 + MIInputPerThreadMXSB: 1 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [12, 2] + MIWaveTileA: 12 + MIWaveTileB: 2 + MIWaveTileMXSA: 12 + MIWaveTileMXSB: 2 + MIWaveTileMetadata: 0 + MacroTile0: 384 + MacroTile1: 64 + MacroTileA: 384 + MacroTileB: 64 + MacroTileMXSA: 384 + MacroTileMXSB: 64 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + MinGRIncPerMfma: 3 + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonDTLTailLoopMXSA: true + NonDTLTailLoopMXSB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 4 + NonTemporalE: 0 + NonTemporalMXSA: 0 + NonTemporalMXSB: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 96 + NumGlobalWriteVectorsPerThread: 96 + NumLdsBlk: 2 + NumLoadsA: 12 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsCoalescedMXSA: 1 + NumLoadsCoalescedMXSB: 1 + NumLoadsMXSA: 1 + NumLoadsMXSB: 1 + NumLoadsPerpendicularA: 12 + NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularMXSA: 1 + NumLoadsPerpendicularMXSB: 1 + NumThreads: 256 + NumTotalPackedLoadsA: -1 + NumTotalPackedLoadsB: -1 + NumTotalPackedLoadsMXSA: -1 + NumTotalPackedLoadsMXSB: -1 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGROverBarrier: 0 + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 19 + 
SolutionNameMin: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT384x64x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT12_2_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM16_WGMXCC2_WGMXCCGn1 + SourceSwap: false + SpaceFillingAlgo: [] + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 4 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SubGroupMXSA: 8 + SubGroupMXSB: 32 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: 0 + TailloopInNll: false + ThreadTile: [1, 1] + ThreadTile0: 48 + ThreadTile1: 2 + ThreadTileA: 48 + ThreadTileB: 2 + ThreadTileMXSA: 48 + ThreadTileMXSB: 2 + TransposeLDS: 1 + TransposeLDSMetadata: -1 + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMXSA: true + UnrollMajorLDSMXSB: true + Use64bShadowLimit: false + Use64bShadowLimitMX: false + UseCustomMainLoopSchedule: 0 + UseDirect32XEmulation: false + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: false + UseGeneralizedNLCOneA: false + UseGeneralizedNLCOneB: false + UseGeneralizedNLCOneMXSA: false + UseGeneralizedNLCOneMXSB: false + UseGeneralizedNLCOneMetadata: false + UseInstOffsetForGRO: 0 + UseMFMAF32XEmulation: false + UsePLRPack: 0 + UseSgprForGRO: -1 + UseSubtileImpl: true + Valid: true + VectorStore: -1 + VectorWidthA: 1 + 
VectorWidthB: 1 + VectorWidthMXSA: 1 + VectorWidthMXSB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMXSA: 0 + WaveSeparateGlobalReadMXSB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 16 + WorkGroupMappingXCC: 2 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 256 + _DepthUA: 256 + _DepthUB: 256 + _DepthUMXSA: 8 + _DepthUMXSB: 8 + _DepthUMetadata: 256 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + enableLDSTrMXSA: false + enableLDSTrMXSB: false + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + tailLoopOptMXSA: false + tailLoopOptMXSB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AdaptiveGemm: 0 + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1DivByMT1LowbitGT1: 0 + AssertFree1ElementMultiple: 1 + AssertKRingShiftTailWrapOnly: 0 + AssertSummationElementMultiple: 256 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BAddrInterleave: false + BaseName: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT64ghBzLJyXgx9oTwgzYUUty2EUDAM9Q1qjBKNRjSDH1Qg= + BufferLoad: true + BufferStore: 1 + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 256 + DirectToLds: 1 + DirectToLdsA: true + DirectToLdsB: true + DirectToLdsMXSA: true + DirectToLdsMXSB: true + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprMXSA: false + DirectToVgprMXSB: false + DirectToVgprSparseMetadata: false + 
DtlPlusLdsBuf: 0 + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: false + ExpertSchedulingMode: 0 + ExtraLatencyForLR: 0 + ExtraMiLatencyLeft: -1 + ForceDisableShadowInit: false + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 32 + GlobalReadVectorWidthB: 32 + GlobalReadVectorWidthMXSA: 2 + GlobalReadVectorWidthMXSB: 16 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + KRingShift: false + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT64x512x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT2_16_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 256 + LSCB: 256 + LSCMXSA: 8 + LSCMXSB: 8 + LSPA: 32 + LSPB: 32 + LSPMXSA: 64 + LSPMXSB: 512 + LVCA: 8 + LVCB: 8 + LVCMXSA: 4 + LVCMXSB: 1 + LVPA: 1 + LVPB: 1 + LVPMXSA: 32 + LVPMXSB: 32 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsBlockSizePerPadMXSA: 0 + LdsBlockSizePerPadMXSB: 0 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 156672 + LdsInitCVgprs: false + LdsNumBytes: 156672 + 
LdsNumElementsAlignedA: 8192 + LdsNumElementsAlignedB: 65536 + LdsNumElementsAlignedMXSA: 512 + LdsNumElementsAlignedMXSB: 4096 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 78336 + LdsOffsetB: 12800 + LdsOffsetB_Blk: 91136 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMXSA: 8192 + LdsOffsetMXSA_Blk: 86528 + LdsOffsetMXSB: 8704 + LdsOffsetMXSB_Blk: 87040 + LdsOffsetMetadata: 12800 + LdsOffsetMetadata_Blk: 91136 + LdsPadA: 0 + LdsPadB: 0 + LdsPadMXSA: 0 + LdsPadMXSB: 0 + LdsPadMetadata: 0 + LocalReadVectorWidth: -1 + LocalReadVectorWidthA: 32 + LocalReadVectorWidthB: 32 + LocalReadVectorWidthMXSA: 1 + LocalReadVectorWidthMXSB: 1 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LocalWriteUseSgprMXSA: true + LocalWriteUseSgprMXSB: true + LoopIters: 2 + LoopUnroll: 256 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMXSA: 1 + MIInputPerThreadMXSB: 1 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [2, 16] + MIWaveTileA: 2 + MIWaveTileB: 16 + MIWaveTileMXSA: 2 + MIWaveTileMXSB: 16 + MIWaveTileMetadata: 0 + MacroTile0: 64 + MacroTile1: 512 + MacroTileA: 64 + MacroTileB: 512 + MacroTileMXSA: 64 + MacroTileMXSB: 512 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + MinGRIncPerMfma: 3 + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonDTLTailLoopMXSA: true + NonDTLTailLoopMXSB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 4 + NonTemporalE: 0 + 
NonTemporalMXSA: 0 + NonTemporalMXSB: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 128 + NumLdsBlk: 2 + NumLoadsA: 2 + NumLoadsB: 16 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsCoalescedMXSA: 1 + NumLoadsCoalescedMXSB: 1 + NumLoadsMXSA: 1 + NumLoadsMXSB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 16 + NumLoadsPerpendicularMXSA: 1 + NumLoadsPerpendicularMXSB: 1 + NumThreads: 256 + NumTotalPackedLoadsA: -1 + NumTotalPackedLoadsB: -1 + NumTotalPackedLoadsMXSA: -1 + NumTotalPackedLoadsMXSB: -1 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGROverBarrier: 0 + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 20 + SolutionNameMin: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT64x512x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT2_16_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM16_WGMXCC2_WGMXCCGn1 + SourceSwap: false + SpaceFillingAlgo: [] + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: true + StoreSyncOpt: 0 + StoreVectorWidth: 4 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + 
SubGroupMXSA: 8 + SubGroupMXSB: 32 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: 0 + TailloopInNll: false + ThreadTile: [1, 1] + ThreadTile0: 8 + ThreadTile1: 16 + ThreadTileA: 8 + ThreadTileB: 16 + ThreadTileMXSA: 8 + ThreadTileMXSB: 16 + TransposeLDS: 1 + TransposeLDSMetadata: -1 + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMXSA: true + UnrollMajorLDSMXSB: true + Use64bShadowLimit: false + Use64bShadowLimitMX: false + UseCustomMainLoopSchedule: 0 + UseDirect32XEmulation: false + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: false + UseGeneralizedNLCOneA: false + UseGeneralizedNLCOneB: false + UseGeneralizedNLCOneMXSA: false + UseGeneralizedNLCOneMXSB: false + UseGeneralizedNLCOneMetadata: false + UseInstOffsetForGRO: 0 + UseMFMAF32XEmulation: false + UsePLRPack: 0 + UseSgprForGRO: -1 + UseSubtileImpl: true + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 1 + VectorWidthMXSA: 1 + VectorWidthMXSB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMXSA: 0 + WaveSeparateGlobalReadMXSB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 16 + WorkGroupMappingXCC: 2 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 256 + _DepthUA: 256 + _DepthUB: 256 + _DepthUMXSA: 8 + _DepthUMXSB: 8 + _DepthUMetadata: 256 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + enableLDSTrMXSA: false + enableLDSTrMXSB: false + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + tailLoopOptMXSA: false + tailLoopOptMXSB: false + - 1LDSBuffer: 0 + 
ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AdaptiveGemm: 0 + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1DivByMT1LowbitGT1: 0 + AssertFree1ElementMultiple: 1 + AssertKRingShiftTailWrapOnly: 0 + AssertSummationElementMultiple: 256 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BAddrInterleave: false + BaseName: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT51N7xP8nenZXqX1PoWvPtuxa4rTpP69DwZBgRtHN8axrQ= + BufferLoad: true + BufferStore: 1 + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 256 + DirectToLds: 1 + DirectToLdsA: true + DirectToLdsB: true + DirectToLdsMXSA: true + DirectToLdsMXSB: true + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprMXSA: false + DirectToVgprMXSB: false + DirectToVgprSparseMetadata: false + DtlPlusLdsBuf: 0 + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: false + ExpertSchedulingMode: 0 + ExtraLatencyForLR: 0 + ExtraMiLatencyLeft: -1 + ForceDisableShadowInit: false + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 32 + GlobalReadVectorWidthB: 32 + GlobalReadVectorWidthMXSA: 16 + GlobalReadVectorWidthMXSB: 2 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + KRingShift: false + Kernel: true + KernelLanguage: Assembly + KernelNameMin: 
Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT512x64x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT16_2_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 256 + LSCB: 256 + LSCMXSA: 8 + LSCMXSB: 8 + LSPA: 32 + LSPB: 32 + LSPMXSA: 512 + LSPMXSB: 64 + LVCA: 8 + LVCB: 8 + LVCMXSA: 1 + LVCMXSB: 4 + LVPA: 1 + LVPB: 1 + LVPMXSA: 32 + LVPMXSB: 32 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsBlockSizePerPadMXSA: 0 + LdsBlockSizePerPadMXSB: 0 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 156672 + LdsInitCVgprs: false + LdsNumBytes: 156672 + LdsNumElementsAlignedA: 65536 + LdsNumElementsAlignedB: 8192 + LdsNumElementsAlignedMXSA: 4096 + LdsNumElementsAlignedMXSB: 512 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 78336 + LdsOffsetB: 70144 + LdsOffsetB_Blk: 148480 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMXSA: 65536 + LdsOffsetMXSA_Blk: 143872 + LdsOffsetMXSB: 69632 + LdsOffsetMXSB_Blk: 147968 + LdsOffsetMetadata: 70144 + LdsOffsetMetadata_Blk: 148480 + LdsPadA: 0 + LdsPadB: 0 + LdsPadMXSA: 0 + LdsPadMXSB: 0 + LdsPadMetadata: 0 + LocalReadVectorWidth: -1 + LocalReadVectorWidthA: 32 + LocalReadVectorWidthB: 32 + LocalReadVectorWidthMXSA: 1 + LocalReadVectorWidthMXSB: 1 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LocalWriteUseSgprMXSA: true + LocalWriteUseSgprMXSB: true + LoopIters: 2 + LoopUnroll: 256 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + 
MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMXSA: 1 + MIInputPerThreadMXSB: 1 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [16, 2] + MIWaveTileA: 16 + MIWaveTileB: 2 + MIWaveTileMXSA: 16 + MIWaveTileMXSB: 2 + MIWaveTileMetadata: 0 + MacroTile0: 512 + MacroTile1: 64 + MacroTileA: 512 + MacroTileB: 64 + MacroTileMXSA: 512 + MacroTileMXSB: 64 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + MinGRIncPerMfma: 3 + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonDTLTailLoopMXSA: true + NonDTLTailLoopMXSB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 4 + NonTemporalE: 0 + NonTemporalMXSA: 0 + NonTemporalMXSB: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 128 + NumLdsBlk: 2 + NumLoadsA: 16 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsCoalescedMXSA: 1 + NumLoadsCoalescedMXSB: 1 + NumLoadsMXSA: 1 + NumLoadsMXSB: 1 + NumLoadsPerpendicularA: 16 + NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularMXSA: 1 + NumLoadsPerpendicularMXSB: 1 + NumThreads: 256 + NumTotalPackedLoadsA: -1 + NumTotalPackedLoadsB: -1 + NumTotalPackedLoadsMXSA: -1 + NumTotalPackedLoadsMXSB: -1 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGROverBarrier: 0 + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 21 + 
SolutionNameMin: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT512x64x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT16_2_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM16_WGMXCC2_WGMXCCGn1 + SourceSwap: false + SpaceFillingAlgo: [] + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: true + StoreSyncOpt: 0 + StoreVectorWidth: 4 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SubGroupMXSA: 8 + SubGroupMXSB: 32 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: 0 + TailloopInNll: false + ThreadTile: [1, 1] + ThreadTile0: 64 + ThreadTile1: 2 + ThreadTileA: 64 + ThreadTileB: 2 + ThreadTileMXSA: 64 + ThreadTileMXSB: 2 + TransposeLDS: 1 + TransposeLDSMetadata: -1 + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMXSA: true + UnrollMajorLDSMXSB: true + Use64bShadowLimit: false + Use64bShadowLimitMX: false + UseCustomMainLoopSchedule: 0 + UseDirect32XEmulation: false + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: false + UseGeneralizedNLCOneA: false + UseGeneralizedNLCOneB: false + UseGeneralizedNLCOneMXSA: false + UseGeneralizedNLCOneMXSB: false + UseGeneralizedNLCOneMetadata: false + UseInstOffsetForGRO: 0 + UseMFMAF32XEmulation: false + UsePLRPack: 0 + UseSgprForGRO: -1 + UseSubtileImpl: true + Valid: true + VectorStore: -1 + VectorWidthA: 1 + 
VectorWidthB: 1 + VectorWidthMXSA: 1 + VectorWidthMXSB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMXSA: 0 + WaveSeparateGlobalReadMXSB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 16 + WorkGroupMappingXCC: 2 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 256 + _DepthUA: 256 + _DepthUB: 256 + _DepthUMXSA: 8 + _DepthUMXSB: 8 + _DepthUMetadata: 256 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + enableLDSTrMXSA: false + enableLDSTrMXSB: false + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + tailLoopOptMXSA: false + tailLoopOptMXSB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AdaptiveGemm: 0 + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1DivByMT1LowbitGT1: 0 + AssertFree1ElementMultiple: 1 + AssertKRingShiftTailWrapOnly: 0 + AssertSummationElementMultiple: 256 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BAddrInterleave: false + BaseName: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT12Tzx41lFmgkHO5_GDlXlBjYPnuHfWnU6iLujoAD27WHo= + BufferLoad: true + BufferStore: 1 + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 256 + DirectToLds: 1 + DirectToLdsA: true + DirectToLdsB: true + DirectToLdsMXSA: true + DirectToLdsMXSB: true + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprMXSA: false + DirectToVgprMXSB: false + DirectToVgprSparseMetadata: false + 
DtlPlusLdsBuf: 0 + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: false + ExpertSchedulingMode: 0 + ExtraLatencyForLR: 0 + ExtraMiLatencyLeft: -1 + ForceDisableShadowInit: false + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 32 + GlobalReadVectorWidthB: 32 + GlobalReadVectorWidthMXSA: 4 + GlobalReadVectorWidthMXSB: 10 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + KRingShift: false + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT128x320x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT4_10_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 256 + LSCB: 256 + LSCMXSA: 8 + LSCMXSB: 8 + LSPA: 32 + LSPB: 32 + LSPMXSA: 128 + LSPMXSB: 320 + LVCA: 8 + LVCB: 8 + LVCMXSA: 2 + LVCMXSB: 1 + LVPA: 1 + LVPB: 1 + LVPMXSA: 32 + LVPMXSB: 32 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsBlockSizePerPadMXSA: 0 + LdsBlockSizePerPadMXSB: 0 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 126464 + LdsInitCVgprs: false + LdsNumBytes: 126464 + 
LdsNumElementsAlignedA: 16384 + LdsNumElementsAlignedB: 40960 + LdsNumElementsAlignedMXSA: 1024 + LdsNumElementsAlignedMXSB: 2560 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 19968 + LdsOffsetB_Blk: 85504 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMXSA: 16384 + LdsOffsetMXSA_Blk: 81920 + LdsOffsetMXSB: 17408 + LdsOffsetMXSB_Blk: 82944 + LdsOffsetMetadata: 19968 + LdsOffsetMetadata_Blk: 85504 + LdsPadA: 0 + LdsPadB: 0 + LdsPadMXSA: 0 + LdsPadMXSB: 0 + LdsPadMetadata: 0 + LocalReadVectorWidth: -1 + LocalReadVectorWidthA: 32 + LocalReadVectorWidthB: 32 + LocalReadVectorWidthMXSA: 1 + LocalReadVectorWidthMXSB: 1 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LocalWriteUseSgprMXSA: true + LocalWriteUseSgprMXSB: true + LoopIters: 2 + LoopUnroll: 256 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMXSA: 1 + MIInputPerThreadMXSB: 1 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 10] + MIWaveTileA: 4 + MIWaveTileB: 10 + MIWaveTileMXSA: 4 + MIWaveTileMXSB: 10 + MIWaveTileMetadata: 0 + MacroTile0: 128 + MacroTile1: 320 + MacroTileA: 128 + MacroTileB: 320 + MacroTileMXSA: 128 + MacroTileMXSB: 320 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + MinGRIncPerMfma: 3 + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonDTLTailLoopMXSA: true + NonDTLTailLoopMXSB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 4 + 
NonTemporalE: 0 + NonTemporalMXSA: 0 + NonTemporalMXSB: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 160 + NumGlobalWriteVectorsPerThread: 160 + NumLdsBlk: 2 + NumLoadsA: 4 + NumLoadsB: 10 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsCoalescedMXSA: 1 + NumLoadsCoalescedMXSB: 1 + NumLoadsMXSA: 1 + NumLoadsMXSB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 10 + NumLoadsPerpendicularMXSA: 1 + NumLoadsPerpendicularMXSB: 1 + NumThreads: 256 + NumTotalPackedLoadsA: -1 + NumTotalPackedLoadsB: -1 + NumTotalPackedLoadsMXSA: -1 + NumTotalPackedLoadsMXSB: -1 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGROverBarrier: 0 + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 22 + SolutionNameMin: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT128x320x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT4_10_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM16_WGMXCC2_WGMXCCGn1 + SourceSwap: false + SpaceFillingAlgo: [] + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 4 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + 
SubGroupB: 32 + SubGroupMXSA: 8 + SubGroupMXSB: 32 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: 0 + TailloopInNll: false + ThreadTile: [1, 1] + ThreadTile0: 16 + ThreadTile1: 10 + ThreadTileA: 16 + ThreadTileB: 10 + ThreadTileMXSA: 16 + ThreadTileMXSB: 10 + TransposeLDS: 1 + TransposeLDSMetadata: -1 + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMXSA: true + UnrollMajorLDSMXSB: true + Use64bShadowLimit: false + Use64bShadowLimitMX: false + UseCustomMainLoopSchedule: 0 + UseDirect32XEmulation: false + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: false + UseGeneralizedNLCOneA: false + UseGeneralizedNLCOneB: false + UseGeneralizedNLCOneMXSA: false + UseGeneralizedNLCOneMXSB: false + UseGeneralizedNLCOneMetadata: false + UseInstOffsetForGRO: 0 + UseMFMAF32XEmulation: false + UsePLRPack: 0 + UseSgprForGRO: -1 + UseSubtileImpl: true + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 1 + VectorWidthMXSA: 1 + VectorWidthMXSB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMXSA: 0 + WaveSeparateGlobalReadMXSB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 16 + WorkGroupMappingXCC: 2 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 256 + _DepthUA: 256 + _DepthUB: 256 + _DepthUMXSA: 8 + _DepthUMXSB: 8 + _DepthUMetadata: 256 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + enableLDSTrMXSA: false + enableLDSTrMXSB: false + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + tailLoopOptMXSA: false + tailLoopOptMXSB: false + - 
1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AdaptiveGemm: 0 + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1DivByMT1LowbitGT1: 0 + AssertFree1ElementMultiple: 1 + AssertKRingShiftTailWrapOnly: 0 + AssertSummationElementMultiple: 256 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BAddrInterleave: false + BaseName: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT329TK1O9xqAG2cUdkJ_6Jkg-kbMKNtFWKGnhC0gVHbf3c= + BufferLoad: true + BufferStore: 1 + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 256 + DirectToLds: 1 + DirectToLdsA: true + DirectToLdsB: true + DirectToLdsMXSA: true + DirectToLdsMXSB: true + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprMXSA: false + DirectToVgprMXSB: false + DirectToVgprSparseMetadata: false + DtlPlusLdsBuf: 0 + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: false + ExpertSchedulingMode: 0 + ExtraLatencyForLR: 0 + ExtraMiLatencyLeft: -1 + ForceDisableShadowInit: false + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 32 + GlobalReadVectorWidthB: 32 + GlobalReadVectorWidthMXSA: 10 + GlobalReadVectorWidthMXSB: 4 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + KRingShift: false + Kernel: true + KernelLanguage: Assembly + KernelNameMin: 
Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT320x128x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT10_4_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 256 + LSCB: 256 + LSCMXSA: 8 + LSCMXSB: 8 + LSPA: 32 + LSPB: 32 + LSPMXSA: 320 + LSPMXSB: 128 + LVCA: 8 + LVCB: 8 + LVCMXSA: 1 + LVCMXSB: 2 + LVPA: 1 + LVPB: 1 + LVPMXSA: 32 + LVPMXSB: 32 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsBlockSizePerPadMXSA: 0 + LdsBlockSizePerPadMXSB: 0 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 126464 + LdsInitCVgprs: false + LdsNumBytes: 126464 + LdsNumElementsAlignedA: 40960 + LdsNumElementsAlignedB: 16384 + LdsNumElementsAlignedMXSA: 2560 + LdsNumElementsAlignedMXSB: 1024 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 44544 + LdsOffsetB_Blk: 110080 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMXSA: 40960 + LdsOffsetMXSA_Blk: 106496 + LdsOffsetMXSB: 43520 + LdsOffsetMXSB_Blk: 109056 + LdsOffsetMetadata: 44544 + LdsOffsetMetadata_Blk: 110080 + LdsPadA: 0 + LdsPadB: 0 + LdsPadMXSA: 0 + LdsPadMXSB: 0 + LdsPadMetadata: 0 + LocalReadVectorWidth: -1 + LocalReadVectorWidthA: 32 + LocalReadVectorWidthB: 32 + LocalReadVectorWidthMXSA: 1 + LocalReadVectorWidthMXSB: 1 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LocalWriteUseSgprMXSA: true + LocalWriteUseSgprMXSB: true + LoopIters: 2 + LoopUnroll: 256 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + 
MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMXSA: 1 + MIInputPerThreadMXSB: 1 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [10, 4] + MIWaveTileA: 10 + MIWaveTileB: 4 + MIWaveTileMXSA: 10 + MIWaveTileMXSB: 4 + MIWaveTileMetadata: 0 + MacroTile0: 320 + MacroTile1: 128 + MacroTileA: 320 + MacroTileB: 128 + MacroTileMXSA: 320 + MacroTileMXSB: 128 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + MinGRIncPerMfma: 3 + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonDTLTailLoopMXSA: true + NonDTLTailLoopMXSB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 4 + NonTemporalE: 0 + NonTemporalMXSA: 0 + NonTemporalMXSB: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 160 + NumGlobalWriteVectorsPerThread: 160 + NumLdsBlk: 2 + NumLoadsA: 10 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsCoalescedMXSA: 1 + NumLoadsCoalescedMXSB: 1 + NumLoadsMXSA: 1 + NumLoadsMXSB: 1 + NumLoadsPerpendicularA: 10 + NumLoadsPerpendicularB: 4 + NumLoadsPerpendicularMXSA: 1 + NumLoadsPerpendicularMXSB: 1 + NumThreads: 256 + NumTotalPackedLoadsA: -1 + NumTotalPackedLoadsB: -1 + NumTotalPackedLoadsMXSA: -1 + NumTotalPackedLoadsMXSB: -1 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGROverBarrier: 0 + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 23 + 
SolutionNameMin: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT320x128x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT10_4_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM16_WGMXCC2_WGMXCCGn1 + SourceSwap: false + SpaceFillingAlgo: [] + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 4 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SubGroupMXSA: 8 + SubGroupMXSB: 32 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: 0 + TailloopInNll: false + ThreadTile: [1, 1] + ThreadTile0: 40 + ThreadTile1: 4 + ThreadTileA: 40 + ThreadTileB: 4 + ThreadTileMXSA: 40 + ThreadTileMXSB: 4 + TransposeLDS: 1 + TransposeLDSMetadata: -1 + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMXSA: true + UnrollMajorLDSMXSB: true + Use64bShadowLimit: false + Use64bShadowLimitMX: false + UseCustomMainLoopSchedule: 0 + UseDirect32XEmulation: false + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: false + UseGeneralizedNLCOneA: false + UseGeneralizedNLCOneB: false + UseGeneralizedNLCOneMXSA: false + UseGeneralizedNLCOneMXSB: false + UseGeneralizedNLCOneMetadata: false + UseInstOffsetForGRO: 0 + UseMFMAF32XEmulation: false + UsePLRPack: 0 + UseSgprForGRO: -1 + UseSubtileImpl: true + Valid: true + VectorStore: -1 + VectorWidthA: 1 + 
VectorWidthB: 1 + VectorWidthMXSA: 1 + VectorWidthMXSB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMXSA: 0 + WaveSeparateGlobalReadMXSB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 16 + WorkGroupMappingXCC: 2 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 256 + _DepthUA: 256 + _DepthUB: 256 + _DepthUMXSA: 8 + _DepthUMXSB: 8 + _DepthUMetadata: 256 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + enableLDSTrMXSA: false + enableLDSTrMXSB: false + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + tailLoopOptMXSA: false + tailLoopOptMXSB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AdaptiveGemm: 0 + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1DivByMT1LowbitGT1: 0 + AssertFree1ElementMultiple: 1 + AssertKRingShiftTailWrapOnly: 0 + AssertSummationElementMultiple: 256 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BAddrInterleave: false + BaseName: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT12Ko14x9Bm17skxIK2J-9eSMhxZf1WQTws5W6lZ4ij_RU= + BufferLoad: true + BufferStore: 1 + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 256 + DirectToLds: 1 + DirectToLdsA: true + DirectToLdsB: true + DirectToLdsMXSA: true + DirectToLdsMXSB: true + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprMXSA: false + DirectToVgprMXSB: false + DirectToVgprSparseMetadata: false + 
DtlPlusLdsBuf: 0 + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: false + ExpertSchedulingMode: 0 + ExtraLatencyForLR: 0 + ExtraMiLatencyLeft: -1 + ForceDisableShadowInit: false + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 32 + GlobalReadVectorWidthB: 32 + GlobalReadVectorWidthMXSA: 4 + GlobalReadVectorWidthMXSB: 12 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + KRingShift: false + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT128x384x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT4_12_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 256 + LSCB: 256 + LSCMXSA: 8 + LSCMXSB: 8 + LSPA: 32 + LSPB: 32 + LSPMXSA: 128 + LSPMXSB: 384 + LVCA: 8 + LVCB: 8 + LVCMXSA: 2 + LVCMXSB: 1 + LVPA: 1 + LVPB: 1 + LVPMXSA: 32 + LVPMXSB: 32 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsBlockSizePerPadMXSA: 0 + LdsBlockSizePerPadMXSB: 0 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 139264 + LdsInitCVgprs: false + LdsNumBytes: 139264 + 
LdsNumElementsAlignedA: 16384 + LdsNumElementsAlignedB: 49152 + LdsNumElementsAlignedMXSA: 1024 + LdsNumElementsAlignedMXSB: 3072 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 69632 + LdsOffsetB: 20480 + LdsOffsetB_Blk: 90112 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMXSA: 16384 + LdsOffsetMXSA_Blk: 86016 + LdsOffsetMXSB: 17408 + LdsOffsetMXSB_Blk: 87040 + LdsOffsetMetadata: 20480 + LdsOffsetMetadata_Blk: 90112 + LdsPadA: 0 + LdsPadB: 0 + LdsPadMXSA: 0 + LdsPadMXSB: 0 + LdsPadMetadata: 0 + LocalReadVectorWidth: -1 + LocalReadVectorWidthA: 32 + LocalReadVectorWidthB: 32 + LocalReadVectorWidthMXSA: 1 + LocalReadVectorWidthMXSB: 1 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LocalWriteUseSgprMXSA: true + LocalWriteUseSgprMXSB: true + LoopIters: 2 + LoopUnroll: 256 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMXSA: 1 + MIInputPerThreadMXSB: 1 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 12] + MIWaveTileA: 4 + MIWaveTileB: 12 + MIWaveTileMXSA: 4 + MIWaveTileMXSB: 12 + MIWaveTileMetadata: 0 + MacroTile0: 128 + MacroTile1: 384 + MacroTileA: 128 + MacroTileB: 384 + MacroTileMXSA: 128 + MacroTileMXSB: 384 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + MinGRIncPerMfma: 3 + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonDTLTailLoopMXSA: true + NonDTLTailLoopMXSB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 4 + 
NonTemporalE: 0 + NonTemporalMXSA: 0 + NonTemporalMXSB: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 192 + NumGlobalWriteVectorsPerThread: 192 + NumLdsBlk: 2 + NumLoadsA: 4 + NumLoadsB: 12 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsCoalescedMXSA: 1 + NumLoadsCoalescedMXSB: 1 + NumLoadsMXSA: 1 + NumLoadsMXSB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 12 + NumLoadsPerpendicularMXSA: 1 + NumLoadsPerpendicularMXSB: 1 + NumThreads: 256 + NumTotalPackedLoadsA: -1 + NumTotalPackedLoadsB: -1 + NumTotalPackedLoadsMXSA: -1 + NumTotalPackedLoadsMXSB: -1 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGROverBarrier: 0 + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 24 + SolutionNameMin: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT128x384x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT4_12_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM16_WGMXCC2_WGMXCCGn1 + SourceSwap: false + SpaceFillingAlgo: [] + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: true + StoreSyncOpt: 0 + StoreVectorWidth: 4 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + 
SubGroupB: 32 + SubGroupMXSA: 8 + SubGroupMXSB: 32 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: 0 + TailloopInNll: false + ThreadTile: [1, 1] + ThreadTile0: 16 + ThreadTile1: 12 + ThreadTileA: 16 + ThreadTileB: 12 + ThreadTileMXSA: 16 + ThreadTileMXSB: 12 + TransposeLDS: 1 + TransposeLDSMetadata: -1 + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMXSA: true + UnrollMajorLDSMXSB: true + Use64bShadowLimit: false + Use64bShadowLimitMX: false + UseCustomMainLoopSchedule: 0 + UseDirect32XEmulation: false + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: false + UseGeneralizedNLCOneA: false + UseGeneralizedNLCOneB: false + UseGeneralizedNLCOneMXSA: false + UseGeneralizedNLCOneMXSB: false + UseGeneralizedNLCOneMetadata: false + UseInstOffsetForGRO: 0 + UseMFMAF32XEmulation: false + UsePLRPack: 0 + UseSgprForGRO: -1 + UseSubtileImpl: true + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 1 + VectorWidthMXSA: 1 + VectorWidthMXSB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMXSA: 0 + WaveSeparateGlobalReadMXSB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 16 + WorkGroupMappingXCC: 2 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 256 + _DepthUA: 256 + _DepthUB: 256 + _DepthUMXSA: 8 + _DepthUMXSB: 8 + _DepthUMetadata: 256 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + enableLDSTrMXSA: false + enableLDSTrMXSB: false + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + tailLoopOptMXSA: false + tailLoopOptMXSB: false + - 
1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AdaptiveGemm: 0 + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1DivByMT1LowbitGT1: 0 + AssertFree1ElementMultiple: 1 + AssertKRingShiftTailWrapOnly: 0 + AssertSummationElementMultiple: 256 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BAddrInterleave: false + BaseName: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT38PfCBP4cNoa67MDHKbRKQKP0baWWAor2QL1_1-ioSnxM= + BufferLoad: true + BufferStore: 1 + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 256 + DirectToLds: 1 + DirectToLdsA: true + DirectToLdsB: true + DirectToLdsMXSA: true + DirectToLdsMXSB: true + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprMXSA: false + DirectToVgprMXSB: false + DirectToVgprSparseMetadata: false + DtlPlusLdsBuf: 0 + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: false + ExpertSchedulingMode: 0 + ExtraLatencyForLR: 0 + ExtraMiLatencyLeft: -1 + ForceDisableShadowInit: false + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 32 + GlobalReadVectorWidthB: 32 + GlobalReadVectorWidthMXSA: 12 + GlobalReadVectorWidthMXSB: 4 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + KRingShift: false + Kernel: true + KernelLanguage: Assembly + KernelNameMin: 
Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT384x128x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT12_4_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 256 + LSCB: 256 + LSCMXSA: 8 + LSCMXSB: 8 + LSPA: 32 + LSPB: 32 + LSPMXSA: 384 + LSPMXSB: 128 + LVCA: 8 + LVCB: 8 + LVCMXSA: 1 + LVCMXSB: 2 + LVPA: 1 + LVPB: 1 + LVPMXSA: 32 + LVPMXSB: 32 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsBlockSizePerPadMXSA: 0 + LdsBlockSizePerPadMXSB: 0 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 139264 + LdsInitCVgprs: false + LdsNumBytes: 139264 + LdsNumElementsAlignedA: 49152 + LdsNumElementsAlignedB: 16384 + LdsNumElementsAlignedMXSA: 3072 + LdsNumElementsAlignedMXSB: 1024 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 69632 + LdsOffsetB: 53248 + LdsOffsetB_Blk: 122880 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMXSA: 49152 + LdsOffsetMXSA_Blk: 118784 + LdsOffsetMXSB: 52224 + LdsOffsetMXSB_Blk: 121856 + LdsOffsetMetadata: 53248 + LdsOffsetMetadata_Blk: 122880 + LdsPadA: 0 + LdsPadB: 0 + LdsPadMXSA: 0 + LdsPadMXSB: 0 + LdsPadMetadata: 0 + LocalReadVectorWidth: -1 + LocalReadVectorWidthA: 32 + LocalReadVectorWidthB: 32 + LocalReadVectorWidthMXSA: 1 + LocalReadVectorWidthMXSB: 1 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LocalWriteUseSgprMXSA: true + LocalWriteUseSgprMXSB: true + LoopIters: 2 + LoopUnroll: 256 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + 
MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMXSA: 1 + MIInputPerThreadMXSB: 1 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [12, 4] + MIWaveTileA: 12 + MIWaveTileB: 4 + MIWaveTileMXSA: 12 + MIWaveTileMXSB: 4 + MIWaveTileMetadata: 0 + MacroTile0: 384 + MacroTile1: 128 + MacroTileA: 384 + MacroTileB: 128 + MacroTileMXSA: 384 + MacroTileMXSB: 128 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + MinGRIncPerMfma: 3 + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonDTLTailLoopMXSA: true + NonDTLTailLoopMXSB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 4 + NonTemporalE: 0 + NonTemporalMXSA: 0 + NonTemporalMXSB: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 192 + NumGlobalWriteVectorsPerThread: 192 + NumLdsBlk: 2 + NumLoadsA: 12 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsCoalescedMXSA: 1 + NumLoadsCoalescedMXSB: 1 + NumLoadsMXSA: 1 + NumLoadsMXSB: 1 + NumLoadsPerpendicularA: 12 + NumLoadsPerpendicularB: 4 + NumLoadsPerpendicularMXSA: 1 + NumLoadsPerpendicularMXSB: 1 + NumThreads: 256 + NumTotalPackedLoadsA: -1 + NumTotalPackedLoadsB: -1 + NumTotalPackedLoadsMXSA: -1 + NumTotalPackedLoadsMXSB: -1 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGROverBarrier: 0 + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 25 + 
SolutionNameMin: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT384x128x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT12_4_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM16_WGMXCC2_WGMXCCGn1 + SourceSwap: false + SpaceFillingAlgo: [] + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: true + StoreSyncOpt: 0 + StoreVectorWidth: 4 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SubGroupMXSA: 8 + SubGroupMXSB: 32 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: 0 + TailloopInNll: false + ThreadTile: [1, 1] + ThreadTile0: 48 + ThreadTile1: 4 + ThreadTileA: 48 + ThreadTileB: 4 + ThreadTileMXSA: 48 + ThreadTileMXSB: 4 + TransposeLDS: 1 + TransposeLDSMetadata: -1 + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMXSA: true + UnrollMajorLDSMXSB: true + Use64bShadowLimit: false + Use64bShadowLimitMX: false + UseCustomMainLoopSchedule: 0 + UseDirect32XEmulation: false + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: false + UseGeneralizedNLCOneA: false + UseGeneralizedNLCOneB: false + UseGeneralizedNLCOneMXSA: false + UseGeneralizedNLCOneMXSB: false + UseGeneralizedNLCOneMetadata: false + UseInstOffsetForGRO: 0 + UseMFMAF32XEmulation: false + UsePLRPack: 0 + UseSgprForGRO: -1 + UseSubtileImpl: true + Valid: true + VectorStore: -1 + VectorWidthA: 1 + 
VectorWidthB: 1 + VectorWidthMXSA: 1 + VectorWidthMXSB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMXSA: 0 + WaveSeparateGlobalReadMXSB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 16 + WorkGroupMappingXCC: 2 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 256 + _DepthUA: 256 + _DepthUB: 256 + _DepthUMXSA: 8 + _DepthUMXSB: 8 + _DepthUMetadata: 256 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + enableLDSTrMXSA: false + enableLDSTrMXSB: false + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + tailLoopOptMXSA: false + tailLoopOptMXSB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AdaptiveGemm: 0 + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1DivByMT1LowbitGT1: 0 + AssertFree1ElementMultiple: 1 + AssertKRingShiftTailWrapOnly: 0 + AssertSummationElementMultiple: 256 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BAddrInterleave: false + BaseName: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT32GHMtx7cclUEZq8HXp3EV83TsDigTCOy4HpQzcwTdNNQ= + BufferLoad: true + BufferStore: 1 + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 256 + DirectToLds: 1 + DirectToLdsA: true + DirectToLdsB: true + DirectToLdsMXSA: true + DirectToLdsMXSB: true + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprMXSA: false + DirectToVgprMXSB: false + DirectToVgprSparseMetadata: false + 
DtlPlusLdsBuf: 0 + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: false + ExpertSchedulingMode: 0 + ExtraLatencyForLR: 0 + ExtraMiLatencyLeft: -1 + ForceDisableShadowInit: false + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 32 + GlobalReadVectorWidthB: 32 + GlobalReadVectorWidthMXSA: 10 + GlobalReadVectorWidthMXSB: 4 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + KRingShift: false + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT320x256x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT10_8_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 256 + LSCB: 256 + LSCMXSA: 8 + LSCMXSB: 8 + LSPA: 32 + LSPB: 32 + LSPMXSA: 320 + LSPMXSB: 128 + LVCA: 8 + LVCB: 8 + LVCMXSA: 1 + LVCMXSB: 2 + LVPA: 1 + LVPB: 1 + LVPMXSA: 32 + LVPMXSB: 32 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsBlockSizePerPadMXSA: 0 + LdsBlockSizePerPadMXSB: 0 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 156672 + LdsInitCVgprs: false + LdsNumBytes: 156672 + 
LdsNumElementsAlignedA: 40960 + LdsNumElementsAlignedB: 32768 + LdsNumElementsAlignedMXSA: 2560 + LdsNumElementsAlignedMXSB: 2048 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 78336 + LdsOffsetB: 45568 + LdsOffsetB_Blk: 123904 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMXSA: 40960 + LdsOffsetMXSA_Blk: 119296 + LdsOffsetMXSB: 43520 + LdsOffsetMXSB_Blk: 121856 + LdsOffsetMetadata: 45568 + LdsOffsetMetadata_Blk: 123904 + LdsPadA: 0 + LdsPadB: 0 + LdsPadMXSA: 0 + LdsPadMXSB: 0 + LdsPadMetadata: 0 + LocalReadVectorWidth: -1 + LocalReadVectorWidthA: 32 + LocalReadVectorWidthB: 32 + LocalReadVectorWidthMXSA: 1 + LocalReadVectorWidthMXSB: 1 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LocalWriteUseSgprMXSA: true + LocalWriteUseSgprMXSB: true + LoopIters: 2 + LoopUnroll: 256 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMXSA: 1 + MIInputPerThreadMXSB: 1 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [10, 8] + MIWaveTileA: 10 + MIWaveTileB: 8 + MIWaveTileMXSA: 10 + MIWaveTileMXSB: 8 + MIWaveTileMetadata: 0 + MacroTile0: 320 + MacroTile1: 256 + MacroTileA: 320 + MacroTileB: 256 + MacroTileMXSA: 320 + MacroTileMXSB: 256 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + MinGRIncPerMfma: 3 + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonDTLTailLoopMXSA: true + NonDTLTailLoopMXSB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 4 + 
NonTemporalE: 0 + NonTemporalMXSA: 0 + NonTemporalMXSB: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 320 + NumGlobalWriteVectorsPerThread: 320 + NumLdsBlk: 2 + NumLoadsA: 10 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsCoalescedMXSA: 1 + NumLoadsCoalescedMXSB: 1 + NumLoadsMXSA: 1 + NumLoadsMXSB: 2 + NumLoadsPerpendicularA: 10 + NumLoadsPerpendicularB: 8 + NumLoadsPerpendicularMXSA: 1 + NumLoadsPerpendicularMXSB: 2 + NumThreads: 256 + NumTotalPackedLoadsA: -1 + NumTotalPackedLoadsB: -1 + NumTotalPackedLoadsMXSA: -1 + NumTotalPackedLoadsMXSB: -1 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGROverBarrier: 0 + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 26 + SolutionNameMin: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT320x256x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT10_8_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM16_WGMXCC2_WGMXCCGn1 + SourceSwap: false + SpaceFillingAlgo: [] + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: true + StoreSyncOpt: 0 + StoreVectorWidth: 4 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + 
SubGroupB: 32 + SubGroupMXSA: 8 + SubGroupMXSB: 32 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: 0 + TailloopInNll: false + ThreadTile: [1, 1] + ThreadTile0: 40 + ThreadTile1: 8 + ThreadTileA: 40 + ThreadTileB: 8 + ThreadTileMXSA: 40 + ThreadTileMXSB: 8 + TransposeLDS: 1 + TransposeLDSMetadata: -1 + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMXSA: true + UnrollMajorLDSMXSB: true + Use64bShadowLimit: false + Use64bShadowLimitMX: false + UseCustomMainLoopSchedule: 0 + UseDirect32XEmulation: false + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: false + UseGeneralizedNLCOneA: false + UseGeneralizedNLCOneB: false + UseGeneralizedNLCOneMXSA: false + UseGeneralizedNLCOneMXSB: false + UseGeneralizedNLCOneMetadata: false + UseInstOffsetForGRO: 0 + UseMFMAF32XEmulation: false + UsePLRPack: 0 + UseSgprForGRO: -1 + UseSubtileImpl: true + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 1 + VectorWidthMXSA: 1 + VectorWidthMXSB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMXSA: 0 + WaveSeparateGlobalReadMXSB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 16 + WorkGroupMappingXCC: 2 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 256 + _DepthUA: 256 + _DepthUB: 256 + _DepthUMXSA: 8 + _DepthUMXSB: 8 + _DepthUMetadata: 256 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + enableLDSTrMXSA: false + enableLDSTrMXSB: false + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + tailLoopOptMXSA: false + tailLoopOptMXSB: false + - 1LDSBuffer: 
0 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AdaptiveGemm: 0 + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1DivByMT1LowbitGT1: 0 + AssertFree1ElementMultiple: 1 + AssertKRingShiftTailWrapOnly: 0 + AssertSummationElementMultiple: 256 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BAddrInterleave: false + BaseName: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT25yENsNJL4B9Vb5GlLD7ofBX9eYUJtt3JL_VzKdaPOs5g= + BufferLoad: true + BufferStore: 1 + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 256 + DirectToLds: 1 + DirectToLdsA: true + DirectToLdsB: true + DirectToLdsMXSA: true + DirectToLdsMXSB: true + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprMXSA: false + DirectToVgprMXSB: false + DirectToVgprSparseMetadata: false + DtlPlusLdsBuf: 0 + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: false + ExpertSchedulingMode: 0 + ExtraLatencyForLR: 0 + ExtraMiLatencyLeft: -1 + ForceDisableShadowInit: false + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 32 + GlobalReadVectorWidthB: 32 + GlobalReadVectorWidthMXSA: 4 + GlobalReadVectorWidthMXSB: 10 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + KRingShift: false + Kernel: true + KernelLanguage: Assembly + KernelNameMin: 
Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT256x320x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT8_10_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 256 + LSCB: 256 + LSCMXSA: 8 + LSCMXSB: 8 + LSPA: 32 + LSPB: 32 + LSPMXSA: 128 + LSPMXSB: 320 + LVCA: 8 + LVCB: 8 + LVCMXSA: 2 + LVCMXSB: 1 + LVPA: 1 + LVPB: 1 + LVPMXSA: 32 + LVPMXSB: 32 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsBlockSizePerPadMXSA: 0 + LdsBlockSizePerPadMXSB: 0 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 156672 + LdsInitCVgprs: false + LdsNumBytes: 156672 + LdsNumElementsAlignedA: 32768 + LdsNumElementsAlignedB: 40960 + LdsNumElementsAlignedMXSA: 2048 + LdsNumElementsAlignedMXSB: 2560 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 78336 + LdsOffsetB: 37376 + LdsOffsetB_Blk: 115712 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMXSA: 32768 + LdsOffsetMXSA_Blk: 111104 + LdsOffsetMXSB: 34816 + LdsOffsetMXSB_Blk: 113152 + LdsOffsetMetadata: 37376 + LdsOffsetMetadata_Blk: 115712 + LdsPadA: 0 + LdsPadB: 0 + LdsPadMXSA: 0 + LdsPadMXSB: 0 + LdsPadMetadata: 0 + LocalReadVectorWidth: -1 + LocalReadVectorWidthA: 32 + LocalReadVectorWidthB: 32 + LocalReadVectorWidthMXSA: 1 + LocalReadVectorWidthMXSB: 1 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LocalWriteUseSgprMXSA: true + LocalWriteUseSgprMXSB: true + LoopIters: 2 + LoopUnroll: 256 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + 
MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMXSA: 1 + MIInputPerThreadMXSB: 1 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [8, 10] + MIWaveTileA: 8 + MIWaveTileB: 10 + MIWaveTileMXSA: 8 + MIWaveTileMXSB: 10 + MIWaveTileMetadata: 0 + MacroTile0: 256 + MacroTile1: 320 + MacroTileA: 256 + MacroTileB: 320 + MacroTileMXSA: 256 + MacroTileMXSB: 320 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + MinGRIncPerMfma: 3 + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonDTLTailLoopMXSA: true + NonDTLTailLoopMXSB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 4 + NonTemporalE: 0 + NonTemporalMXSA: 0 + NonTemporalMXSB: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 320 + NumGlobalWriteVectorsPerThread: 320 + NumLdsBlk: 2 + NumLoadsA: 8 + NumLoadsB: 10 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsCoalescedMXSA: 1 + NumLoadsCoalescedMXSB: 1 + NumLoadsMXSA: 2 + NumLoadsMXSB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 10 + NumLoadsPerpendicularMXSA: 2 + NumLoadsPerpendicularMXSB: 1 + NumThreads: 256 + NumTotalPackedLoadsA: -1 + NumTotalPackedLoadsB: -1 + NumTotalPackedLoadsMXSA: -1 + NumTotalPackedLoadsMXSB: -1 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGROverBarrier: 0 + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 27 + 
SolutionNameMin: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT256x320x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT8_10_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM16_WGMXCC2_WGMXCCGn1 + SourceSwap: false + SpaceFillingAlgo: [] + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: true + StoreSyncOpt: 0 + StoreVectorWidth: 4 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SubGroupMXSA: 8 + SubGroupMXSB: 32 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: 0 + TailloopInNll: false + ThreadTile: [1, 1] + ThreadTile0: 32 + ThreadTile1: 10 + ThreadTileA: 32 + ThreadTileB: 10 + ThreadTileMXSA: 32 + ThreadTileMXSB: 10 + TransposeLDS: 1 + TransposeLDSMetadata: -1 + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMXSA: true + UnrollMajorLDSMXSB: true + Use64bShadowLimit: false + Use64bShadowLimitMX: false + UseCustomMainLoopSchedule: 0 + UseDirect32XEmulation: false + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: false + UseGeneralizedNLCOneA: false + UseGeneralizedNLCOneB: false + UseGeneralizedNLCOneMXSA: false + UseGeneralizedNLCOneMXSB: false + UseGeneralizedNLCOneMetadata: false + UseInstOffsetForGRO: 0 + UseMFMAF32XEmulation: false + UsePLRPack: 0 + UseSgprForGRO: -1 + UseSubtileImpl: true + Valid: true + VectorStore: -1 + VectorWidthA: 1 
+ VectorWidthB: 1 + VectorWidthMXSA: 1 + VectorWidthMXSB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMXSA: 0 + WaveSeparateGlobalReadMXSB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 16 + WorkGroupMappingXCC: 2 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 256 + _DepthUA: 256 + _DepthUB: 256 + _DepthUMXSA: 8 + _DepthUMXSB: 8 + _DepthUMetadata: 256 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + enableLDSTrMXSA: false + enableLDSTrMXSB: false + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + tailLoopOptMXSA: false + tailLoopOptMXSB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AdaptiveGemm: 0 + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1DivByMT1LowbitGT1: 0 + AssertFree1ElementMultiple: 1 + AssertKRingShiftTailWrapOnly: 0 + AssertSummationElementMultiple: 256 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BAddrInterleave: false + BaseName: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT64IqbWxZkPcZWy3vA8g9HsDHfQ7n72vJptvmQ79eUQ7kk= + BufferLoad: true + BufferStore: 1 + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 256 + DirectToLds: 1 + DirectToLdsA: true + DirectToLdsB: true + DirectToLdsMXSA: true + DirectToLdsMXSB: true + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprMXSA: false + DirectToVgprMXSB: false + DirectToVgprSparseMetadata: false 
+ DtlPlusLdsBuf: 0 + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: false + ExpertSchedulingMode: 0 + ExtraLatencyForLR: 0 + ExtraMiLatencyLeft: -1 + ForceDisableShadowInit: false + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 32 + GlobalReadVectorWidthB: 32 + GlobalReadVectorWidthMXSA: 2 + GlobalReadVectorWidthMXSB: 14 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + KRingShift: false + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT64x448x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT2_14_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 256 + LSCB: 256 + LSCMXSA: 8 + LSCMXSB: 8 + LSPA: 32 + LSPB: 32 + LSPMXSA: 64 + LSPMXSB: 448 + LVCA: 8 + LVCB: 8 + LVCMXSA: 4 + LVCMXSB: 1 + LVPA: 1 + LVPB: 1 + LVPMXSA: 32 + LVPMXSB: 32 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsBlockSizePerPadMXSA: 0 + LdsBlockSizePerPadMXSB: 0 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 139264 + LdsInitCVgprs: false + LdsNumBytes: 139264 + 
LdsNumElementsAlignedA: 8192 + LdsNumElementsAlignedB: 57344 + LdsNumElementsAlignedMXSA: 512 + LdsNumElementsAlignedMXSB: 3584 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 69632 + LdsOffsetB: 12288 + LdsOffsetB_Blk: 81920 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMXSA: 8192 + LdsOffsetMXSA_Blk: 77824 + LdsOffsetMXSB: 8704 + LdsOffsetMXSB_Blk: 78336 + LdsOffsetMetadata: 12288 + LdsOffsetMetadata_Blk: 81920 + LdsPadA: 0 + LdsPadB: 0 + LdsPadMXSA: 0 + LdsPadMXSB: 0 + LdsPadMetadata: 0 + LocalReadVectorWidth: -1 + LocalReadVectorWidthA: 32 + LocalReadVectorWidthB: 32 + LocalReadVectorWidthMXSA: 1 + LocalReadVectorWidthMXSB: 1 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LocalWriteUseSgprMXSA: true + LocalWriteUseSgprMXSB: true + LoopIters: 2 + LoopUnroll: 256 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMXSA: 1 + MIInputPerThreadMXSB: 1 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [2, 14] + MIWaveTileA: 2 + MIWaveTileB: 14 + MIWaveTileMXSA: 2 + MIWaveTileMXSB: 14 + MIWaveTileMetadata: 0 + MacroTile0: 64 + MacroTile1: 448 + MacroTileA: 64 + MacroTileB: 448 + MacroTileMXSA: 64 + MacroTileMXSB: 448 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + MinGRIncPerMfma: 3 + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonDTLTailLoopMXSA: true + NonDTLTailLoopMXSB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 4 + NonTemporalE: 0 + 
NonTemporalMXSA: 0 + NonTemporalMXSB: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 112 + NumGlobalWriteVectorsPerThread: 112 + NumLdsBlk: 2 + NumLoadsA: 2 + NumLoadsB: 14 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsCoalescedMXSA: 1 + NumLoadsCoalescedMXSB: 1 + NumLoadsMXSA: 1 + NumLoadsMXSB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 14 + NumLoadsPerpendicularMXSA: 1 + NumLoadsPerpendicularMXSB: 1 + NumThreads: 256 + NumTotalPackedLoadsA: -1 + NumTotalPackedLoadsB: -1 + NumTotalPackedLoadsMXSA: -1 + NumTotalPackedLoadsMXSB: -1 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGROverBarrier: 0 + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 28 + SolutionNameMin: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT64x448x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT2_14_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM16_WGMXCC2_WGMXCCGn1 + SourceSwap: false + SpaceFillingAlgo: [] + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: true + StoreSyncOpt: 0 + StoreVectorWidth: 4 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + 
SubGroupMXSA: 8 + SubGroupMXSB: 32 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: 0 + TailloopInNll: false + ThreadTile: [1, 1] + ThreadTile0: 8 + ThreadTile1: 14 + ThreadTileA: 8 + ThreadTileB: 14 + ThreadTileMXSA: 8 + ThreadTileMXSB: 14 + TransposeLDS: 1 + TransposeLDSMetadata: -1 + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMXSA: true + UnrollMajorLDSMXSB: true + Use64bShadowLimit: false + Use64bShadowLimitMX: false + UseCustomMainLoopSchedule: 0 + UseDirect32XEmulation: false + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: false + UseGeneralizedNLCOneA: false + UseGeneralizedNLCOneB: false + UseGeneralizedNLCOneMXSA: false + UseGeneralizedNLCOneMXSB: false + UseGeneralizedNLCOneMetadata: false + UseInstOffsetForGRO: 0 + UseMFMAF32XEmulation: false + UsePLRPack: 0 + UseSgprForGRO: -1 + UseSubtileImpl: true + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 1 + VectorWidthMXSA: 1 + VectorWidthMXSB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMXSA: 0 + WaveSeparateGlobalReadMXSB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 16 + WorkGroupMappingXCC: 2 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 256 + _DepthUA: 256 + _DepthUB: 256 + _DepthUMXSA: 8 + _DepthUMXSB: 8 + _DepthUMetadata: 256 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + enableLDSTrMXSA: false + enableLDSTrMXSB: false + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + tailLoopOptMXSA: false + tailLoopOptMXSB: false + - 1LDSBuffer: 0 + 
ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AdaptiveGemm: 0 + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1DivByMT1LowbitGT1: 0 + AssertFree1ElementMultiple: 1 + AssertKRingShiftTailWrapOnly: 0 + AssertSummationElementMultiple: 256 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BAddrInterleave: false + BaseName: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT44JIOkXLbpjhS22SFIEupmgY9bhtzcXu69kfhwBJBO5wo= + BufferLoad: true + BufferStore: 1 + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 256 + DirectToLds: 1 + DirectToLdsA: true + DirectToLdsB: true + DirectToLdsMXSA: true + DirectToLdsMXSB: true + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprMXSA: false + DirectToVgprMXSB: false + DirectToVgprSparseMetadata: false + DtlPlusLdsBuf: 0 + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: false + ExpertSchedulingMode: 0 + ExtraLatencyForLR: 0 + ExtraMiLatencyLeft: -1 + ForceDisableShadowInit: false + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 32 + GlobalReadVectorWidthB: 32 + GlobalReadVectorWidthMXSA: 14 + GlobalReadVectorWidthMXSB: 2 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + KRingShift: false + Kernel: true + KernelLanguage: Assembly + KernelNameMin: 
Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT448x64x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT14_2_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 256 + LSCB: 256 + LSCMXSA: 8 + LSCMXSB: 8 + LSPA: 32 + LSPB: 32 + LSPMXSA: 448 + LSPMXSB: 64 + LVCA: 8 + LVCB: 8 + LVCMXSA: 1 + LVCMXSB: 4 + LVPA: 1 + LVPB: 1 + LVPMXSA: 32 + LVPMXSB: 32 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsBlockSizePerPadMXSA: 0 + LdsBlockSizePerPadMXSB: 0 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 139264 + LdsInitCVgprs: false + LdsNumBytes: 139264 + LdsNumElementsAlignedA: 57344 + LdsNumElementsAlignedB: 8192 + LdsNumElementsAlignedMXSA: 3584 + LdsNumElementsAlignedMXSB: 512 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 69632 + LdsOffsetB: 61440 + LdsOffsetB_Blk: 131072 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMXSA: 57344 + LdsOffsetMXSA_Blk: 126976 + LdsOffsetMXSB: 60928 + LdsOffsetMXSB_Blk: 130560 + LdsOffsetMetadata: 61440 + LdsOffsetMetadata_Blk: 131072 + LdsPadA: 0 + LdsPadB: 0 + LdsPadMXSA: 0 + LdsPadMXSB: 0 + LdsPadMetadata: 0 + LocalReadVectorWidth: -1 + LocalReadVectorWidthA: 32 + LocalReadVectorWidthB: 32 + LocalReadVectorWidthMXSA: 1 + LocalReadVectorWidthMXSB: 1 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LocalWriteUseSgprMXSA: true + LocalWriteUseSgprMXSB: true + LoopIters: 2 + LoopUnroll: 256 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + 
MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMXSA: 1 + MIInputPerThreadMXSB: 1 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [14, 2] + MIWaveTileA: 14 + MIWaveTileB: 2 + MIWaveTileMXSA: 14 + MIWaveTileMXSB: 2 + MIWaveTileMetadata: 0 + MacroTile0: 448 + MacroTile1: 64 + MacroTileA: 448 + MacroTileB: 64 + MacroTileMXSA: 448 + MacroTileMXSB: 64 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + MinGRIncPerMfma: 3 + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonDTLTailLoopMXSA: true + NonDTLTailLoopMXSB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 4 + NonTemporalE: 0 + NonTemporalMXSA: 0 + NonTemporalMXSB: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 112 + NumGlobalWriteVectorsPerThread: 112 + NumLdsBlk: 2 + NumLoadsA: 14 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsCoalescedMXSA: 1 + NumLoadsCoalescedMXSB: 1 + NumLoadsMXSA: 1 + NumLoadsMXSB: 1 + NumLoadsPerpendicularA: 14 + NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularMXSA: 1 + NumLoadsPerpendicularMXSB: 1 + NumThreads: 256 + NumTotalPackedLoadsA: -1 + NumTotalPackedLoadsB: -1 + NumTotalPackedLoadsMXSA: -1 + NumTotalPackedLoadsMXSB: -1 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGROverBarrier: 0 + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 29 + 
SolutionNameMin: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT448x64x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT14_2_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM16_WGMXCC2_WGMXCCGn1 + SourceSwap: false + SpaceFillingAlgo: [] + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: true + StoreSyncOpt: 0 + StoreVectorWidth: 4 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SubGroupMXSA: 8 + SubGroupMXSB: 32 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: 0 + TailloopInNll: false + ThreadTile: [1, 1] + ThreadTile0: 56 + ThreadTile1: 2 + ThreadTileA: 56 + ThreadTileB: 2 + ThreadTileMXSA: 56 + ThreadTileMXSB: 2 + TransposeLDS: 1 + TransposeLDSMetadata: -1 + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMXSA: true + UnrollMajorLDSMXSB: true + Use64bShadowLimit: false + Use64bShadowLimitMX: false + UseCustomMainLoopSchedule: 0 + UseDirect32XEmulation: false + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: false + UseGeneralizedNLCOneA: false + UseGeneralizedNLCOneB: false + UseGeneralizedNLCOneMXSA: false + UseGeneralizedNLCOneMXSB: false + UseGeneralizedNLCOneMetadata: false + UseInstOffsetForGRO: 0 + UseMFMAF32XEmulation: false + UsePLRPack: 0 + UseSgprForGRO: -1 + UseSubtileImpl: true + Valid: true + VectorStore: -1 + VectorWidthA: 1 + 
VectorWidthB: 1 + VectorWidthMXSA: 1 + VectorWidthMXSB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMXSA: 0 + WaveSeparateGlobalReadMXSB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 16 + WorkGroupMappingXCC: 2 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 256 + _DepthUA: 256 + _DepthUB: 256 + _DepthUMXSA: 8 + _DepthUMXSB: 8 + _DepthUMetadata: 256 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + enableLDSTrMXSA: false + enableLDSTrMXSB: false + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + tailLoopOptMXSA: false + tailLoopOptMXSB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AdaptiveGemm: 0 + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1DivByMT1LowbitGT1: 0 + AssertFree1ElementMultiple: 1 + AssertKRingShiftTailWrapOnly: 0 + AssertSummationElementMultiple: 256 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BAddrInterleave: false + BaseName: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT12NCUb78G7QzN6ng2b3-8rphTRtJXo20S1KvmwH6k1FLQ= + BufferLoad: true + BufferStore: 1 + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 256 + DirectToLds: 1 + DirectToLdsA: true + DirectToLdsB: true + DirectToLdsMXSA: true + DirectToLdsMXSB: true + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprMXSA: false + DirectToVgprMXSB: false + DirectToVgprSparseMetadata: false + 
DtlPlusLdsBuf: 0 + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: false + ExpertSchedulingMode: 0 + ExtraLatencyForLR: 0 + ExtraMiLatencyLeft: -1 + ForceDisableShadowInit: false + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 32 + GlobalReadVectorWidthB: 32 + GlobalReadVectorWidthMXSA: 4 + GlobalReadVectorWidthMXSB: 14 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + KRingShift: false + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT128x448x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT4_14_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 256 + LSCB: 256 + LSCMXSA: 8 + LSCMXSB: 8 + LSPA: 32 + LSPB: 32 + LSPMXSA: 128 + LSPMXSB: 448 + LVCA: 8 + LVCB: 8 + LVCMXSA: 2 + LVCMXSB: 1 + LVPA: 1 + LVPB: 1 + LVPMXSA: 32 + LVPMXSB: 32 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsBlockSizePerPadMXSA: 0 + LdsBlockSizePerPadMXSB: 0 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 156672 + LdsInitCVgprs: false + LdsNumBytes: 156672 + 
LdsNumElementsAlignedA: 16384 + LdsNumElementsAlignedB: 57344 + LdsNumElementsAlignedMXSA: 1024 + LdsNumElementsAlignedMXSB: 3584 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 78336 + LdsOffsetB: 20992 + LdsOffsetB_Blk: 99328 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMXSA: 16384 + LdsOffsetMXSA_Blk: 94720 + LdsOffsetMXSB: 17408 + LdsOffsetMXSB_Blk: 95744 + LdsOffsetMetadata: 20992 + LdsOffsetMetadata_Blk: 99328 + LdsPadA: 0 + LdsPadB: 0 + LdsPadMXSA: 0 + LdsPadMXSB: 0 + LdsPadMetadata: 0 + LocalReadVectorWidth: -1 + LocalReadVectorWidthA: 32 + LocalReadVectorWidthB: 32 + LocalReadVectorWidthMXSA: 1 + LocalReadVectorWidthMXSB: 1 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LocalWriteUseSgprMXSA: true + LocalWriteUseSgprMXSB: true + LoopIters: 2 + LoopUnroll: 256 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMXSA: 1 + MIInputPerThreadMXSB: 1 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 14] + MIWaveTileA: 4 + MIWaveTileB: 14 + MIWaveTileMXSA: 4 + MIWaveTileMXSB: 14 + MIWaveTileMetadata: 0 + MacroTile0: 128 + MacroTile1: 448 + MacroTileA: 128 + MacroTileB: 448 + MacroTileMXSA: 128 + MacroTileMXSB: 448 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + MinGRIncPerMfma: 3 + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonDTLTailLoopMXSA: true + NonDTLTailLoopMXSB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 4 + 
NonTemporalE: 0 + NonTemporalMXSA: 0 + NonTemporalMXSB: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 224 + NumGlobalWriteVectorsPerThread: 224 + NumLdsBlk: 2 + NumLoadsA: 4 + NumLoadsB: 14 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsCoalescedMXSA: 1 + NumLoadsCoalescedMXSB: 1 + NumLoadsMXSA: 1 + NumLoadsMXSB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 14 + NumLoadsPerpendicularMXSA: 1 + NumLoadsPerpendicularMXSB: 1 + NumThreads: 256 + NumTotalPackedLoadsA: -1 + NumTotalPackedLoadsB: -1 + NumTotalPackedLoadsMXSA: -1 + NumTotalPackedLoadsMXSB: -1 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGROverBarrier: 0 + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 30 + SolutionNameMin: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT128x448x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT4_14_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM16_WGMXCC2_WGMXCCGn1 + SourceSwap: false + SpaceFillingAlgo: [] + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: true + StoreSyncOpt: 0 + StoreVectorWidth: 4 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + 
SubGroupB: 32 + SubGroupMXSA: 8 + SubGroupMXSB: 32 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: 0 + TailloopInNll: false + ThreadTile: [1, 1] + ThreadTile0: 16 + ThreadTile1: 14 + ThreadTileA: 16 + ThreadTileB: 14 + ThreadTileMXSA: 16 + ThreadTileMXSB: 14 + TransposeLDS: 1 + TransposeLDSMetadata: -1 + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMXSA: true + UnrollMajorLDSMXSB: true + Use64bShadowLimit: false + Use64bShadowLimitMX: false + UseCustomMainLoopSchedule: 0 + UseDirect32XEmulation: false + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: false + UseGeneralizedNLCOneA: false + UseGeneralizedNLCOneB: false + UseGeneralizedNLCOneMXSA: false + UseGeneralizedNLCOneMXSB: false + UseGeneralizedNLCOneMetadata: false + UseInstOffsetForGRO: 0 + UseMFMAF32XEmulation: false + UsePLRPack: 0 + UseSgprForGRO: -1 + UseSubtileImpl: true + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 1 + VectorWidthMXSA: 1 + VectorWidthMXSB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMXSA: 0 + WaveSeparateGlobalReadMXSB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 16 + WorkGroupMappingXCC: 2 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 256 + _DepthUA: 256 + _DepthUB: 256 + _DepthUMXSA: 8 + _DepthUMXSB: 8 + _DepthUMetadata: 256 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + enableLDSTrMXSA: false + enableLDSTrMXSB: false + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + tailLoopOptMXSA: false + tailLoopOptMXSB: false + - 
1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AdaptiveGemm: 0 + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1DivByMT1LowbitGT1: 0 + AssertFree1ElementMultiple: 1 + AssertKRingShiftTailWrapOnly: 0 + AssertSummationElementMultiple: 256 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BAddrInterleave: false + BaseName: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT44K2rcrzW2gkNghqcoHIxk3lU0xSXGNO6q0t-lzqdm54M= + BufferLoad: true + BufferStore: 1 + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 256 + DirectToLds: 1 + DirectToLdsA: true + DirectToLdsB: true + DirectToLdsMXSA: true + DirectToLdsMXSB: true + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprMXSA: false + DirectToVgprMXSB: false + DirectToVgprSparseMetadata: false + DtlPlusLdsBuf: 0 + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: false + ExpertSchedulingMode: 0 + ExtraLatencyForLR: 0 + ExtraMiLatencyLeft: -1 + ForceDisableShadowInit: false + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 32 + GlobalReadVectorWidthB: 32 + GlobalReadVectorWidthMXSA: 14 + GlobalReadVectorWidthMXSB: 4 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + KRingShift: false + Kernel: true + KernelLanguage: Assembly + KernelNameMin: 
Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT448x128x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT14_4_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 256 + LSCB: 256 + LSCMXSA: 8 + LSCMXSB: 8 + LSPA: 32 + LSPB: 32 + LSPMXSA: 448 + LSPMXSB: 128 + LVCA: 8 + LVCB: 8 + LVCMXSA: 1 + LVCMXSB: 2 + LVPA: 1 + LVPB: 1 + LVPMXSA: 32 + LVPMXSB: 32 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsBlockSizePerPadMXSA: 0 + LdsBlockSizePerPadMXSB: 0 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 156672 + LdsInitCVgprs: false + LdsNumBytes: 156672 + LdsNumElementsAlignedA: 57344 + LdsNumElementsAlignedB: 16384 + LdsNumElementsAlignedMXSA: 3584 + LdsNumElementsAlignedMXSB: 1024 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 78336 + LdsOffsetB: 61952 + LdsOffsetB_Blk: 140288 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMXSA: 57344 + LdsOffsetMXSA_Blk: 135680 + LdsOffsetMXSB: 60928 + LdsOffsetMXSB_Blk: 139264 + LdsOffsetMetadata: 61952 + LdsOffsetMetadata_Blk: 140288 + LdsPadA: 0 + LdsPadB: 0 + LdsPadMXSA: 0 + LdsPadMXSB: 0 + LdsPadMetadata: 0 + LocalReadVectorWidth: -1 + LocalReadVectorWidthA: 32 + LocalReadVectorWidthB: 32 + LocalReadVectorWidthMXSA: 1 + LocalReadVectorWidthMXSB: 1 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LocalWriteUseSgprMXSA: true + LocalWriteUseSgprMXSB: true + LoopIters: 2 + LoopUnroll: 256 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + 
MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMXSA: 1 + MIInputPerThreadMXSB: 1 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [14, 4] + MIWaveTileA: 14 + MIWaveTileB: 4 + MIWaveTileMXSA: 14 + MIWaveTileMXSB: 4 + MIWaveTileMetadata: 0 + MacroTile0: 448 + MacroTile1: 128 + MacroTileA: 448 + MacroTileB: 128 + MacroTileMXSA: 448 + MacroTileMXSB: 128 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + MinGRIncPerMfma: 3 + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonDTLTailLoopMXSA: true + NonDTLTailLoopMXSB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 4 + NonTemporalE: 0 + NonTemporalMXSA: 0 + NonTemporalMXSB: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 224 + NumGlobalWriteVectorsPerThread: 224 + NumLdsBlk: 2 + NumLoadsA: 14 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsCoalescedMXSA: 1 + NumLoadsCoalescedMXSB: 1 + NumLoadsMXSA: 1 + NumLoadsMXSB: 1 + NumLoadsPerpendicularA: 14 + NumLoadsPerpendicularB: 4 + NumLoadsPerpendicularMXSA: 1 + NumLoadsPerpendicularMXSB: 1 + NumThreads: 256 + NumTotalPackedLoadsA: -1 + NumTotalPackedLoadsB: -1 + NumTotalPackedLoadsMXSA: -1 + NumTotalPackedLoadsMXSB: -1 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGROverBarrier: 0 + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 31 + 
SolutionNameMin: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT448x128x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT14_4_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM16_WGMXCC2_WGMXCCGn1 + SourceSwap: false + SpaceFillingAlgo: [] + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: true + StoreSyncOpt: 0 + StoreVectorWidth: 4 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SubGroupMXSA: 8 + SubGroupMXSB: 32 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: 0 + TailloopInNll: false + ThreadTile: [1, 1] + ThreadTile0: 56 + ThreadTile1: 4 + ThreadTileA: 56 + ThreadTileB: 4 + ThreadTileMXSA: 56 + ThreadTileMXSB: 4 + TransposeLDS: 1 + TransposeLDSMetadata: -1 + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMXSA: true + UnrollMajorLDSMXSB: true + Use64bShadowLimit: false + Use64bShadowLimitMX: false + UseCustomMainLoopSchedule: 0 + UseDirect32XEmulation: false + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: false + UseGeneralizedNLCOneA: false + UseGeneralizedNLCOneB: false + UseGeneralizedNLCOneMXSA: false + UseGeneralizedNLCOneMXSB: false + UseGeneralizedNLCOneMetadata: false + UseInstOffsetForGRO: 0 + UseMFMAF32XEmulation: false + UsePLRPack: 0 + UseSgprForGRO: -1 + UseSubtileImpl: true + Valid: true + VectorStore: -1 + VectorWidthA: 1 + 
VectorWidthB: 1 + VectorWidthMXSA: 1 + VectorWidthMXSB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMXSA: 0 + WaveSeparateGlobalReadMXSB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 16 + WorkGroupMappingXCC: 2 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 256 + _DepthUA: 256 + _DepthUB: 256 + _DepthUMXSA: 8 + _DepthUMXSB: 8 + _DepthUMetadata: 256 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + enableLDSTrMXSA: false + enableLDSTrMXSB: false + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + tailLoopOptMXSA: false + tailLoopOptMXSB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AdaptiveGemm: 0 + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1DivByMT1LowbitGT1: 0 + AssertFree1ElementMultiple: 1 + AssertKRingShiftTailWrapOnly: 0 + AssertSummationElementMultiple: 256 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BAddrInterleave: false + BaseName: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT197rqXtfRTrgckooWIHV9QRZkAUvBYDajyR7fyXPSaOfI= + BufferLoad: true + BufferStore: 1 + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 256 + DirectToLds: 1 + DirectToLdsA: true + DirectToLdsB: true + DirectToLdsMXSA: true + DirectToLdsMXSB: true + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprMXSA: false + DirectToVgprMXSB: false + DirectToVgprSparseMetadata: false + 
DtlPlusLdsBuf: 0 + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: false + ExpertSchedulingMode: 0 + ExtraLatencyForLR: 0 + ExtraMiLatencyLeft: -1 + ForceDisableShadowInit: false + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 32 + GlobalReadVectorWidthB: 32 + GlobalReadVectorWidthMXSA: 6 + GlobalReadVectorWidthMXSB: 12 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + KRingShift: false + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT192x384x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT6_12_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 256 + LSCB: 256 + LSCMXSA: 8 + LSCMXSB: 8 + LSPA: 32 + LSPB: 32 + LSPMXSA: 192 + LSPMXSB: 384 + LVCA: 8 + LVCB: 8 + LVCMXSA: 2 + LVCMXSB: 1 + LVPA: 1 + LVPB: 1 + LVPMXSA: 32 + LVPMXSB: 32 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsBlockSizePerPadMXSA: 0 + LdsBlockSizePerPadMXSB: 0 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 156672 + LdsInitCVgprs: false + LdsNumBytes: 156672 + 
LdsNumElementsAlignedA: 24576 + LdsNumElementsAlignedB: 49152 + LdsNumElementsAlignedMXSA: 1536 + LdsNumElementsAlignedMXSB: 3072 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 78336 + LdsOffsetB: 29184 + LdsOffsetB_Blk: 107520 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMXSA: 24576 + LdsOffsetMXSA_Blk: 102912 + LdsOffsetMXSB: 26112 + LdsOffsetMXSB_Blk: 104448 + LdsOffsetMetadata: 29184 + LdsOffsetMetadata_Blk: 107520 + LdsPadA: 0 + LdsPadB: 0 + LdsPadMXSA: 0 + LdsPadMXSB: 0 + LdsPadMetadata: 0 + LocalReadVectorWidth: -1 + LocalReadVectorWidthA: 32 + LocalReadVectorWidthB: 32 + LocalReadVectorWidthMXSA: 1 + LocalReadVectorWidthMXSB: 1 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LocalWriteUseSgprMXSA: true + LocalWriteUseSgprMXSB: true + LoopIters: 2 + LoopUnroll: 256 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMXSA: 1 + MIInputPerThreadMXSB: 1 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [6, 12] + MIWaveTileA: 6 + MIWaveTileB: 12 + MIWaveTileMXSA: 6 + MIWaveTileMXSB: 12 + MIWaveTileMetadata: 0 + MacroTile0: 192 + MacroTile1: 384 + MacroTileA: 192 + MacroTileB: 384 + MacroTileMXSA: 192 + MacroTileMXSB: 384 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + MinGRIncPerMfma: 3 + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonDTLTailLoopMXSA: true + NonDTLTailLoopMXSB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 4 + 
NonTemporalE: 0 + NonTemporalMXSA: 0 + NonTemporalMXSB: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 288 + NumGlobalWriteVectorsPerThread: 288 + NumLdsBlk: 2 + NumLoadsA: 6 + NumLoadsB: 12 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsCoalescedMXSA: 1 + NumLoadsCoalescedMXSB: 1 + NumLoadsMXSA: 1 + NumLoadsMXSB: 1 + NumLoadsPerpendicularA: 6 + NumLoadsPerpendicularB: 12 + NumLoadsPerpendicularMXSA: 1 + NumLoadsPerpendicularMXSB: 1 + NumThreads: 256 + NumTotalPackedLoadsA: -1 + NumTotalPackedLoadsB: -1 + NumTotalPackedLoadsMXSA: -1 + NumTotalPackedLoadsMXSB: -1 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGROverBarrier: 0 + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 32 + SolutionNameMin: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT192x384x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT6_12_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM16_WGMXCC2_WGMXCCGn1 + SourceSwap: false + SpaceFillingAlgo: [] + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: true + StoreSyncOpt: 0 + StoreVectorWidth: 4 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + 
SubGroupB: 32 + SubGroupMXSA: 8 + SubGroupMXSB: 32 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: 0 + TailloopInNll: false + ThreadTile: [1, 1] + ThreadTile0: 24 + ThreadTile1: 12 + ThreadTileA: 24 + ThreadTileB: 12 + ThreadTileMXSA: 24 + ThreadTileMXSB: 12 + TransposeLDS: 1 + TransposeLDSMetadata: -1 + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMXSA: true + UnrollMajorLDSMXSB: true + Use64bShadowLimit: false + Use64bShadowLimitMX: false + UseCustomMainLoopSchedule: 0 + UseDirect32XEmulation: false + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: false + UseGeneralizedNLCOneA: false + UseGeneralizedNLCOneB: false + UseGeneralizedNLCOneMXSA: false + UseGeneralizedNLCOneMXSB: false + UseGeneralizedNLCOneMetadata: false + UseInstOffsetForGRO: 0 + UseMFMAF32XEmulation: false + UsePLRPack: 0 + UseSgprForGRO: -1 + UseSubtileImpl: true + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 1 + VectorWidthMXSA: 1 + VectorWidthMXSB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMXSA: 0 + WaveSeparateGlobalReadMXSB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 16 + WorkGroupMappingXCC: 2 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 256 + _DepthUA: 256 + _DepthUB: 256 + _DepthUMXSA: 8 + _DepthUMXSB: 8 + _DepthUMetadata: 256 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + enableLDSTrMXSA: false + enableLDSTrMXSB: false + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + tailLoopOptMXSA: false + tailLoopOptMXSB: false + - 
1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AdaptiveGemm: 0 + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1DivByMT1LowbitGT1: 0 + AssertFree1ElementMultiple: 1 + AssertKRingShiftTailWrapOnly: 0 + AssertSummationElementMultiple: 256 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BAddrInterleave: false + BaseName: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT38M4Hg58ewW76BBGjsIKCCfd9HK3CXsd22WqnddQrMdIo= + BufferLoad: true + BufferStore: 1 + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 256 + DirectToLds: 1 + DirectToLdsA: true + DirectToLdsB: true + DirectToLdsMXSA: true + DirectToLdsMXSB: true + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprMXSA: false + DirectToVgprMXSB: false + DirectToVgprSparseMetadata: false + DtlPlusLdsBuf: 0 + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: false + ExpertSchedulingMode: 0 + ExtraLatencyForLR: 0 + ExtraMiLatencyLeft: -1 + ForceDisableShadowInit: false + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 32 + GlobalReadVectorWidthB: 32 + GlobalReadVectorWidthMXSA: 12 + GlobalReadVectorWidthMXSB: 6 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + KRingShift: false + Kernel: true + KernelLanguage: Assembly + KernelNameMin: 
Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT384x192x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT12_6_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 256 + LSCB: 256 + LSCMXSA: 8 + LSCMXSB: 8 + LSPA: 32 + LSPB: 32 + LSPMXSA: 384 + LSPMXSB: 192 + LVCA: 8 + LVCB: 8 + LVCMXSA: 1 + LVCMXSB: 2 + LVPA: 1 + LVPB: 1 + LVPMXSA: 32 + LVPMXSB: 32 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsBlockSizePerPadMXSA: 0 + LdsBlockSizePerPadMXSB: 0 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 156672 + LdsInitCVgprs: false + LdsNumBytes: 156672 + LdsNumElementsAlignedA: 49152 + LdsNumElementsAlignedB: 24576 + LdsNumElementsAlignedMXSA: 3072 + LdsNumElementsAlignedMXSB: 1536 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 78336 + LdsOffsetB: 53760 + LdsOffsetB_Blk: 132096 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMXSA: 49152 + LdsOffsetMXSA_Blk: 127488 + LdsOffsetMXSB: 52224 + LdsOffsetMXSB_Blk: 130560 + LdsOffsetMetadata: 53760 + LdsOffsetMetadata_Blk: 132096 + LdsPadA: 0 + LdsPadB: 0 + LdsPadMXSA: 0 + LdsPadMXSB: 0 + LdsPadMetadata: 0 + LocalReadVectorWidth: -1 + LocalReadVectorWidthA: 32 + LocalReadVectorWidthB: 32 + LocalReadVectorWidthMXSA: 1 + LocalReadVectorWidthMXSB: 1 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LocalWriteUseSgprMXSA: true + LocalWriteUseSgprMXSB: true + LoopIters: 2 + LoopUnroll: 256 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + 
MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMXSA: 1 + MIInputPerThreadMXSB: 1 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [12, 6] + MIWaveTileA: 12 + MIWaveTileB: 6 + MIWaveTileMXSA: 12 + MIWaveTileMXSB: 6 + MIWaveTileMetadata: 0 + MacroTile0: 384 + MacroTile1: 192 + MacroTileA: 384 + MacroTileB: 192 + MacroTileMXSA: 384 + MacroTileMXSB: 192 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + MinGRIncPerMfma: 3 + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonDTLTailLoopMXSA: true + NonDTLTailLoopMXSB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 4 + NonTemporalE: 0 + NonTemporalMXSA: 0 + NonTemporalMXSB: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 288 + NumGlobalWriteVectorsPerThread: 288 + NumLdsBlk: 2 + NumLoadsA: 12 + NumLoadsB: 6 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsCoalescedMXSA: 1 + NumLoadsCoalescedMXSB: 1 + NumLoadsMXSA: 1 + NumLoadsMXSB: 1 + NumLoadsPerpendicularA: 12 + NumLoadsPerpendicularB: 6 + NumLoadsPerpendicularMXSA: 1 + NumLoadsPerpendicularMXSB: 1 + NumThreads: 256 + NumTotalPackedLoadsA: -1 + NumTotalPackedLoadsB: -1 + NumTotalPackedLoadsMXSA: -1 + NumTotalPackedLoadsMXSB: -1 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGROverBarrier: 0 + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 33 + 
SolutionNameMin: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT384x192x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT12_6_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM16_WGMXCC2_WGMXCCGn1 + SourceSwap: false + SpaceFillingAlgo: [] + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: true + StoreSyncOpt: 0 + StoreVectorWidth: 4 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SubGroupMXSA: 8 + SubGroupMXSB: 32 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: 0 + TailloopInNll: false + ThreadTile: [1, 1] + ThreadTile0: 48 + ThreadTile1: 6 + ThreadTileA: 48 + ThreadTileB: 6 + ThreadTileMXSA: 48 + ThreadTileMXSB: 6 + TransposeLDS: 1 + TransposeLDSMetadata: -1 + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMXSA: true + UnrollMajorLDSMXSB: true + Use64bShadowLimit: false + Use64bShadowLimitMX: false + UseCustomMainLoopSchedule: 0 + UseDirect32XEmulation: false + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: false + UseGeneralizedNLCOneA: false + UseGeneralizedNLCOneB: false + UseGeneralizedNLCOneMXSA: false + UseGeneralizedNLCOneMXSB: false + UseGeneralizedNLCOneMetadata: false + UseInstOffsetForGRO: 0 + UseMFMAF32XEmulation: false + UsePLRPack: 0 + UseSgprForGRO: -1 + UseSubtileImpl: true + Valid: true + VectorStore: -1 + VectorWidthA: 1 + 
VectorWidthB: 1 + VectorWidthMXSA: 1 + VectorWidthMXSB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMXSA: 0 + WaveSeparateGlobalReadMXSB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 16 + WorkGroupMappingXCC: 2 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 256 + _DepthUA: 256 + _DepthUB: 256 + _DepthUMXSA: 8 + _DepthUMXSB: 8 + _DepthUMetadata: 256 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + enableLDSTrMXSA: false + enableLDSTrMXSB: false + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + tailLoopOptMXSA: false + tailLoopOptMXSB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AdaptiveGemm: 0 + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1DivByMT1LowbitGT1: 0 + AssertFree1ElementMultiple: 1 + AssertKRingShiftTailWrapOnly: 0 + AssertSummationElementMultiple: 256 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BAddrInterleave: false + BaseName: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT12pSBqfd5dzj78eX1t5EQJzYZ5aHFVtQ9uJHS361sw3qY= + BufferLoad: true + BufferStore: 1 + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 256 + DirectToLds: 1 + DirectToLdsA: true + DirectToLdsB: true + DirectToLdsMXSA: true + DirectToLdsMXSB: true + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprMXSA: false + DirectToVgprMXSB: false + DirectToVgprSparseMetadata: false + 
DtlPlusLdsBuf: 0 + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: false + ExpertSchedulingMode: 0 + ExtraLatencyForLR: 0 + ExtraMiLatencyLeft: -1 + ForceDisableShadowInit: false + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 32 + GlobalReadVectorWidthB: 32 + GlobalReadVectorWidthMXSA: 4 + GlobalReadVectorWidthMXSB: 1 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + KRingShift: false + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT128x32x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT2_2_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1 + LDSTrInst: false + LSCA: 256 + LSCB: 256 + LSCMXSA: 8 + LSCMXSB: 8 + LSPA: 32 + LSPB: 32 + LSPMXSA: 128 + LSPMXSB: 32 + LVCA: 8 + LVCB: 8 + LVCMXSA: 2 + LVCMXSB: 8 + LVPA: 1 + LVPB: 1 + LVPMXSA: 32 + LVPMXSB: 32 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsBlockSizePerPadMXSA: 0 + LdsBlockSizePerPadMXSB: 0 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 54528 + LdsInitCVgprs: false + LdsNumBytes: 54528 + 
LdsNumElementsAlignedA: 16384 + LdsNumElementsAlignedB: 4096 + LdsNumElementsAlignedMXSA: 1024 + LdsNumElementsAlignedMXSB: 256 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 32768 + LdsOffsetB: 17664 + LdsOffsetB_Blk: 50432 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMXSA: 16384 + LdsOffsetMXSA_Blk: 49152 + LdsOffsetMXSB: 17408 + LdsOffsetMXSB_Blk: 50176 + LdsOffsetMetadata: 17664 + LdsOffsetMetadata_Blk: 50432 + LdsPadA: 0 + LdsPadB: 0 + LdsPadMXSA: 0 + LdsPadMXSB: 0 + LdsPadMetadata: 0 + LocalReadVectorWidth: -1 + LocalReadVectorWidthA: 32 + LocalReadVectorWidthB: 32 + LocalReadVectorWidthMXSA: 1 + LocalReadVectorWidthMXSB: 1 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LocalWriteUseSgprMXSA: true + LocalWriteUseSgprMXSB: true + LoopIters: 2 + LoopUnroll: 256 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMXSA: 1 + MIInputPerThreadMXSB: 1 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [4, 1] + MIWaveTile: [2, 2] + MIWaveTileA: 2 + MIWaveTileB: 2 + MIWaveTileMXSA: 2 + MIWaveTileMXSB: 2 + MIWaveTileMetadata: 0 + MacroTile0: 128 + MacroTile1: 32 + MacroTileA: 128 + MacroTileB: 32 + MacroTileMXSA: 128 + MacroTileMXSB: 32 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + MinGRIncPerMfma: 3 + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonDTLTailLoopMXSA: true + NonDTLTailLoopMXSB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 4 + NonTemporalE: 0 + 
NonTemporalMXSA: 0 + NonTemporalMXSB: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 16 + NumLdsBlk: 2 + NumLoadsA: 4 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsCoalescedMXSA: 1 + NumLoadsCoalescedMXSB: 1 + NumLoadsMXSA: 1 + NumLoadsMXSB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularMXSA: 1 + NumLoadsPerpendicularMXSB: 1 + NumThreads: 256 + NumTotalPackedLoadsA: -1 + NumTotalPackedLoadsB: -1 + NumTotalPackedLoadsMXSA: -1 + NumTotalPackedLoadsMXSB: -1 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGROverBarrier: 0 + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 34 + SolutionNameMin: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT128x32x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT2_2_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM16_WGMXCC2_WGMXCCGn1 + SourceSwap: false + SpaceFillingAlgo: [] + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 4 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SubGroupMXSA: 
16 + SubGroupMXSB: 16 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: 0 + TailloopInNll: false + ThreadTile: [1, 1] + ThreadTile0: 8 + ThreadTile1: 2 + ThreadTileA: 8 + ThreadTileB: 2 + ThreadTileMXSA: 8 + ThreadTileMXSB: 2 + TransposeLDS: 1 + TransposeLDSMetadata: -1 + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMXSA: true + UnrollMajorLDSMXSB: true + Use64bShadowLimit: false + Use64bShadowLimitMX: false + UseCustomMainLoopSchedule: 0 + UseDirect32XEmulation: false + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: false + UseGeneralizedNLCOneA: false + UseGeneralizedNLCOneB: false + UseGeneralizedNLCOneMXSA: false + UseGeneralizedNLCOneMXSB: false + UseGeneralizedNLCOneMetadata: false + UseInstOffsetForGRO: 0 + UseMFMAF32XEmulation: false + UsePLRPack: 0 + UseSgprForGRO: -1 + UseSubtileImpl: true + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 1 + VectorWidthMXSA: 1 + VectorWidthMXSB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMXSA: 0 + WaveSeparateGlobalReadMXSB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [64, 4, 1] + WorkGroupMapping: 16 + WorkGroupMappingXCC: 2 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 256 + _DepthUA: 256 + _DepthUB: 256 + _DepthUMXSA: 8 + _DepthUMXSB: 8 + _DepthUMetadata: 256 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + enableLDSTrMXSA: false + enableLDSTrMXSB: false + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + tailLoopOptMXSA: false + tailLoopOptMXSB: false + - 1LDSBuffer: 0 + ActivationAlt: false + 
ActivationFuncCall: false + ActivationFused: true + AdaptiveGemm: 0 + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1DivByMT1LowbitGT1: 0 + AssertFree1ElementMultiple: 1 + AssertKRingShiftTailWrapOnly: 0 + AssertSummationElementMultiple: 256 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BAddrInterleave: false + BaseName: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT12lYJO4P9-MC57Obi2kvcZAOwe_1ysdtWgX7-VIBNgQpw= + BufferLoad: true + BufferStore: 1 + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 256 + DirectToLds: 1 + DirectToLdsA: true + DirectToLdsB: true + DirectToLdsMXSA: true + DirectToLdsMXSB: true + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprMXSA: false + DirectToVgprMXSB: false + DirectToVgprSparseMetadata: false + DtlPlusLdsBuf: 0 + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: false + ExpertSchedulingMode: 0 + ExtraLatencyForLR: 0 + ExtraMiLatencyLeft: -1 + ForceDisableShadowInit: false + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 32 + GlobalReadVectorWidthB: 32 + GlobalReadVectorWidthMXSA: 4 + GlobalReadVectorWidthMXSB: 3 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + KRingShift: false + Kernel: true + KernelLanguage: Assembly + KernelNameMin: 
Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT128x96x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT2_6_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1 + LDSTrInst: false + LSCA: 256 + LSCB: 256 + LSCMXSA: 8 + LSCMXSB: 8 + LSPA: 32 + LSPB: 32 + LSPMXSA: 128 + LSPMXSB: 96 + LVCA: 8 + LVCB: 8 + LVCMXSA: 2 + LVCMXSB: 3 + LVPA: 1 + LVPB: 1 + LVPMXSA: 32 + LVPMXSB: 32 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsBlockSizePerPadMXSA: 0 + LdsBlockSizePerPadMXSB: 0 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 63232 + LdsInitCVgprs: false + LdsNumBytes: 63232 + LdsNumElementsAlignedA: 16384 + LdsNumElementsAlignedB: 12288 + LdsNumElementsAlignedMXSA: 1024 + LdsNumElementsAlignedMXSB: 768 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 32768 + LdsOffsetB: 18176 + LdsOffsetB_Blk: 50944 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMXSA: 16384 + LdsOffsetMXSA_Blk: 49152 + LdsOffsetMXSB: 17408 + LdsOffsetMXSB_Blk: 50176 + LdsOffsetMetadata: 18176 + LdsOffsetMetadata_Blk: 50944 + LdsPadA: 0 + LdsPadB: 0 + LdsPadMXSA: 0 + LdsPadMXSB: 0 + LdsPadMetadata: 0 + LocalReadVectorWidth: -1 + LocalReadVectorWidthA: 32 + LocalReadVectorWidthB: 32 + LocalReadVectorWidthMXSA: 1 + LocalReadVectorWidthMXSB: 1 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LocalWriteUseSgprMXSA: true + LocalWriteUseSgprMXSB: true + LoopIters: 2 + LoopUnroll: 256 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + 
MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMXSA: 1 + MIInputPerThreadMXSB: 1 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [4, 1] + MIWaveTile: [2, 6] + MIWaveTileA: 2 + MIWaveTileB: 6 + MIWaveTileMXSA: 2 + MIWaveTileMXSB: 6 + MIWaveTileMetadata: 0 + MacroTile0: 128 + MacroTile1: 96 + MacroTileA: 128 + MacroTileB: 96 + MacroTileMXSA: 128 + MacroTileMXSB: 96 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + MinGRIncPerMfma: 3 + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonDTLTailLoopMXSA: true + NonDTLTailLoopMXSB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 4 + NonTemporalE: 0 + NonTemporalMXSA: 0 + NonTemporalMXSB: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 48 + NumGlobalWriteVectorsPerThread: 48 + NumLdsBlk: 2 + NumLoadsA: 4 + NumLoadsB: 3 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsCoalescedMXSA: 1 + NumLoadsCoalescedMXSB: 1 + NumLoadsMXSA: 1 + NumLoadsMXSB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 3 + NumLoadsPerpendicularMXSA: 1 + NumLoadsPerpendicularMXSB: 1 + NumThreads: 256 + NumTotalPackedLoadsA: -1 + NumTotalPackedLoadsB: -1 + NumTotalPackedLoadsMXSA: -1 + NumTotalPackedLoadsMXSB: -1 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGROverBarrier: 0 + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 35 + SolutionNameMin: 
Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT128x96x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT2_6_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM16_WGMXCC2_WGMXCCGn1 + SourceSwap: false + SpaceFillingAlgo: [] + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 4 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SubGroupMXSA: 16 + SubGroupMXSB: 16 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: 0 + TailloopInNll: false + ThreadTile: [1, 1] + ThreadTile0: 8 + ThreadTile1: 6 + ThreadTileA: 8 + ThreadTileB: 6 + ThreadTileMXSA: 8 + ThreadTileMXSB: 6 + TransposeLDS: 1 + TransposeLDSMetadata: -1 + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMXSA: true + UnrollMajorLDSMXSB: true + Use64bShadowLimit: false + Use64bShadowLimitMX: false + UseCustomMainLoopSchedule: 0 + UseDirect32XEmulation: false + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: false + UseGeneralizedNLCOneA: false + UseGeneralizedNLCOneB: false + UseGeneralizedNLCOneMXSA: false + UseGeneralizedNLCOneMXSB: false + UseGeneralizedNLCOneMetadata: false + UseInstOffsetForGRO: 0 + UseMFMAF32XEmulation: false + UsePLRPack: 0 + UseSgprForGRO: -1 + UseSubtileImpl: true + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 1 + 
VectorWidthMXSA: 1 + VectorWidthMXSB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMXSA: 0 + WaveSeparateGlobalReadMXSB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [64, 4, 1] + WorkGroupMapping: 16 + WorkGroupMappingXCC: 2 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 256 + _DepthUA: 256 + _DepthUB: 256 + _DepthUMXSA: 8 + _DepthUMXSB: 8 + _DepthUMetadata: 256 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + enableLDSTrMXSA: false + enableLDSTrMXSB: false + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + tailLoopOptMXSA: false + tailLoopOptMXSB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AdaptiveGemm: 0 + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1DivByMT1LowbitGT1: 0 + AssertFree1ElementMultiple: 1 + AssertKRingShiftTailWrapOnly: 0 + AssertSummationElementMultiple: 256 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BAddrInterleave: false + BaseName: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT25ccxCqQ2dvGclrJYn0QqExpGJSq56GiHUJPL0jzB0qVw= + BufferLoad: true + BufferStore: 1 + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 256 + DirectToLds: 1 + DirectToLdsA: true + DirectToLdsB: true + DirectToLdsMXSA: true + DirectToLdsMXSB: true + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprMXSA: false + DirectToVgprMXSB: false + DirectToVgprSparseMetadata: false + DtlPlusLdsBuf: 0 
+ EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: false + ExpertSchedulingMode: 0 + ExtraLatencyForLR: 0 + ExtraMiLatencyLeft: -1 + ForceDisableShadowInit: false + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 32 + GlobalReadVectorWidthB: 32 + GlobalReadVectorWidthMXSA: 4 + GlobalReadVectorWidthMXSB: 1 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + KRingShift: false + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT256x32x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT4_2_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1 + LDSTrInst: false + LSCA: 256 + LSCB: 256 + LSCMXSA: 8 + LSCMXSB: 8 + LSPA: 32 + LSPB: 32 + LSPMXSA: 128 + LSPMXSB: 32 + LVCA: 8 + LVCB: 8 + LVCMXSA: 2 + LVCMXSB: 8 + LVPA: 1 + LVPB: 1 + LVPMXSA: 32 + LVPMXSB: 32 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsBlockSizePerPadMXSA: 0 + LdsBlockSizePerPadMXSB: 0 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 104704 + LdsInitCVgprs: false + LdsNumBytes: 104704 + LdsNumElementsAlignedA: 32768 + 
LdsNumElementsAlignedB: 4096 + LdsNumElementsAlignedMXSA: 2048 + LdsNumElementsAlignedMXSB: 256 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 35072 + LdsOffsetB_Blk: 100608 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMXSA: 32768 + LdsOffsetMXSA_Blk: 98304 + LdsOffsetMXSB: 34816 + LdsOffsetMXSB_Blk: 100352 + LdsOffsetMetadata: 35072 + LdsOffsetMetadata_Blk: 100608 + LdsPadA: 0 + LdsPadB: 0 + LdsPadMXSA: 0 + LdsPadMXSB: 0 + LdsPadMetadata: 0 + LocalReadVectorWidth: -1 + LocalReadVectorWidthA: 32 + LocalReadVectorWidthB: 32 + LocalReadVectorWidthMXSA: 1 + LocalReadVectorWidthMXSB: 1 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LocalWriteUseSgprMXSA: true + LocalWriteUseSgprMXSB: true + LoopIters: 2 + LoopUnroll: 256 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMXSA: 1 + MIInputPerThreadMXSB: 1 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [4, 1] + MIWaveTile: [4, 2] + MIWaveTileA: 4 + MIWaveTileB: 2 + MIWaveTileMXSA: 4 + MIWaveTileMXSB: 2 + MIWaveTileMetadata: 0 + MacroTile0: 256 + MacroTile1: 32 + MacroTileA: 256 + MacroTileB: 32 + MacroTileMXSA: 256 + MacroTileMXSB: 32 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + MinGRIncPerMfma: 3 + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonDTLTailLoopMXSA: true + NonDTLTailLoopMXSB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 4 + NonTemporalE: 0 + NonTemporalMXSA: 0 + 
NonTemporalMXSB: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 32 + NumLdsBlk: 2 + NumLoadsA: 8 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsCoalescedMXSA: 1 + NumLoadsCoalescedMXSB: 1 + NumLoadsMXSA: 2 + NumLoadsMXSB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularMXSA: 2 + NumLoadsPerpendicularMXSB: 1 + NumThreads: 256 + NumTotalPackedLoadsA: -1 + NumTotalPackedLoadsB: -1 + NumTotalPackedLoadsMXSA: -1 + NumTotalPackedLoadsMXSB: -1 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGROverBarrier: 0 + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 36 + SolutionNameMin: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT256x32x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT4_2_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM16_WGMXCC2_WGMXCCGn1 + SourceSwap: false + SpaceFillingAlgo: [] + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 4 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SubGroupMXSA: 16 + SubGroupMXSB: 16 
+ SuppressNoLoadLoop: false + SwapGlobalReadOrder: 0 + TailloopInNll: false + ThreadTile: [1, 1] + ThreadTile0: 16 + ThreadTile1: 2 + ThreadTileA: 16 + ThreadTileB: 2 + ThreadTileMXSA: 16 + ThreadTileMXSB: 2 + TransposeLDS: 1 + TransposeLDSMetadata: -1 + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMXSA: true + UnrollMajorLDSMXSB: true + Use64bShadowLimit: false + Use64bShadowLimitMX: false + UseCustomMainLoopSchedule: 0 + UseDirect32XEmulation: false + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: false + UseGeneralizedNLCOneA: false + UseGeneralizedNLCOneB: false + UseGeneralizedNLCOneMXSA: false + UseGeneralizedNLCOneMXSB: false + UseGeneralizedNLCOneMetadata: false + UseInstOffsetForGRO: 0 + UseMFMAF32XEmulation: false + UsePLRPack: 0 + UseSgprForGRO: -1 + UseSubtileImpl: true + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 1 + VectorWidthMXSA: 1 + VectorWidthMXSB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMXSA: 0 + WaveSeparateGlobalReadMXSB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [64, 4, 1] + WorkGroupMapping: 16 + WorkGroupMappingXCC: 2 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 256 + _DepthUA: 256 + _DepthUB: 256 + _DepthUMXSA: 8 + _DepthUMXSB: 8 + _DepthUMetadata: 256 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + enableLDSTrMXSA: false + enableLDSTrMXSB: false + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + tailLoopOptMXSA: false + tailLoopOptMXSB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: 
false + ActivationFused: true + AdaptiveGemm: 0 + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1DivByMT1LowbitGT1: 0 + AssertFree1ElementMultiple: 1 + AssertKRingShiftTailWrapOnly: 0 + AssertSummationElementMultiple: 256 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BAddrInterleave: false + BaseName: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT25LD47Rd1FCRkyTzzAeZkhMOPqncN77I3VtqjftUOo3W0= + BufferLoad: true + BufferStore: 1 + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 256 + DirectToLds: 1 + DirectToLdsA: true + DirectToLdsB: true + DirectToLdsMXSA: true + DirectToLdsMXSB: true + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprMXSA: false + DirectToVgprMXSB: false + DirectToVgprSparseMetadata: false + DtlPlusLdsBuf: 0 + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: false + ExpertSchedulingMode: 0 + ExtraLatencyForLR: 0 + ExtraMiLatencyLeft: -1 + ForceDisableShadowInit: false + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 32 + GlobalReadVectorWidthB: 32 + GlobalReadVectorWidthMXSA: 4 + GlobalReadVectorWidthMXSB: 3 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + KRingShift: false + Kernel: true + KernelLanguage: Assembly + KernelNameMin: 
Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT256x96x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT4_6_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1 + LDSTrInst: false + LSCA: 256 + LSCB: 256 + LSCMXSA: 8 + LSCMXSB: 8 + LSPA: 32 + LSPB: 32 + LSPMXSA: 128 + LSPMXSB: 96 + LVCA: 8 + LVCB: 8 + LVCMXSA: 2 + LVCMXSB: 3 + LVPA: 1 + LVPB: 1 + LVPMXSA: 32 + LVPMXSB: 32 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsBlockSizePerPadMXSA: 0 + LdsBlockSizePerPadMXSB: 0 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 113408 + LdsInitCVgprs: false + LdsNumBytes: 113408 + LdsNumElementsAlignedA: 32768 + LdsNumElementsAlignedB: 12288 + LdsNumElementsAlignedMXSA: 2048 + LdsNumElementsAlignedMXSB: 768 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 35584 + LdsOffsetB_Blk: 101120 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMXSA: 32768 + LdsOffsetMXSA_Blk: 98304 + LdsOffsetMXSB: 34816 + LdsOffsetMXSB_Blk: 100352 + LdsOffsetMetadata: 35584 + LdsOffsetMetadata_Blk: 101120 + LdsPadA: 0 + LdsPadB: 0 + LdsPadMXSA: 0 + LdsPadMXSB: 0 + LdsPadMetadata: 0 + LocalReadVectorWidth: -1 + LocalReadVectorWidthA: 32 + LocalReadVectorWidthB: 32 + LocalReadVectorWidthMXSA: 1 + LocalReadVectorWidthMXSB: 1 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LocalWriteUseSgprMXSA: true + LocalWriteUseSgprMXSB: true + LoopIters: 2 + LoopUnroll: 256 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + 
MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMXSA: 1 + MIInputPerThreadMXSB: 1 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [4, 1] + MIWaveTile: [4, 6] + MIWaveTileA: 4 + MIWaveTileB: 6 + MIWaveTileMXSA: 4 + MIWaveTileMXSB: 6 + MIWaveTileMetadata: 0 + MacroTile0: 256 + MacroTile1: 96 + MacroTileA: 256 + MacroTileB: 96 + MacroTileMXSA: 256 + MacroTileMXSB: 96 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + MinGRIncPerMfma: 3 + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonDTLTailLoopMXSA: true + NonDTLTailLoopMXSB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 4 + NonTemporalE: 0 + NonTemporalMXSA: 0 + NonTemporalMXSB: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 96 + NumGlobalWriteVectorsPerThread: 96 + NumLdsBlk: 2 + NumLoadsA: 8 + NumLoadsB: 3 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsCoalescedMXSA: 1 + NumLoadsCoalescedMXSB: 1 + NumLoadsMXSA: 2 + NumLoadsMXSB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 3 + NumLoadsPerpendicularMXSA: 2 + NumLoadsPerpendicularMXSB: 1 + NumThreads: 256 + NumTotalPackedLoadsA: -1 + NumTotalPackedLoadsB: -1 + NumTotalPackedLoadsMXSA: -1 + NumTotalPackedLoadsMXSB: -1 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGROverBarrier: 0 + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 37 + SolutionNameMin: 
Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT256x96x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT4_6_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM16_WGMXCC2_WGMXCCGn1 + SourceSwap: false + SpaceFillingAlgo: [] + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 4 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SubGroupMXSA: 16 + SubGroupMXSB: 16 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: 0 + TailloopInNll: false + ThreadTile: [1, 1] + ThreadTile0: 16 + ThreadTile1: 6 + ThreadTileA: 16 + ThreadTileB: 6 + ThreadTileMXSA: 16 + ThreadTileMXSB: 6 + TransposeLDS: 1 + TransposeLDSMetadata: -1 + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMXSA: true + UnrollMajorLDSMXSB: true + Use64bShadowLimit: false + Use64bShadowLimitMX: false + UseCustomMainLoopSchedule: 0 + UseDirect32XEmulation: false + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: false + UseGeneralizedNLCOneA: false + UseGeneralizedNLCOneB: false + UseGeneralizedNLCOneMXSA: false + UseGeneralizedNLCOneMXSB: false + UseGeneralizedNLCOneMetadata: false + UseInstOffsetForGRO: 0 + UseMFMAF32XEmulation: false + UsePLRPack: 0 + UseSgprForGRO: -1 + UseSubtileImpl: true + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 1 
+ VectorWidthMXSA: 1 + VectorWidthMXSB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMXSA: 0 + WaveSeparateGlobalReadMXSB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [64, 4, 1] + WorkGroupMapping: 16 + WorkGroupMappingXCC: 2 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 256 + _DepthUA: 256 + _DepthUB: 256 + _DepthUMXSA: 8 + _DepthUMXSB: 8 + _DepthUMetadata: 256 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + enableLDSTrMXSA: false + enableLDSTrMXSB: false + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + tailLoopOptMXSA: false + tailLoopOptMXSB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AdaptiveGemm: 0 + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1DivByMT1LowbitGT1: 0 + AssertFree1ElementMultiple: 1 + AssertKRingShiftTailWrapOnly: 0 + AssertSummationElementMultiple: 256 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BAddrInterleave: false + BaseName: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT252kruXsNK48kysY3byfZ8nkfbVQgYyySEfltsnGOOr7I= + BufferLoad: true + BufferStore: 1 + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 256 + DirectToLds: 1 + DirectToLdsA: true + DirectToLdsB: true + DirectToLdsMXSA: true + DirectToLdsMXSB: true + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprMXSA: false + DirectToVgprMXSB: false + DirectToVgprSparseMetadata: false + DtlPlusLdsBuf: 
0 + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: false + ExpertSchedulingMode: 0 + ExtraLatencyForLR: 0 + ExtraMiLatencyLeft: -1 + ForceDisableShadowInit: false + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 32 + GlobalReadVectorWidthB: 32 + GlobalReadVectorWidthMXSA: 4 + GlobalReadVectorWidthMXSB: 5 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + KRingShift: false + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT256x160x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT4_10_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1 + LDSTrInst: false + LSCA: 256 + LSCB: 256 + LSCMXSA: 8 + LSCMXSB: 8 + LSPA: 32 + LSPB: 32 + LSPMXSA: 128 + LSPMXSB: 160 + LVCA: 8 + LVCB: 8 + LVCMXSA: 2 + LVCMXSB: 2 + LVPA: 1 + LVPB: 1 + LVPMXSA: 32 + LVPMXSB: 32 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsBlockSizePerPadMXSA: 0 + LdsBlockSizePerPadMXSB: 0 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 122112 + LdsInitCVgprs: false + LdsNumBytes: 122112 + LdsNumElementsAlignedA: 
32768 + LdsNumElementsAlignedB: 20480 + LdsNumElementsAlignedMXSA: 2048 + LdsNumElementsAlignedMXSB: 1280 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 36096 + LdsOffsetB_Blk: 101632 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMXSA: 32768 + LdsOffsetMXSA_Blk: 98304 + LdsOffsetMXSB: 34816 + LdsOffsetMXSB_Blk: 100352 + LdsOffsetMetadata: 36096 + LdsOffsetMetadata_Blk: 101632 + LdsPadA: 0 + LdsPadB: 0 + LdsPadMXSA: 0 + LdsPadMXSB: 0 + LdsPadMetadata: 0 + LocalReadVectorWidth: -1 + LocalReadVectorWidthA: 32 + LocalReadVectorWidthB: 32 + LocalReadVectorWidthMXSA: 1 + LocalReadVectorWidthMXSB: 1 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LocalWriteUseSgprMXSA: true + LocalWriteUseSgprMXSB: true + LoopIters: 2 + LoopUnroll: 256 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMXSA: 1 + MIInputPerThreadMXSB: 1 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [4, 1] + MIWaveTile: [4, 10] + MIWaveTileA: 4 + MIWaveTileB: 10 + MIWaveTileMXSA: 4 + MIWaveTileMXSB: 10 + MIWaveTileMetadata: 0 + MacroTile0: 256 + MacroTile1: 160 + MacroTileA: 256 + MacroTileB: 160 + MacroTileMXSA: 256 + MacroTileMXSB: 160 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + MinGRIncPerMfma: 3 + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonDTLTailLoopMXSA: true + NonDTLTailLoopMXSB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 4 + NonTemporalE: 0 + 
NonTemporalMXSA: 0 + NonTemporalMXSB: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 160 + NumGlobalWriteVectorsPerThread: 160 + NumLdsBlk: 2 + NumLoadsA: 8 + NumLoadsB: 5 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsCoalescedMXSA: 1 + NumLoadsCoalescedMXSB: 1 + NumLoadsMXSA: 2 + NumLoadsMXSB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 5 + NumLoadsPerpendicularMXSA: 2 + NumLoadsPerpendicularMXSB: 1 + NumThreads: 256 + NumTotalPackedLoadsA: -1 + NumTotalPackedLoadsB: -1 + NumTotalPackedLoadsMXSA: -1 + NumTotalPackedLoadsMXSB: -1 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGROverBarrier: 0 + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 38 + SolutionNameMin: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT256x160x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT4_10_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM16_WGMXCC2_WGMXCCGn1 + SourceSwap: false + SpaceFillingAlgo: [] + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 4 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + 
SubGroupMXSA: 16 + SubGroupMXSB: 16 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: 0 + TailloopInNll: false + ThreadTile: [1, 1] + ThreadTile0: 16 + ThreadTile1: 10 + ThreadTileA: 16 + ThreadTileB: 10 + ThreadTileMXSA: 16 + ThreadTileMXSB: 10 + TransposeLDS: 1 + TransposeLDSMetadata: -1 + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMXSA: true + UnrollMajorLDSMXSB: true + Use64bShadowLimit: false + Use64bShadowLimitMX: false + UseCustomMainLoopSchedule: 0 + UseDirect32XEmulation: false + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: false + UseGeneralizedNLCOneA: false + UseGeneralizedNLCOneB: false + UseGeneralizedNLCOneMXSA: false + UseGeneralizedNLCOneMXSB: false + UseGeneralizedNLCOneMetadata: false + UseInstOffsetForGRO: 0 + UseMFMAF32XEmulation: false + UsePLRPack: 0 + UseSgprForGRO: -1 + UseSubtileImpl: true + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 1 + VectorWidthMXSA: 1 + VectorWidthMXSB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMXSA: 0 + WaveSeparateGlobalReadMXSB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [64, 4, 1] + WorkGroupMapping: 16 + WorkGroupMappingXCC: 2 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 256 + _DepthUA: 256 + _DepthUB: 256 + _DepthUMXSA: 8 + _DepthUMXSB: 8 + _DepthUMetadata: 256 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + enableLDSTrMXSA: false + enableLDSTrMXSB: false + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + tailLoopOptMXSA: false + tailLoopOptMXSB: false + - 1LDSBuffer: 0 + 
ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AdaptiveGemm: 0 + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1DivByMT1LowbitGT1: 0 + AssertFree1ElementMultiple: 1 + AssertKRingShiftTailWrapOnly: 0 + AssertSummationElementMultiple: 256 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BAddrInterleave: false + BaseName: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT25K1mXD-Z64cKGl1n64cEhRIvEdarIIYY6qzM6iXFNutU= + BufferLoad: true + BufferStore: 1 + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 256 + DirectToLds: 1 + DirectToLdsA: true + DirectToLdsB: true + DirectToLdsMXSA: true + DirectToLdsMXSB: true + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprMXSA: false + DirectToVgprMXSB: false + DirectToVgprSparseMetadata: false + DtlPlusLdsBuf: 0 + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: false + ExpertSchedulingMode: 0 + ExtraLatencyForLR: 0 + ExtraMiLatencyLeft: -1 + ForceDisableShadowInit: false + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 32 + GlobalReadVectorWidthB: 32 + GlobalReadVectorWidthMXSA: 4 + GlobalReadVectorWidthMXSB: 9 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + KRingShift: false + Kernel: true + KernelLanguage: Assembly + KernelNameMin: 
Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT256x288x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT4_18_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1 + LDSTrInst: false + LSCA: 256 + LSCB: 256 + LSCMXSA: 8 + LSCMXSB: 8 + LSPA: 32 + LSPB: 32 + LSPMXSA: 128 + LSPMXSB: 288 + LVCA: 8 + LVCB: 8 + LVCMXSA: 2 + LVCMXSB: 1 + LVPA: 1 + LVPB: 1 + LVPMXSA: 32 + LVPMXSB: 32 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsBlockSizePerPadMXSA: 0 + LdsBlockSizePerPadMXSB: 0 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 147968 + LdsInitCVgprs: false + LdsNumBytes: 147968 + LdsNumElementsAlignedA: 32768 + LdsNumElementsAlignedB: 36864 + LdsNumElementsAlignedMXSA: 2048 + LdsNumElementsAlignedMXSB: 2304 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 73984 + LdsOffsetB: 37120 + LdsOffsetB_Blk: 111104 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMXSA: 32768 + LdsOffsetMXSA_Blk: 106752 + LdsOffsetMXSB: 34816 + LdsOffsetMXSB_Blk: 108800 + LdsOffsetMetadata: 37120 + LdsOffsetMetadata_Blk: 111104 + LdsPadA: 0 + LdsPadB: 0 + LdsPadMXSA: 0 + LdsPadMXSB: 0 + LdsPadMetadata: 0 + LocalReadVectorWidth: -1 + LocalReadVectorWidthA: 32 + LocalReadVectorWidthB: 32 + LocalReadVectorWidthMXSA: 1 + LocalReadVectorWidthMXSB: 1 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LocalWriteUseSgprMXSA: true + LocalWriteUseSgprMXSB: true + LoopIters: 2 + LoopUnroll: 256 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + 
MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMXSA: 1 + MIInputPerThreadMXSB: 1 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [4, 1] + MIWaveTile: [4, 18] + MIWaveTileA: 4 + MIWaveTileB: 18 + MIWaveTileMXSA: 4 + MIWaveTileMXSB: 18 + MIWaveTileMetadata: 0 + MacroTile0: 256 + MacroTile1: 288 + MacroTileA: 256 + MacroTileB: 288 + MacroTileMXSA: 256 + MacroTileMXSB: 288 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + MinGRIncPerMfma: 3 + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonDTLTailLoopMXSA: true + NonDTLTailLoopMXSB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 4 + NonTemporalE: 0 + NonTemporalMXSA: 0 + NonTemporalMXSB: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 288 + NumGlobalWriteVectorsPerThread: 288 + NumLdsBlk: 2 + NumLoadsA: 8 + NumLoadsB: 9 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsCoalescedMXSA: 1 + NumLoadsCoalescedMXSB: 1 + NumLoadsMXSA: 2 + NumLoadsMXSB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 9 + NumLoadsPerpendicularMXSA: 2 + NumLoadsPerpendicularMXSB: 1 + NumThreads: 256 + NumTotalPackedLoadsA: -1 + NumTotalPackedLoadsB: -1 + NumTotalPackedLoadsMXSA: -1 + NumTotalPackedLoadsMXSB: -1 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGROverBarrier: 0 + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 39 + 
SolutionNameMin: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT256x288x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT4_18_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM16_WGMXCC2_WGMXCCGn1 + SourceSwap: false + SpaceFillingAlgo: [] + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: true + StoreSyncOpt: 0 + StoreVectorWidth: 4 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SubGroupMXSA: 16 + SubGroupMXSB: 16 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: 0 + TailloopInNll: false + ThreadTile: [1, 1] + ThreadTile0: 16 + ThreadTile1: 18 + ThreadTileA: 16 + ThreadTileB: 18 + ThreadTileMXSA: 16 + ThreadTileMXSB: 18 + TransposeLDS: 1 + TransposeLDSMetadata: -1 + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMXSA: true + UnrollMajorLDSMXSB: true + Use64bShadowLimit: false + Use64bShadowLimitMX: false + UseCustomMainLoopSchedule: 0 + UseDirect32XEmulation: false + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: false + UseGeneralizedNLCOneA: false + UseGeneralizedNLCOneB: false + UseGeneralizedNLCOneMXSA: false + UseGeneralizedNLCOneMXSB: false + UseGeneralizedNLCOneMetadata: false + UseInstOffsetForGRO: 0 + UseMFMAF32XEmulation: false + UsePLRPack: 0 + UseSgprForGRO: -1 + UseSubtileImpl: true + Valid: true + VectorStore: -1 + 
VectorWidthA: 1 + VectorWidthB: 1 + VectorWidthMXSA: 1 + VectorWidthMXSB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMXSA: 0 + WaveSeparateGlobalReadMXSB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [64, 4, 1] + WorkGroupMapping: 16 + WorkGroupMappingXCC: 2 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 256 + _DepthUA: 256 + _DepthUB: 256 + _DepthUMXSA: 8 + _DepthUMXSB: 8 + _DepthUMetadata: 256 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + enableLDSTrMXSA: false + enableLDSTrMXSB: false + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + tailLoopOptMXSA: false + tailLoopOptMXSB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AdaptiveGemm: 0 + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1DivByMT1LowbitGT1: 0 + AssertFree1ElementMultiple: 1 + AssertKRingShiftTailWrapOnly: 0 + AssertSummationElementMultiple: 256 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BAddrInterleave: false + BaseName: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT382wXw8HnHr0540PBKyCMkiG1GnHC8Qq6QTA2UY90u0Us= + BufferLoad: true + BufferStore: 1 + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 256 + DirectToLds: 1 + DirectToLdsA: true + DirectToLdsB: true + DirectToLdsMXSA: true + DirectToLdsMXSB: true + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprMXSA: false + DirectToVgprMXSB: false + 
DirectToVgprSparseMetadata: false + DtlPlusLdsBuf: 0 + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: false + ExpertSchedulingMode: 0 + ExtraLatencyForLR: 0 + ExtraMiLatencyLeft: -1 + ForceDisableShadowInit: false + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 32 + GlobalReadVectorWidthB: 32 + GlobalReadVectorWidthMXSA: 12 + GlobalReadVectorWidthMXSB: 1 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + KRingShift: false + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT384x32x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT6_2_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1 + LDSTrInst: false + LSCA: 256 + LSCB: 256 + LSCMXSA: 8 + LSCMXSB: 8 + LSPA: 32 + LSPB: 32 + LSPMXSA: 384 + LSPMXSB: 32 + LVCA: 8 + LVCB: 8 + LVCMXSA: 1 + LVCMXSB: 8 + LVPA: 1 + LVPB: 1 + LVPMXSA: 32 + LVPMXSB: 32 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsBlockSizePerPadMXSA: 0 + LdsBlockSizePerPadMXSB: 0 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 122112 + LdsInitCVgprs: false + 
LdsNumBytes: 122112 + LdsNumElementsAlignedA: 49152 + LdsNumElementsAlignedB: 4096 + LdsNumElementsAlignedMXSA: 3072 + LdsNumElementsAlignedMXSB: 256 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 52480 + LdsOffsetB_Blk: 118016 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMXSA: 49152 + LdsOffsetMXSA_Blk: 114688 + LdsOffsetMXSB: 52224 + LdsOffsetMXSB_Blk: 117760 + LdsOffsetMetadata: 52480 + LdsOffsetMetadata_Blk: 118016 + LdsPadA: 0 + LdsPadB: 0 + LdsPadMXSA: 0 + LdsPadMXSB: 0 + LdsPadMetadata: 0 + LocalReadVectorWidth: -1 + LocalReadVectorWidthA: 32 + LocalReadVectorWidthB: 32 + LocalReadVectorWidthMXSA: 1 + LocalReadVectorWidthMXSB: 1 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LocalWriteUseSgprMXSA: true + LocalWriteUseSgprMXSB: true + LoopIters: 2 + LoopUnroll: 256 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMXSA: 1 + MIInputPerThreadMXSB: 1 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [4, 1] + MIWaveTile: [6, 2] + MIWaveTileA: 6 + MIWaveTileB: 2 + MIWaveTileMXSA: 6 + MIWaveTileMXSB: 2 + MIWaveTileMetadata: 0 + MacroTile0: 384 + MacroTile1: 32 + MacroTileA: 384 + MacroTileB: 32 + MacroTileMXSA: 384 + MacroTileMXSB: 32 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + MinGRIncPerMfma: 3 + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonDTLTailLoopMXSA: true + NonDTLTailLoopMXSB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + 
NonTemporalD: 4 + NonTemporalE: 0 + NonTemporalMXSA: 0 + NonTemporalMXSB: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 48 + NumGlobalWriteVectorsPerThread: 48 + NumLdsBlk: 2 + NumLoadsA: 12 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsCoalescedMXSA: 1 + NumLoadsCoalescedMXSB: 1 + NumLoadsMXSA: 1 + NumLoadsMXSB: 1 + NumLoadsPerpendicularA: 12 + NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularMXSA: 1 + NumLoadsPerpendicularMXSB: 1 + NumThreads: 256 + NumTotalPackedLoadsA: -1 + NumTotalPackedLoadsB: -1 + NumTotalPackedLoadsMXSA: -1 + NumTotalPackedLoadsMXSB: -1 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGROverBarrier: 0 + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 40 + SolutionNameMin: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT384x32x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT6_2_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM16_WGMXCC2_WGMXCCGn1 + SourceSwap: false + SpaceFillingAlgo: [] + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 4 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 16 + SubGroup1: 16 + 
SubGroupA: 16 + SubGroupB: 16 + SubGroupMXSA: 16 + SubGroupMXSB: 16 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: 0 + TailloopInNll: false + ThreadTile: [1, 1] + ThreadTile0: 24 + ThreadTile1: 2 + ThreadTileA: 24 + ThreadTileB: 2 + ThreadTileMXSA: 24 + ThreadTileMXSB: 2 + TransposeLDS: 1 + TransposeLDSMetadata: -1 + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMXSA: true + UnrollMajorLDSMXSB: true + Use64bShadowLimit: false + Use64bShadowLimitMX: false + UseCustomMainLoopSchedule: 0 + UseDirect32XEmulation: false + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: false + UseGeneralizedNLCOneA: false + UseGeneralizedNLCOneB: false + UseGeneralizedNLCOneMXSA: false + UseGeneralizedNLCOneMXSB: false + UseGeneralizedNLCOneMetadata: false + UseInstOffsetForGRO: 0 + UseMFMAF32XEmulation: false + UsePLRPack: 0 + UseSgprForGRO: -1 + UseSubtileImpl: true + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 1 + VectorWidthMXSA: 1 + VectorWidthMXSB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMXSA: 0 + WaveSeparateGlobalReadMXSB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [64, 4, 1] + WorkGroupMapping: 16 + WorkGroupMappingXCC: 2 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 256 + _DepthUA: 256 + _DepthUB: 256 + _DepthUMXSA: 8 + _DepthUMXSB: 8 + _DepthUMetadata: 256 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + enableLDSTrMXSA: false + enableLDSTrMXSB: false + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + tailLoopOptMXSA: false + tailLoopOptMXSB: false 
+ - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AdaptiveGemm: 0 + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1DivByMT1LowbitGT1: 0 + AssertFree1ElementMultiple: 1 + AssertKRingShiftTailWrapOnly: 0 + AssertSummationElementMultiple: 256 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BAddrInterleave: false + BaseName: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT38iWtaR8v4gfASTnshP-7Zt37fJ-3bIKngSBlzEMd0qgE= + BufferLoad: true + BufferStore: 1 + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 256 + DirectToLds: 1 + DirectToLdsA: true + DirectToLdsB: true + DirectToLdsMXSA: true + DirectToLdsMXSB: true + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprMXSA: false + DirectToVgprMXSB: false + DirectToVgprSparseMetadata: false + DtlPlusLdsBuf: 0 + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: false + ExpertSchedulingMode: 0 + ExtraLatencyForLR: 0 + ExtraMiLatencyLeft: -1 + ForceDisableShadowInit: false + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 32 + GlobalReadVectorWidthB: 32 + GlobalReadVectorWidthMXSA: 12 + GlobalReadVectorWidthMXSB: 3 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + KRingShift: false + Kernel: true + KernelLanguage: Assembly + KernelNameMin: 
Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT384x96x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT6_6_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1 + LDSTrInst: false + LSCA: 256 + LSCB: 256 + LSCMXSA: 8 + LSCMXSB: 8 + LSPA: 32 + LSPB: 32 + LSPMXSA: 384 + LSPMXSB: 96 + LVCA: 8 + LVCB: 8 + LVCMXSA: 1 + LVCMXSB: 3 + LVPA: 1 + LVPB: 1 + LVPMXSA: 32 + LVPMXSB: 32 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsBlockSizePerPadMXSA: 0 + LdsBlockSizePerPadMXSB: 0 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 130816 + LdsInitCVgprs: false + LdsNumBytes: 130816 + LdsNumElementsAlignedA: 49152 + LdsNumElementsAlignedB: 12288 + LdsNumElementsAlignedMXSA: 3072 + LdsNumElementsAlignedMXSB: 768 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 52992 + LdsOffsetB_Blk: 118528 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMXSA: 49152 + LdsOffsetMXSA_Blk: 114688 + LdsOffsetMXSB: 52224 + LdsOffsetMXSB_Blk: 117760 + LdsOffsetMetadata: 52992 + LdsOffsetMetadata_Blk: 118528 + LdsPadA: 0 + LdsPadB: 0 + LdsPadMXSA: 0 + LdsPadMXSB: 0 + LdsPadMetadata: 0 + LocalReadVectorWidth: -1 + LocalReadVectorWidthA: 32 + LocalReadVectorWidthB: 32 + LocalReadVectorWidthMXSA: 1 + LocalReadVectorWidthMXSB: 1 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LocalWriteUseSgprMXSA: true + LocalWriteUseSgprMXSB: true + LoopIters: 2 + LoopUnroll: 256 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + 
MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMXSA: 1 + MIInputPerThreadMXSB: 1 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [4, 1] + MIWaveTile: [6, 6] + MIWaveTileA: 6 + MIWaveTileB: 6 + MIWaveTileMXSA: 6 + MIWaveTileMXSB: 6 + MIWaveTileMetadata: 0 + MacroTile0: 384 + MacroTile1: 96 + MacroTileA: 384 + MacroTileB: 96 + MacroTileMXSA: 384 + MacroTileMXSB: 96 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + MinGRIncPerMfma: 3 + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonDTLTailLoopMXSA: true + NonDTLTailLoopMXSB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 4 + NonTemporalE: 0 + NonTemporalMXSA: 0 + NonTemporalMXSB: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 144 + NumGlobalWriteVectorsPerThread: 144 + NumLdsBlk: 2 + NumLoadsA: 12 + NumLoadsB: 3 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsCoalescedMXSA: 1 + NumLoadsCoalescedMXSB: 1 + NumLoadsMXSA: 1 + NumLoadsMXSB: 1 + NumLoadsPerpendicularA: 12 + NumLoadsPerpendicularB: 3 + NumLoadsPerpendicularMXSA: 1 + NumLoadsPerpendicularMXSB: 1 + NumThreads: 256 + NumTotalPackedLoadsA: -1 + NumTotalPackedLoadsB: -1 + NumTotalPackedLoadsMXSA: -1 + NumTotalPackedLoadsMXSB: -1 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGROverBarrier: 0 + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 41 + 
SolutionNameMin: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT384x96x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT6_6_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM16_WGMXCC2_WGMXCCGn1 + SourceSwap: false + SpaceFillingAlgo: [] + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 4 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SubGroupMXSA: 16 + SubGroupMXSB: 16 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: 0 + TailloopInNll: false + ThreadTile: [1, 1] + ThreadTile0: 24 + ThreadTile1: 6 + ThreadTileA: 24 + ThreadTileB: 6 + ThreadTileMXSA: 24 + ThreadTileMXSB: 6 + TransposeLDS: 1 + TransposeLDSMetadata: -1 + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMXSA: true + UnrollMajorLDSMXSB: true + Use64bShadowLimit: false + Use64bShadowLimitMX: false + UseCustomMainLoopSchedule: 0 + UseDirect32XEmulation: false + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: false + UseGeneralizedNLCOneA: false + UseGeneralizedNLCOneB: false + UseGeneralizedNLCOneMXSA: false + UseGeneralizedNLCOneMXSB: false + UseGeneralizedNLCOneMetadata: false + UseInstOffsetForGRO: 0 + UseMFMAF32XEmulation: false + UsePLRPack: 0 + UseSgprForGRO: -1 + UseSubtileImpl: true + Valid: true + VectorStore: -1 + VectorWidthA: 1 
+ VectorWidthB: 1 + VectorWidthMXSA: 1 + VectorWidthMXSB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMXSA: 0 + WaveSeparateGlobalReadMXSB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [64, 4, 1] + WorkGroupMapping: 16 + WorkGroupMappingXCC: 2 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 256 + _DepthUA: 256 + _DepthUB: 256 + _DepthUMXSA: 8 + _DepthUMXSB: 8 + _DepthUMetadata: 256 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + enableLDSTrMXSA: false + enableLDSTrMXSB: false + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + tailLoopOptMXSA: false + tailLoopOptMXSB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AdaptiveGemm: 0 + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1DivByMT1LowbitGT1: 0 + AssertFree1ElementMultiple: 1 + AssertKRingShiftTailWrapOnly: 0 + AssertSummationElementMultiple: 256 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BAddrInterleave: false + BaseName: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT51My4B9dUWEKuVinUMAqyTn-9nVKbOgw4LZNuajpvbgno= + BufferLoad: true + BufferStore: 1 + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 256 + DirectToLds: 1 + DirectToLdsA: true + DirectToLdsB: true + DirectToLdsMXSA: true + DirectToLdsMXSB: true + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprMXSA: false + DirectToVgprMXSB: false + DirectToVgprSparseMetadata: false 
+ DtlPlusLdsBuf: 0 + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: false + ExpertSchedulingMode: 0 + ExtraLatencyForLR: 0 + ExtraMiLatencyLeft: -1 + ForceDisableShadowInit: false + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 32 + GlobalReadVectorWidthB: 32 + GlobalReadVectorWidthMXSA: 16 + GlobalReadVectorWidthMXSB: 1 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + KRingShift: false + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT512x32x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT8_2_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1 + LDSTrInst: false + LSCA: 256 + LSCB: 256 + LSCMXSA: 8 + LSCMXSB: 8 + LSPA: 32 + LSPB: 32 + LSPMXSA: 512 + LSPMXSB: 32 + LVCA: 8 + LVCB: 8 + LVCMXSA: 1 + LVCMXSB: 8 + LVPA: 1 + LVPB: 1 + LVPMXSA: 32 + LVPMXSB: 32 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsBlockSizePerPadMXSA: 0 + LdsBlockSizePerPadMXSB: 0 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 147968 + LdsInitCVgprs: false + LdsNumBytes: 147968 + 
LdsNumElementsAlignedA: 65536 + LdsNumElementsAlignedB: 4096 + LdsNumElementsAlignedMXSA: 4096 + LdsNumElementsAlignedMXSB: 256 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 73984 + LdsOffsetB: 69888 + LdsOffsetB_Blk: 143872 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMXSA: 65536 + LdsOffsetMXSA_Blk: 139520 + LdsOffsetMXSB: 69632 + LdsOffsetMXSB_Blk: 143616 + LdsOffsetMetadata: 69888 + LdsOffsetMetadata_Blk: 143872 + LdsPadA: 0 + LdsPadB: 0 + LdsPadMXSA: 0 + LdsPadMXSB: 0 + LdsPadMetadata: 0 + LocalReadVectorWidth: -1 + LocalReadVectorWidthA: 32 + LocalReadVectorWidthB: 32 + LocalReadVectorWidthMXSA: 1 + LocalReadVectorWidthMXSB: 1 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LocalWriteUseSgprMXSA: true + LocalWriteUseSgprMXSB: true + LoopIters: 2 + LoopUnroll: 256 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMXSA: 1 + MIInputPerThreadMXSB: 1 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [4, 1] + MIWaveTile: [8, 2] + MIWaveTileA: 8 + MIWaveTileB: 2 + MIWaveTileMXSA: 8 + MIWaveTileMXSB: 2 + MIWaveTileMetadata: 0 + MacroTile0: 512 + MacroTile1: 32 + MacroTileA: 512 + MacroTileB: 32 + MacroTileMXSA: 512 + MacroTileMXSB: 32 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + MinGRIncPerMfma: 3 + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonDTLTailLoopMXSA: true + NonDTLTailLoopMXSB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 4 + NonTemporalE: 0 
+ NonTemporalMXSA: 0 + NonTemporalMXSB: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 64 + NumLdsBlk: 2 + NumLoadsA: 16 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsCoalescedMXSA: 1 + NumLoadsCoalescedMXSB: 1 + NumLoadsMXSA: 1 + NumLoadsMXSB: 1 + NumLoadsPerpendicularA: 16 + NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularMXSA: 1 + NumLoadsPerpendicularMXSB: 1 + NumThreads: 256 + NumTotalPackedLoadsA: -1 + NumTotalPackedLoadsB: -1 + NumTotalPackedLoadsMXSA: -1 + NumTotalPackedLoadsMXSB: -1 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGROverBarrier: 0 + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 42 + SolutionNameMin: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT512x32x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT8_2_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM16_WGMXCC2_WGMXCCGn1 + SourceSwap: false + SpaceFillingAlgo: [] + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: true + StoreSyncOpt: 0 + StoreVectorWidth: 4 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + 
SubGroupMXSA: 16 + SubGroupMXSB: 16 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: 0 + TailloopInNll: false + ThreadTile: [1, 1] + ThreadTile0: 32 + ThreadTile1: 2 + ThreadTileA: 32 + ThreadTileB: 2 + ThreadTileMXSA: 32 + ThreadTileMXSB: 2 + TransposeLDS: 1 + TransposeLDSMetadata: -1 + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMXSA: true + UnrollMajorLDSMXSB: true + Use64bShadowLimit: false + Use64bShadowLimitMX: false + UseCustomMainLoopSchedule: 0 + UseDirect32XEmulation: false + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: false + UseGeneralizedNLCOneA: false + UseGeneralizedNLCOneB: false + UseGeneralizedNLCOneMXSA: false + UseGeneralizedNLCOneMXSB: false + UseGeneralizedNLCOneMetadata: false + UseInstOffsetForGRO: 0 + UseMFMAF32XEmulation: false + UsePLRPack: 0 + UseSgprForGRO: -1 + UseSubtileImpl: true + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 1 + VectorWidthMXSA: 1 + VectorWidthMXSB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMXSA: 0 + WaveSeparateGlobalReadMXSB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [64, 4, 1] + WorkGroupMapping: 16 + WorkGroupMappingXCC: 2 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 256 + _DepthUA: 256 + _DepthUB: 256 + _DepthUMXSA: 8 + _DepthUMXSB: 8 + _DepthUMetadata: 256 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + enableLDSTrMXSA: false + enableLDSTrMXSB: false + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + tailLoopOptMXSA: false + tailLoopOptMXSB: false + - 1LDSBuffer: 0 + 
ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AdaptiveGemm: 0 + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1DivByMT1LowbitGT1: 0 + AssertFree1ElementMultiple: 1 + AssertKRingShiftTailWrapOnly: 0 + AssertSummationElementMultiple: 256 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BAddrInterleave: false + BaseName: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT32dv3nH_ug3zWfzXFJEJHJUvkpjXQjAQVml90QMZvwYvs= + BufferLoad: true + BufferStore: 1 + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 256 + DirectToLds: 1 + DirectToLdsA: true + DirectToLdsB: true + DirectToLdsMXSA: true + DirectToLdsMXSB: true + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprMXSA: false + DirectToVgprMXSB: false + DirectToVgprSparseMetadata: false + DtlPlusLdsBuf: 0 + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: false + ExpertSchedulingMode: 0 + ExtraLatencyForLR: 0 + ExtraMiLatencyLeft: -1 + ForceDisableShadowInit: false + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 32 + GlobalReadVectorWidthB: 32 + GlobalReadVectorWidthMXSA: 1 + GlobalReadVectorWidthMXSB: 4 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + KRingShift: false + Kernel: true + KernelLanguage: Assembly + KernelNameMin: 
Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT32x128x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT2_2_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_16_1 + LDSTrInst: false + LSCA: 256 + LSCB: 256 + LSCMXSA: 8 + LSCMXSB: 8 + LSPA: 32 + LSPB: 32 + LSPMXSA: 32 + LSPMXSB: 128 + LVCA: 8 + LVCB: 8 + LVCMXSA: 8 + LVCMXSB: 2 + LVPA: 1 + LVPB: 1 + LVPMXSA: 32 + LVPMXSB: 32 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsBlockSizePerPadMXSA: 0 + LdsBlockSizePerPadMXSB: 0 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 54528 + LdsInitCVgprs: false + LdsNumBytes: 54528 + LdsNumElementsAlignedA: 4096 + LdsNumElementsAlignedB: 16384 + LdsNumElementsAlignedMXSA: 256 + LdsNumElementsAlignedMXSB: 1024 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 32768 + LdsOffsetB: 5376 + LdsOffsetB_Blk: 38144 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMXSA: 4096 + LdsOffsetMXSA_Blk: 36864 + LdsOffsetMXSB: 4352 + LdsOffsetMXSB_Blk: 37120 + LdsOffsetMetadata: 5376 + LdsOffsetMetadata_Blk: 38144 + LdsPadA: 0 + LdsPadB: 0 + LdsPadMXSA: 0 + LdsPadMXSB: 0 + LdsPadMetadata: 0 + LocalReadVectorWidth: -1 + LocalReadVectorWidthA: 32 + LocalReadVectorWidthB: 32 + LocalReadVectorWidthMXSA: 1 + LocalReadVectorWidthMXSB: 1 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LocalWriteUseSgprMXSA: true + LocalWriteUseSgprMXSB: true + LoopIters: 2 + LoopUnroll: 256 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 
32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMXSA: 1 + MIInputPerThreadMXSB: 1 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [1, 4] + MIWaveTile: [2, 2] + MIWaveTileA: 2 + MIWaveTileB: 2 + MIWaveTileMXSA: 2 + MIWaveTileMXSB: 2 + MIWaveTileMetadata: 0 + MacroTile0: 32 + MacroTile1: 128 + MacroTileA: 32 + MacroTileB: 128 + MacroTileMXSA: 32 + MacroTileMXSB: 128 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + MinGRIncPerMfma: 3 + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonDTLTailLoopMXSA: true + NonDTLTailLoopMXSB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 4 + NonTemporalE: 0 + NonTemporalMXSA: 0 + NonTemporalMXSB: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 16 + NumLdsBlk: 2 + NumLoadsA: 1 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsCoalescedMXSA: 1 + NumLoadsCoalescedMXSB: 1 + NumLoadsMXSA: 1 + NumLoadsMXSB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 4 + NumLoadsPerpendicularMXSA: 1 + NumLoadsPerpendicularMXSB: 1 + NumThreads: 256 + NumTotalPackedLoadsA: -1 + NumTotalPackedLoadsB: -1 + NumTotalPackedLoadsMXSA: -1 + NumTotalPackedLoadsMXSB: -1 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGROverBarrier: 0 + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 43 + SolutionNameMin: 
Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT32x128x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT2_2_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM16_WGMXCC2_WGMXCCGn1 + SourceSwap: false + SpaceFillingAlgo: [] + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 4 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 4 + SubGroup1: 64 + SubGroupA: 4 + SubGroupB: 64 + SubGroupMXSA: 4 + SubGroupMXSB: 64 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: 0 + TailloopInNll: false + ThreadTile: [1, 1] + ThreadTile0: 8 + ThreadTile1: 2 + ThreadTileA: 8 + ThreadTileB: 2 + ThreadTileMXSA: 8 + ThreadTileMXSB: 2 + TransposeLDS: 1 + TransposeLDSMetadata: -1 + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMXSA: true + UnrollMajorLDSMXSB: true + Use64bShadowLimit: false + Use64bShadowLimitMX: false + UseCustomMainLoopSchedule: 0 + UseDirect32XEmulation: false + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: false + UseGeneralizedNLCOneA: false + UseGeneralizedNLCOneB: false + UseGeneralizedNLCOneMXSA: false + UseGeneralizedNLCOneMXSB: false + UseGeneralizedNLCOneMetadata: false + UseInstOffsetForGRO: 0 + UseMFMAF32XEmulation: false + UsePLRPack: 0 + UseSgprForGRO: -1 + UseSubtileImpl: true + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 1 + 
VectorWidthMXSA: 1 + VectorWidthMXSB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMXSA: 0 + WaveSeparateGlobalReadMXSB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 16 + WorkGroupMappingXCC: 2 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 256 + _DepthUA: 256 + _DepthUB: 256 + _DepthUMXSA: 8 + _DepthUMXSB: 8 + _DepthUMetadata: 256 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + enableLDSTrMXSA: false + enableLDSTrMXSB: false + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + tailLoopOptMXSA: false + tailLoopOptMXSB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AdaptiveGemm: 0 + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1DivByMT1LowbitGT1: 0 + AssertFree1ElementMultiple: 1 + AssertKRingShiftTailWrapOnly: 0 + AssertSummationElementMultiple: 256 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BAddrInterleave: false + BaseName: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT96VdJ139l5ExTu5vrbSyr9YIDnqGsFY6rZMfGBgYGwyok= + BufferLoad: true + BufferStore: 1 + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 256 + DirectToLds: 1 + DirectToLdsA: true + DirectToLdsB: true + DirectToLdsMXSA: true + DirectToLdsMXSB: true + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprMXSA: false + DirectToVgprMXSB: false + DirectToVgprSparseMetadata: false + DtlPlusLdsBuf: 0 
+ EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: false + ExpertSchedulingMode: 0 + ExtraLatencyForLR: 0 + ExtraMiLatencyLeft: -1 + ForceDisableShadowInit: false + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 32 + GlobalReadVectorWidthB: 32 + GlobalReadVectorWidthMXSA: 3 + GlobalReadVectorWidthMXSB: 4 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + KRingShift: false + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT96x128x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT6_2_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_16_1 + LDSTrInst: false + LSCA: 256 + LSCB: 256 + LSCMXSA: 8 + LSCMXSB: 8 + LSPA: 32 + LSPB: 32 + LSPMXSA: 96 + LSPMXSB: 128 + LVCA: 8 + LVCB: 8 + LVCMXSA: 3 + LVCMXSB: 2 + LVPA: 1 + LVPB: 1 + LVPMXSA: 32 + LVPMXSB: 32 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsBlockSizePerPadMXSA: 0 + LdsBlockSizePerPadMXSB: 0 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 63232 + LdsInitCVgprs: false + LdsNumBytes: 63232 + LdsNumElementsAlignedA: 12288 + 
LdsNumElementsAlignedB: 16384 + LdsNumElementsAlignedMXSA: 768 + LdsNumElementsAlignedMXSB: 1024 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 32768 + LdsOffsetB: 14080 + LdsOffsetB_Blk: 46848 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMXSA: 12288 + LdsOffsetMXSA_Blk: 45056 + LdsOffsetMXSB: 13056 + LdsOffsetMXSB_Blk: 45824 + LdsOffsetMetadata: 14080 + LdsOffsetMetadata_Blk: 46848 + LdsPadA: 0 + LdsPadB: 0 + LdsPadMXSA: 0 + LdsPadMXSB: 0 + LdsPadMetadata: 0 + LocalReadVectorWidth: -1 + LocalReadVectorWidthA: 32 + LocalReadVectorWidthB: 32 + LocalReadVectorWidthMXSA: 1 + LocalReadVectorWidthMXSB: 1 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LocalWriteUseSgprMXSA: true + LocalWriteUseSgprMXSB: true + LoopIters: 2 + LoopUnroll: 256 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMXSA: 1 + MIInputPerThreadMXSB: 1 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [1, 4] + MIWaveTile: [6, 2] + MIWaveTileA: 6 + MIWaveTileB: 2 + MIWaveTileMXSA: 6 + MIWaveTileMXSB: 2 + MIWaveTileMetadata: 0 + MacroTile0: 96 + MacroTile1: 128 + MacroTileA: 96 + MacroTileB: 128 + MacroTileMXSA: 96 + MacroTileMXSB: 128 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + MinGRIncPerMfma: 3 + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonDTLTailLoopMXSA: true + NonDTLTailLoopMXSB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 4 + NonTemporalE: 0 + NonTemporalMXSA: 0 + 
NonTemporalMXSB: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 48 + NumGlobalWriteVectorsPerThread: 48 + NumLdsBlk: 2 + NumLoadsA: 3 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsCoalescedMXSA: 1 + NumLoadsCoalescedMXSB: 1 + NumLoadsMXSA: 1 + NumLoadsMXSB: 1 + NumLoadsPerpendicularA: 3 + NumLoadsPerpendicularB: 4 + NumLoadsPerpendicularMXSA: 1 + NumLoadsPerpendicularMXSB: 1 + NumThreads: 256 + NumTotalPackedLoadsA: -1 + NumTotalPackedLoadsB: -1 + NumTotalPackedLoadsMXSA: -1 + NumTotalPackedLoadsMXSB: -1 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGROverBarrier: 0 + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 44 + SolutionNameMin: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT96x128x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT6_2_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM16_WGMXCC2_WGMXCCGn1 + SourceSwap: false + SpaceFillingAlgo: [] + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 4 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 4 + SubGroup1: 64 + SubGroupA: 4 + SubGroupB: 64 + SubGroupMXSA: 4 + SubGroupMXSB: 64 + 
SuppressNoLoadLoop: false + SwapGlobalReadOrder: 0 + TailloopInNll: false + ThreadTile: [1, 1] + ThreadTile0: 24 + ThreadTile1: 2 + ThreadTileA: 24 + ThreadTileB: 2 + ThreadTileMXSA: 24 + ThreadTileMXSB: 2 + TransposeLDS: 1 + TransposeLDSMetadata: -1 + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMXSA: true + UnrollMajorLDSMXSB: true + Use64bShadowLimit: false + Use64bShadowLimitMX: false + UseCustomMainLoopSchedule: 0 + UseDirect32XEmulation: false + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: false + UseGeneralizedNLCOneA: false + UseGeneralizedNLCOneB: false + UseGeneralizedNLCOneMXSA: false + UseGeneralizedNLCOneMXSB: false + UseGeneralizedNLCOneMetadata: false + UseInstOffsetForGRO: 0 + UseMFMAF32XEmulation: false + UsePLRPack: 0 + UseSgprForGRO: -1 + UseSubtileImpl: true + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 1 + VectorWidthMXSA: 1 + VectorWidthMXSB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMXSA: 0 + WaveSeparateGlobalReadMXSB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 16 + WorkGroupMappingXCC: 2 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 256 + _DepthUA: 256 + _DepthUB: 256 + _DepthUMXSA: 8 + _DepthUMXSB: 8 + _DepthUMetadata: 256 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + enableLDSTrMXSA: false + enableLDSTrMXSB: false + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + tailLoopOptMXSA: false + tailLoopOptMXSB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: false 
+ ActivationFused: true + AdaptiveGemm: 0 + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1DivByMT1LowbitGT1: 0 + AssertFree1ElementMultiple: 1 + AssertKRingShiftTailWrapOnly: 0 + AssertSummationElementMultiple: 256 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BAddrInterleave: false + BaseName: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT32pDTSO9Ei8iprYUZVwZ8MVK8XTu8S8ugrlkQJj7C1FJQ= + BufferLoad: true + BufferStore: 1 + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 256 + DirectToLds: 1 + DirectToLdsA: true + DirectToLdsB: true + DirectToLdsMXSA: true + DirectToLdsMXSB: true + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprMXSA: false + DirectToVgprMXSB: false + DirectToVgprSparseMetadata: false + DtlPlusLdsBuf: 0 + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: false + ExpertSchedulingMode: 0 + ExtraLatencyForLR: 0 + ExtraMiLatencyLeft: -1 + ForceDisableShadowInit: false + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 32 + GlobalReadVectorWidthB: 32 + GlobalReadVectorWidthMXSA: 1 + GlobalReadVectorWidthMXSB: 4 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + KRingShift: false + Kernel: true + KernelLanguage: Assembly + KernelNameMin: 
Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT32x256x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT2_4_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_16_1 + LDSTrInst: false + LSCA: 256 + LSCB: 256 + LSCMXSA: 8 + LSCMXSB: 8 + LSPA: 32 + LSPB: 32 + LSPMXSA: 32 + LSPMXSB: 128 + LVCA: 8 + LVCB: 8 + LVCMXSA: 8 + LVCMXSB: 2 + LVPA: 1 + LVPB: 1 + LVPMXSA: 32 + LVPMXSB: 32 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsBlockSizePerPadMXSA: 0 + LdsBlockSizePerPadMXSB: 0 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 104704 + LdsInitCVgprs: false + LdsNumBytes: 104704 + LdsNumElementsAlignedA: 4096 + LdsNumElementsAlignedB: 32768 + LdsNumElementsAlignedMXSA: 256 + LdsNumElementsAlignedMXSB: 2048 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 6400 + LdsOffsetB_Blk: 71936 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMXSA: 4096 + LdsOffsetMXSA_Blk: 69632 + LdsOffsetMXSB: 4352 + LdsOffsetMXSB_Blk: 69888 + LdsOffsetMetadata: 6400 + LdsOffsetMetadata_Blk: 71936 + LdsPadA: 0 + LdsPadB: 0 + LdsPadMXSA: 0 + LdsPadMXSB: 0 + LdsPadMetadata: 0 + LocalReadVectorWidth: -1 + LocalReadVectorWidthA: 32 + LocalReadVectorWidthB: 32 + LocalReadVectorWidthMXSA: 1 + LocalReadVectorWidthMXSB: 1 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LocalWriteUseSgprMXSA: true + LocalWriteUseSgprMXSB: true + LoopIters: 2 + LoopUnroll: 256 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + 
MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMXSA: 1 + MIInputPerThreadMXSB: 1 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [1, 4] + MIWaveTile: [2, 4] + MIWaveTileA: 2 + MIWaveTileB: 4 + MIWaveTileMXSA: 2 + MIWaveTileMXSB: 4 + MIWaveTileMetadata: 0 + MacroTile0: 32 + MacroTile1: 256 + MacroTileA: 32 + MacroTileB: 256 + MacroTileMXSA: 32 + MacroTileMXSB: 256 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + MinGRIncPerMfma: 3 + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonDTLTailLoopMXSA: true + NonDTLTailLoopMXSB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 4 + NonTemporalE: 0 + NonTemporalMXSA: 0 + NonTemporalMXSB: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 32 + NumLdsBlk: 2 + NumLoadsA: 1 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsCoalescedMXSA: 1 + NumLoadsCoalescedMXSB: 1 + NumLoadsMXSA: 1 + NumLoadsMXSB: 2 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 8 + NumLoadsPerpendicularMXSA: 1 + NumLoadsPerpendicularMXSB: 2 + NumThreads: 256 + NumTotalPackedLoadsA: -1 + NumTotalPackedLoadsB: -1 + NumTotalPackedLoadsMXSA: -1 + NumTotalPackedLoadsMXSB: -1 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGROverBarrier: 0 + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 45 + SolutionNameMin: 
Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT32x256x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT2_4_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM16_WGMXCC2_WGMXCCGn1 + SourceSwap: false + SpaceFillingAlgo: [] + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 4 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 4 + SubGroup1: 64 + SubGroupA: 4 + SubGroupB: 64 + SubGroupMXSA: 4 + SubGroupMXSB: 64 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: 0 + TailloopInNll: false + ThreadTile: [1, 1] + ThreadTile0: 8 + ThreadTile1: 4 + ThreadTileA: 8 + ThreadTileB: 4 + ThreadTileMXSA: 8 + ThreadTileMXSB: 4 + TransposeLDS: 1 + TransposeLDSMetadata: -1 + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMXSA: true + UnrollMajorLDSMXSB: true + Use64bShadowLimit: false + Use64bShadowLimitMX: false + UseCustomMainLoopSchedule: 0 + UseDirect32XEmulation: false + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: false + UseGeneralizedNLCOneA: false + UseGeneralizedNLCOneB: false + UseGeneralizedNLCOneMXSA: false + UseGeneralizedNLCOneMXSB: false + UseGeneralizedNLCOneMetadata: false + UseInstOffsetForGRO: 0 + UseMFMAF32XEmulation: false + UsePLRPack: 0 + UseSgprForGRO: -1 + UseSubtileImpl: true + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 1 + 
VectorWidthMXSA: 1 + VectorWidthMXSB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMXSA: 0 + WaveSeparateGlobalReadMXSB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 16 + WorkGroupMappingXCC: 2 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 256 + _DepthUA: 256 + _DepthUB: 256 + _DepthUMXSA: 8 + _DepthUMXSB: 8 + _DepthUMetadata: 256 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + enableLDSTrMXSA: false + enableLDSTrMXSB: false + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + tailLoopOptMXSA: false + tailLoopOptMXSB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AdaptiveGemm: 0 + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1DivByMT1LowbitGT1: 0 + AssertFree1ElementMultiple: 1 + AssertKRingShiftTailWrapOnly: 0 + AssertSummationElementMultiple: 256 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BAddrInterleave: false + BaseName: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT96Cj2Z80w1PkWDv8Q4NtOmErGoRYnlYoT1xjNw40Qn6y8= + BufferLoad: true + BufferStore: 1 + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 256 + DirectToLds: 1 + DirectToLdsA: true + DirectToLdsB: true + DirectToLdsMXSA: true + DirectToLdsMXSB: true + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprMXSA: false + DirectToVgprMXSB: false + DirectToVgprSparseMetadata: false + DtlPlusLdsBuf: 0 
+ EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: false + ExpertSchedulingMode: 0 + ExtraLatencyForLR: 0 + ExtraMiLatencyLeft: -1 + ForceDisableShadowInit: false + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 32 + GlobalReadVectorWidthB: 32 + GlobalReadVectorWidthMXSA: 3 + GlobalReadVectorWidthMXSB: 4 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + KRingShift: false + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT96x256x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT6_4_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_16_1 + LDSTrInst: false + LSCA: 256 + LSCB: 256 + LSCMXSA: 8 + LSCMXSB: 8 + LSPA: 32 + LSPB: 32 + LSPMXSA: 96 + LSPMXSB: 128 + LVCA: 8 + LVCB: 8 + LVCMXSA: 3 + LVCMXSB: 2 + LVPA: 1 + LVPB: 1 + LVPMXSA: 32 + LVPMXSB: 32 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsBlockSizePerPadMXSA: 0 + LdsBlockSizePerPadMXSB: 0 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 113408 + LdsInitCVgprs: false + LdsNumBytes: 113408 + LdsNumElementsAlignedA: 12288 + 
LdsNumElementsAlignedB: 32768 + LdsNumElementsAlignedMXSA: 768 + LdsNumElementsAlignedMXSB: 2048 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 15104 + LdsOffsetB_Blk: 80640 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMXSA: 12288 + LdsOffsetMXSA_Blk: 77824 + LdsOffsetMXSB: 13056 + LdsOffsetMXSB_Blk: 78592 + LdsOffsetMetadata: 15104 + LdsOffsetMetadata_Blk: 80640 + LdsPadA: 0 + LdsPadB: 0 + LdsPadMXSA: 0 + LdsPadMXSB: 0 + LdsPadMetadata: 0 + LocalReadVectorWidth: -1 + LocalReadVectorWidthA: 32 + LocalReadVectorWidthB: 32 + LocalReadVectorWidthMXSA: 1 + LocalReadVectorWidthMXSB: 1 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LocalWriteUseSgprMXSA: true + LocalWriteUseSgprMXSB: true + LoopIters: 2 + LoopUnroll: 256 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMXSA: 1 + MIInputPerThreadMXSB: 1 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [1, 4] + MIWaveTile: [6, 4] + MIWaveTileA: 6 + MIWaveTileB: 4 + MIWaveTileMXSA: 6 + MIWaveTileMXSB: 4 + MIWaveTileMetadata: 0 + MacroTile0: 96 + MacroTile1: 256 + MacroTileA: 96 + MacroTileB: 256 + MacroTileMXSA: 96 + MacroTileMXSB: 256 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + MinGRIncPerMfma: 3 + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonDTLTailLoopMXSA: true + NonDTLTailLoopMXSB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 4 + NonTemporalE: 0 + NonTemporalMXSA: 0 + 
NonTemporalMXSB: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 96 + NumGlobalWriteVectorsPerThread: 96 + NumLdsBlk: 2 + NumLoadsA: 3 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsCoalescedMXSA: 1 + NumLoadsCoalescedMXSB: 1 + NumLoadsMXSA: 1 + NumLoadsMXSB: 2 + NumLoadsPerpendicularA: 3 + NumLoadsPerpendicularB: 8 + NumLoadsPerpendicularMXSA: 1 + NumLoadsPerpendicularMXSB: 2 + NumThreads: 256 + NumTotalPackedLoadsA: -1 + NumTotalPackedLoadsB: -1 + NumTotalPackedLoadsMXSA: -1 + NumTotalPackedLoadsMXSB: -1 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGROverBarrier: 0 + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 46 + SolutionNameMin: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT96x256x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT6_4_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM16_WGMXCC2_WGMXCCGn1 + SourceSwap: false + SpaceFillingAlgo: [] + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 4 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 4 + SubGroup1: 64 + SubGroupA: 4 + SubGroupB: 64 + SubGroupMXSA: 4 + SubGroupMXSB: 64 + 
SuppressNoLoadLoop: false + SwapGlobalReadOrder: 0 + TailloopInNll: false + ThreadTile: [1, 1] + ThreadTile0: 24 + ThreadTile1: 4 + ThreadTileA: 24 + ThreadTileB: 4 + ThreadTileMXSA: 24 + ThreadTileMXSB: 4 + TransposeLDS: 1 + TransposeLDSMetadata: -1 + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMXSA: true + UnrollMajorLDSMXSB: true + Use64bShadowLimit: false + Use64bShadowLimitMX: false + UseCustomMainLoopSchedule: 0 + UseDirect32XEmulation: false + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: false + UseGeneralizedNLCOneA: false + UseGeneralizedNLCOneB: false + UseGeneralizedNLCOneMXSA: false + UseGeneralizedNLCOneMXSB: false + UseGeneralizedNLCOneMetadata: false + UseInstOffsetForGRO: 0 + UseMFMAF32XEmulation: false + UsePLRPack: 0 + UseSgprForGRO: -1 + UseSubtileImpl: true + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 1 + VectorWidthMXSA: 1 + VectorWidthMXSB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMXSA: 0 + WaveSeparateGlobalReadMXSB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 16 + WorkGroupMappingXCC: 2 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 256 + _DepthUA: 256 + _DepthUB: 256 + _DepthUMXSA: 8 + _DepthUMXSB: 8 + _DepthUMetadata: 256 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + enableLDSTrMXSA: false + enableLDSTrMXSB: false + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + tailLoopOptMXSA: false + tailLoopOptMXSB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: false 
+ ActivationFused: true + AdaptiveGemm: 0 + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1DivByMT1LowbitGT1: 0 + AssertFree1ElementMultiple: 1 + AssertKRingShiftTailWrapOnly: 0 + AssertSummationElementMultiple: 256 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BAddrInterleave: false + BaseName: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT16rkdIx0xJ9AdmxSLTSuPL_HwvXURN3d9JBsT0i2N2r4A= + BufferLoad: true + BufferStore: 1 + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 256 + DirectToLds: 1 + DirectToLdsA: true + DirectToLdsB: true + DirectToLdsMXSA: true + DirectToLdsMXSB: true + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprMXSA: false + DirectToVgprMXSB: false + DirectToVgprSparseMetadata: false + DtlPlusLdsBuf: 0 + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: false + ExpertSchedulingMode: 0 + ExtraLatencyForLR: 0 + ExtraMiLatencyLeft: -1 + ForceDisableShadowInit: false + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 32 + GlobalReadVectorWidthB: 32 + GlobalReadVectorWidthMXSA: 5 + GlobalReadVectorWidthMXSB: 4 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + KRingShift: false + Kernel: true + KernelLanguage: Assembly + KernelNameMin: 
Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT160x256x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT10_4_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_16_1 + LDSTrInst: false + LSCA: 256 + LSCB: 256 + LSCMXSA: 8 + LSCMXSB: 8 + LSPA: 32 + LSPB: 32 + LSPMXSA: 160 + LSPMXSB: 128 + LVCA: 8 + LVCB: 8 + LVCMXSA: 2 + LVCMXSB: 2 + LVPA: 1 + LVPB: 1 + LVPMXSA: 32 + LVPMXSB: 32 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsBlockSizePerPadMXSA: 0 + LdsBlockSizePerPadMXSB: 0 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 122112 + LdsInitCVgprs: false + LdsNumBytes: 122112 + LdsNumElementsAlignedA: 20480 + LdsNumElementsAlignedB: 32768 + LdsNumElementsAlignedMXSA: 1280 + LdsNumElementsAlignedMXSB: 2048 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 23808 + LdsOffsetB_Blk: 89344 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMXSA: 20480 + LdsOffsetMXSA_Blk: 86016 + LdsOffsetMXSB: 21760 + LdsOffsetMXSB_Blk: 87296 + LdsOffsetMetadata: 23808 + LdsOffsetMetadata_Blk: 89344 + LdsPadA: 0 + LdsPadB: 0 + LdsPadMXSA: 0 + LdsPadMXSB: 0 + LdsPadMetadata: 0 + LocalReadVectorWidth: -1 + LocalReadVectorWidthA: 32 + LocalReadVectorWidthB: 32 + LocalReadVectorWidthMXSA: 1 + LocalReadVectorWidthMXSB: 1 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LocalWriteUseSgprMXSA: true + LocalWriteUseSgprMXSB: true + LoopIters: 2 + LoopUnroll: 256 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + 
MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMXSA: 1 + MIInputPerThreadMXSB: 1 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [1, 4] + MIWaveTile: [10, 4] + MIWaveTileA: 10 + MIWaveTileB: 4 + MIWaveTileMXSA: 10 + MIWaveTileMXSB: 4 + MIWaveTileMetadata: 0 + MacroTile0: 160 + MacroTile1: 256 + MacroTileA: 160 + MacroTileB: 256 + MacroTileMXSA: 160 + MacroTileMXSB: 256 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + MinGRIncPerMfma: 3 + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonDTLTailLoopMXSA: true + NonDTLTailLoopMXSB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 4 + NonTemporalE: 0 + NonTemporalMXSA: 0 + NonTemporalMXSB: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 160 + NumGlobalWriteVectorsPerThread: 160 + NumLdsBlk: 2 + NumLoadsA: 5 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsCoalescedMXSA: 1 + NumLoadsCoalescedMXSB: 1 + NumLoadsMXSA: 1 + NumLoadsMXSB: 2 + NumLoadsPerpendicularA: 5 + NumLoadsPerpendicularB: 8 + NumLoadsPerpendicularMXSA: 1 + NumLoadsPerpendicularMXSB: 2 + NumThreads: 256 + NumTotalPackedLoadsA: -1 + NumTotalPackedLoadsB: -1 + NumTotalPackedLoadsMXSA: -1 + NumTotalPackedLoadsMXSB: -1 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGROverBarrier: 0 + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 47 + 
SolutionNameMin: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT160x256x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT10_4_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM16_WGMXCC2_WGMXCCGn1 + SourceSwap: false + SpaceFillingAlgo: [] + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 4 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 4 + SubGroup1: 64 + SubGroupA: 4 + SubGroupB: 64 + SubGroupMXSA: 4 + SubGroupMXSB: 64 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: 0 + TailloopInNll: false + ThreadTile: [1, 1] + ThreadTile0: 40 + ThreadTile1: 4 + ThreadTileA: 40 + ThreadTileB: 4 + ThreadTileMXSA: 40 + ThreadTileMXSB: 4 + TransposeLDS: 1 + TransposeLDSMetadata: -1 + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMXSA: true + UnrollMajorLDSMXSB: true + Use64bShadowLimit: false + Use64bShadowLimitMX: false + UseCustomMainLoopSchedule: 0 + UseDirect32XEmulation: false + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: false + UseGeneralizedNLCOneA: false + UseGeneralizedNLCOneB: false + UseGeneralizedNLCOneMXSA: false + UseGeneralizedNLCOneMXSB: false + UseGeneralizedNLCOneMetadata: false + UseInstOffsetForGRO: 0 + UseMFMAF32XEmulation: false + UsePLRPack: 0 + UseSgprForGRO: -1 + UseSubtileImpl: true + Valid: true + VectorStore: -1 + VectorWidthA: 1 
+ VectorWidthB: 1 + VectorWidthMXSA: 1 + VectorWidthMXSB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMXSA: 0 + WaveSeparateGlobalReadMXSB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 16 + WorkGroupMappingXCC: 2 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 256 + _DepthUA: 256 + _DepthUB: 256 + _DepthUMXSA: 8 + _DepthUMXSB: 8 + _DepthUMetadata: 256 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + enableLDSTrMXSA: false + enableLDSTrMXSB: false + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + tailLoopOptMXSA: false + tailLoopOptMXSB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AdaptiveGemm: 0 + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1DivByMT1LowbitGT1: 0 + AssertFree1ElementMultiple: 1 + AssertKRingShiftTailWrapOnly: 0 + AssertSummationElementMultiple: 256 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BAddrInterleave: false + BaseName: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT28HX9AX04H5gx2Hnt9CFkwE4Io5BNr8Him7sxPkQ6FxSI= + BufferLoad: true + BufferStore: 1 + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 256 + DirectToLds: 1 + DirectToLdsA: true + DirectToLdsB: true + DirectToLdsMXSA: true + DirectToLdsMXSB: true + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprMXSA: false + DirectToVgprMXSB: false + DirectToVgprSparseMetadata: 
false + DtlPlusLdsBuf: 0 + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: false + ExpertSchedulingMode: 0 + ExtraLatencyForLR: 0 + ExtraMiLatencyLeft: -1 + ForceDisableShadowInit: false + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 32 + GlobalReadVectorWidthB: 32 + GlobalReadVectorWidthMXSA: 9 + GlobalReadVectorWidthMXSB: 4 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + KRingShift: false + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT288x256x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT18_4_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_16_1 + LDSTrInst: false + LSCA: 256 + LSCB: 256 + LSCMXSA: 8 + LSCMXSB: 8 + LSPA: 32 + LSPB: 32 + LSPMXSA: 288 + LSPMXSB: 128 + LVCA: 8 + LVCB: 8 + LVCMXSA: 1 + LVCMXSB: 2 + LVPA: 1 + LVPB: 1 + LVPMXSA: 32 + LVPMXSB: 32 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsBlockSizePerPadMXSA: 0 + LdsBlockSizePerPadMXSB: 0 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 147968 + LdsInitCVgprs: false + LdsNumBytes: 147968 + 
LdsNumElementsAlignedA: 36864 + LdsNumElementsAlignedB: 32768 + LdsNumElementsAlignedMXSA: 2304 + LdsNumElementsAlignedMXSB: 2048 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 73984 + LdsOffsetB: 41216 + LdsOffsetB_Blk: 115200 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMXSA: 36864 + LdsOffsetMXSA_Blk: 110848 + LdsOffsetMXSB: 39168 + LdsOffsetMXSB_Blk: 113152 + LdsOffsetMetadata: 41216 + LdsOffsetMetadata_Blk: 115200 + LdsPadA: 0 + LdsPadB: 0 + LdsPadMXSA: 0 + LdsPadMXSB: 0 + LdsPadMetadata: 0 + LocalReadVectorWidth: -1 + LocalReadVectorWidthA: 32 + LocalReadVectorWidthB: 32 + LocalReadVectorWidthMXSA: 1 + LocalReadVectorWidthMXSB: 1 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LocalWriteUseSgprMXSA: true + LocalWriteUseSgprMXSB: true + LoopIters: 2 + LoopUnroll: 256 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMXSA: 1 + MIInputPerThreadMXSB: 1 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [1, 4] + MIWaveTile: [18, 4] + MIWaveTileA: 18 + MIWaveTileB: 4 + MIWaveTileMXSA: 18 + MIWaveTileMXSB: 4 + MIWaveTileMetadata: 0 + MacroTile0: 288 + MacroTile1: 256 + MacroTileA: 288 + MacroTileB: 256 + MacroTileMXSA: 288 + MacroTileMXSB: 256 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + MinGRIncPerMfma: 3 + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonDTLTailLoopMXSA: true + NonDTLTailLoopMXSB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 4 + 
NonTemporalE: 0 + NonTemporalMXSA: 0 + NonTemporalMXSB: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 288 + NumGlobalWriteVectorsPerThread: 288 + NumLdsBlk: 2 + NumLoadsA: 9 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsCoalescedMXSA: 1 + NumLoadsCoalescedMXSB: 1 + NumLoadsMXSA: 1 + NumLoadsMXSB: 2 + NumLoadsPerpendicularA: 9 + NumLoadsPerpendicularB: 8 + NumLoadsPerpendicularMXSA: 1 + NumLoadsPerpendicularMXSB: 2 + NumThreads: 256 + NumTotalPackedLoadsA: -1 + NumTotalPackedLoadsB: -1 + NumTotalPackedLoadsMXSA: -1 + NumTotalPackedLoadsMXSB: -1 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGROverBarrier: 0 + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 48 + SolutionNameMin: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT288x256x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT18_4_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM16_WGMXCC2_WGMXCCGn1 + SourceSwap: false + SpaceFillingAlgo: [] + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: true + StoreSyncOpt: 0 + StoreVectorWidth: 4 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 4 + SubGroup1: 64 + SubGroupA: 4 + SubGroupB: 
64 + SubGroupMXSA: 4 + SubGroupMXSB: 64 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: 0 + TailloopInNll: false + ThreadTile: [1, 1] + ThreadTile0: 72 + ThreadTile1: 4 + ThreadTileA: 72 + ThreadTileB: 4 + ThreadTileMXSA: 72 + ThreadTileMXSB: 4 + TransposeLDS: 1 + TransposeLDSMetadata: -1 + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMXSA: true + UnrollMajorLDSMXSB: true + Use64bShadowLimit: false + Use64bShadowLimitMX: false + UseCustomMainLoopSchedule: 0 + UseDirect32XEmulation: false + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: false + UseGeneralizedNLCOneA: false + UseGeneralizedNLCOneB: false + UseGeneralizedNLCOneMXSA: false + UseGeneralizedNLCOneMXSB: false + UseGeneralizedNLCOneMetadata: false + UseInstOffsetForGRO: 0 + UseMFMAF32XEmulation: false + UsePLRPack: 0 + UseSgprForGRO: -1 + UseSubtileImpl: true + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 1 + VectorWidthMXSA: 1 + VectorWidthMXSB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMXSA: 0 + WaveSeparateGlobalReadMXSB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 16 + WorkGroupMappingXCC: 2 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 256 + _DepthUA: 256 + _DepthUB: 256 + _DepthUMXSA: 8 + _DepthUMXSB: 8 + _DepthUMetadata: 256 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + enableLDSTrMXSA: false + enableLDSTrMXSB: false + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + tailLoopOptMXSA: false + tailLoopOptMXSB: false + - 1LDSBuffer: 0 + 
ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AdaptiveGemm: 0 + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1DivByMT1LowbitGT1: 0 + AssertFree1ElementMultiple: 1 + AssertKRingShiftTailWrapOnly: 0 + AssertSummationElementMultiple: 256 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BAddrInterleave: false + BaseName: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT323cjBEjqTj-Mu9ZF1RsqJ73WqkJOeA3kpk9e2rgOyJEQ= + BufferLoad: true + BufferStore: 1 + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 256 + DirectToLds: 1 + DirectToLdsA: true + DirectToLdsB: true + DirectToLdsMXSA: true + DirectToLdsMXSB: true + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprMXSA: false + DirectToVgprMXSB: false + DirectToVgprSparseMetadata: false + DtlPlusLdsBuf: 0 + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: false + ExpertSchedulingMode: 0 + ExtraLatencyForLR: 0 + ExtraMiLatencyLeft: -1 + ForceDisableShadowInit: false + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 32 + GlobalReadVectorWidthB: 32 + GlobalReadVectorWidthMXSA: 1 + GlobalReadVectorWidthMXSB: 12 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + KRingShift: false + Kernel: true + KernelLanguage: Assembly + KernelNameMin: 
Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT32x384x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT2_6_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_16_1 + LDSTrInst: false + LSCA: 256 + LSCB: 256 + LSCMXSA: 8 + LSCMXSB: 8 + LSPA: 32 + LSPB: 32 + LSPMXSA: 32 + LSPMXSB: 384 + LVCA: 8 + LVCB: 8 + LVCMXSA: 8 + LVCMXSB: 1 + LVPA: 1 + LVPB: 1 + LVPMXSA: 32 + LVPMXSB: 32 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsBlockSizePerPadMXSA: 0 + LdsBlockSizePerPadMXSB: 0 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 122112 + LdsInitCVgprs: false + LdsNumBytes: 122112 + LdsNumElementsAlignedA: 4096 + LdsNumElementsAlignedB: 49152 + LdsNumElementsAlignedMXSA: 256 + LdsNumElementsAlignedMXSB: 3072 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 7424 + LdsOffsetB_Blk: 72960 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMXSA: 4096 + LdsOffsetMXSA_Blk: 69632 + LdsOffsetMXSB: 4352 + LdsOffsetMXSB_Blk: 69888 + LdsOffsetMetadata: 7424 + LdsOffsetMetadata_Blk: 72960 + LdsPadA: 0 + LdsPadB: 0 + LdsPadMXSA: 0 + LdsPadMXSB: 0 + LdsPadMetadata: 0 + LocalReadVectorWidth: -1 + LocalReadVectorWidthA: 32 + LocalReadVectorWidthB: 32 + LocalReadVectorWidthMXSA: 1 + LocalReadVectorWidthMXSB: 1 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LocalWriteUseSgprMXSA: true + LocalWriteUseSgprMXSB: true + LoopIters: 2 + LoopUnroll: 256 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + 
MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMXSA: 1 + MIInputPerThreadMXSB: 1 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [1, 4] + MIWaveTile: [2, 6] + MIWaveTileA: 2 + MIWaveTileB: 6 + MIWaveTileMXSA: 2 + MIWaveTileMXSB: 6 + MIWaveTileMetadata: 0 + MacroTile0: 32 + MacroTile1: 384 + MacroTileA: 32 + MacroTileB: 384 + MacroTileMXSA: 32 + MacroTileMXSB: 384 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + MinGRIncPerMfma: 3 + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonDTLTailLoopMXSA: true + NonDTLTailLoopMXSB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 4 + NonTemporalE: 0 + NonTemporalMXSA: 0 + NonTemporalMXSB: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 48 + NumGlobalWriteVectorsPerThread: 48 + NumLdsBlk: 2 + NumLoadsA: 1 + NumLoadsB: 12 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsCoalescedMXSA: 1 + NumLoadsCoalescedMXSB: 1 + NumLoadsMXSA: 1 + NumLoadsMXSB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 12 + NumLoadsPerpendicularMXSA: 1 + NumLoadsPerpendicularMXSB: 1 + NumThreads: 256 + NumTotalPackedLoadsA: -1 + NumTotalPackedLoadsB: -1 + NumTotalPackedLoadsMXSA: -1 + NumTotalPackedLoadsMXSB: -1 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGROverBarrier: 0 + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 49 + SolutionNameMin: 
Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT32x384x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT2_6_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM16_WGMXCC2_WGMXCCGn1 + SourceSwap: false + SpaceFillingAlgo: [] + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 4 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 4 + SubGroup1: 64 + SubGroupA: 4 + SubGroupB: 64 + SubGroupMXSA: 4 + SubGroupMXSB: 64 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: 0 + TailloopInNll: false + ThreadTile: [1, 1] + ThreadTile0: 8 + ThreadTile1: 6 + ThreadTileA: 8 + ThreadTileB: 6 + ThreadTileMXSA: 8 + ThreadTileMXSB: 6 + TransposeLDS: 1 + TransposeLDSMetadata: -1 + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMXSA: true + UnrollMajorLDSMXSB: true + Use64bShadowLimit: false + Use64bShadowLimitMX: false + UseCustomMainLoopSchedule: 0 + UseDirect32XEmulation: false + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: false + UseGeneralizedNLCOneA: false + UseGeneralizedNLCOneB: false + UseGeneralizedNLCOneMXSA: false + UseGeneralizedNLCOneMXSB: false + UseGeneralizedNLCOneMetadata: false + UseInstOffsetForGRO: 0 + UseMFMAF32XEmulation: false + UsePLRPack: 0 + UseSgprForGRO: -1 + UseSubtileImpl: true + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 1 + 
VectorWidthMXSA: 1 + VectorWidthMXSB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMXSA: 0 + WaveSeparateGlobalReadMXSB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 16 + WorkGroupMappingXCC: 2 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 256 + _DepthUA: 256 + _DepthUB: 256 + _DepthUMXSA: 8 + _DepthUMXSB: 8 + _DepthUMetadata: 256 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + enableLDSTrMXSA: false + enableLDSTrMXSB: false + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + tailLoopOptMXSA: false + tailLoopOptMXSB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AdaptiveGemm: 0 + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1DivByMT1LowbitGT1: 0 + AssertFree1ElementMultiple: 1 + AssertKRingShiftTailWrapOnly: 0 + AssertSummationElementMultiple: 256 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BAddrInterleave: false + BaseName: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT32k1L8EqrYflL4kvotLP49gssYPXLs5aDdnaskyCeVRw4= + BufferLoad: true + BufferStore: 1 + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 256 + DirectToLds: 1 + DirectToLdsA: true + DirectToLdsB: true + DirectToLdsMXSA: true + DirectToLdsMXSB: true + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprMXSA: false + DirectToVgprMXSB: false + DirectToVgprSparseMetadata: false + DtlPlusLdsBuf: 0 
+ EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: false + ExpertSchedulingMode: 0 + ExtraLatencyForLR: 0 + ExtraMiLatencyLeft: -1 + ForceDisableShadowInit: false + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 32 + GlobalReadVectorWidthB: 32 + GlobalReadVectorWidthMXSA: 1 + GlobalReadVectorWidthMXSB: 16 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + KRingShift: false + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT32x512x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT2_8_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_16_1 + LDSTrInst: false + LSCA: 256 + LSCB: 256 + LSCMXSA: 8 + LSCMXSB: 8 + LSPA: 32 + LSPB: 32 + LSPMXSA: 32 + LSPMXSB: 512 + LVCA: 8 + LVCB: 8 + LVCMXSA: 8 + LVCMXSB: 1 + LVPA: 1 + LVPB: 1 + LVPMXSA: 32 + LVPMXSB: 32 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsBlockSizePerPadMXSA: 0 + LdsBlockSizePerPadMXSB: 0 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 147968 + LdsInitCVgprs: false + LdsNumBytes: 147968 + LdsNumElementsAlignedA: 4096 + 
LdsNumElementsAlignedB: 65536 + LdsNumElementsAlignedMXSA: 256 + LdsNumElementsAlignedMXSB: 4096 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 73984 + LdsOffsetB: 8448 + LdsOffsetB_Blk: 82432 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMXSA: 4096 + LdsOffsetMXSA_Blk: 78080 + LdsOffsetMXSB: 4352 + LdsOffsetMXSB_Blk: 78336 + LdsOffsetMetadata: 8448 + LdsOffsetMetadata_Blk: 82432 + LdsPadA: 0 + LdsPadB: 0 + LdsPadMXSA: 0 + LdsPadMXSB: 0 + LdsPadMetadata: 0 + LocalReadVectorWidth: -1 + LocalReadVectorWidthA: 32 + LocalReadVectorWidthB: 32 + LocalReadVectorWidthMXSA: 1 + LocalReadVectorWidthMXSB: 1 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LocalWriteUseSgprMXSA: true + LocalWriteUseSgprMXSB: true + LoopIters: 2 + LoopUnroll: 256 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMXSA: 1 + MIInputPerThreadMXSB: 1 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [1, 4] + MIWaveTile: [2, 8] + MIWaveTileA: 2 + MIWaveTileB: 8 + MIWaveTileMXSA: 2 + MIWaveTileMXSB: 8 + MIWaveTileMetadata: 0 + MacroTile0: 32 + MacroTile1: 512 + MacroTileA: 32 + MacroTileB: 512 + MacroTileMXSA: 32 + MacroTileMXSB: 512 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + MinGRIncPerMfma: 3 + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonDTLTailLoopMXSA: true + NonDTLTailLoopMXSB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 4 + NonTemporalE: 0 + NonTemporalMXSA: 0 + 
NonTemporalMXSB: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 64 + NumLdsBlk: 2 + NumLoadsA: 1 + NumLoadsB: 16 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsCoalescedMXSA: 1 + NumLoadsCoalescedMXSB: 1 + NumLoadsMXSA: 1 + NumLoadsMXSB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 16 + NumLoadsPerpendicularMXSA: 1 + NumLoadsPerpendicularMXSB: 1 + NumThreads: 256 + NumTotalPackedLoadsA: -1 + NumTotalPackedLoadsB: -1 + NumTotalPackedLoadsMXSA: -1 + NumTotalPackedLoadsMXSB: -1 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGROverBarrier: 0 + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 50 + SolutionNameMin: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT32x512x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT2_8_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM16_WGMXCC2_WGMXCCGn1 + SourceSwap: false + SpaceFillingAlgo: [] + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: true + StoreSyncOpt: 0 + StoreVectorWidth: 4 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 4 + SubGroup1: 64 + SubGroupA: 4 + SubGroupB: 64 + SubGroupMXSA: 4 + SubGroupMXSB: 64 
+ SuppressNoLoadLoop: false + SwapGlobalReadOrder: 0 + TailloopInNll: false + ThreadTile: [1, 1] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + ThreadTileMXSA: 8 + ThreadTileMXSB: 8 + TransposeLDS: 1 + TransposeLDSMetadata: -1 + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMXSA: true + UnrollMajorLDSMXSB: true + Use64bShadowLimit: false + Use64bShadowLimitMX: false + UseCustomMainLoopSchedule: 0 + UseDirect32XEmulation: false + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: false + UseGeneralizedNLCOneA: false + UseGeneralizedNLCOneB: false + UseGeneralizedNLCOneMXSA: false + UseGeneralizedNLCOneMXSB: false + UseGeneralizedNLCOneMetadata: false + UseInstOffsetForGRO: 0 + UseMFMAF32XEmulation: false + UsePLRPack: 0 + UseSgprForGRO: -1 + UseSubtileImpl: true + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 1 + VectorWidthMXSA: 1 + VectorWidthMXSB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMXSA: 0 + WaveSeparateGlobalReadMXSB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 16 + WorkGroupMappingXCC: 2 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 256 + _DepthUA: 256 + _DepthUB: 256 + _DepthUMXSA: 8 + _DepthUMXSB: 8 + _DepthUMetadata: 256 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + enableLDSTrMXSA: false + enableLDSTrMXSB: false + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + tailLoopOptMXSA: false + tailLoopOptMXSB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: false 
+ ActivationFused: true + AdaptiveGemm: 0 + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1DivByMT1LowbitGT1: 0 + AssertFree1ElementMultiple: 1 + AssertKRingShiftTailWrapOnly: 0 + AssertSummationElementMultiple: 256 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BAddrInterleave: false + BaseName: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT9680-vsketBKOwATFxdutsK5i_NcnvLOonACpwOS25BUY= + BufferLoad: true + BufferStore: 1 + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 256 + DirectToLds: 1 + DirectToLdsA: true + DirectToLdsB: true + DirectToLdsMXSA: true + DirectToLdsMXSB: true + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprMXSA: false + DirectToVgprMXSB: false + DirectToVgprSparseMetadata: false + DtlPlusLdsBuf: 0 + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: false + ExpertSchedulingMode: 0 + ExtraLatencyForLR: 0 + ExtraMiLatencyLeft: -1 + ForceDisableShadowInit: false + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 32 + GlobalReadVectorWidthB: 32 + GlobalReadVectorWidthMXSA: 3 + GlobalReadVectorWidthMXSB: 12 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + KRingShift: false + Kernel: true + KernelLanguage: Assembly + KernelNameMin: 
Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT96x384x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT6_6_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_16_1 + LDSTrInst: false + LSCA: 256 + LSCB: 256 + LSCMXSA: 8 + LSCMXSB: 8 + LSPA: 32 + LSPB: 32 + LSPMXSA: 96 + LSPMXSB: 384 + LVCA: 8 + LVCB: 8 + LVCMXSA: 3 + LVCMXSB: 1 + LVPA: 1 + LVPB: 1 + LVPMXSA: 32 + LVPMXSB: 32 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsBlockSizePerPadMXSA: 0 + LdsBlockSizePerPadMXSB: 0 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 130816 + LdsInitCVgprs: false + LdsNumBytes: 130816 + LdsNumElementsAlignedA: 12288 + LdsNumElementsAlignedB: 49152 + LdsNumElementsAlignedMXSA: 768 + LdsNumElementsAlignedMXSB: 3072 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 16128 + LdsOffsetB_Blk: 81664 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMXSA: 12288 + LdsOffsetMXSA_Blk: 77824 + LdsOffsetMXSB: 13056 + LdsOffsetMXSB_Blk: 78592 + LdsOffsetMetadata: 16128 + LdsOffsetMetadata_Blk: 81664 + LdsPadA: 0 + LdsPadB: 0 + LdsPadMXSA: 0 + LdsPadMXSB: 0 + LdsPadMetadata: 0 + LocalReadVectorWidth: -1 + LocalReadVectorWidthA: 32 + LocalReadVectorWidthB: 32 + LocalReadVectorWidthMXSA: 1 + LocalReadVectorWidthMXSB: 1 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LocalWriteUseSgprMXSA: true + LocalWriteUseSgprMXSB: true + LoopIters: 2 + LoopUnroll: 256 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + 
MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMXSA: 1 + MIInputPerThreadMXSB: 1 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [1, 4] + MIWaveTile: [6, 6] + MIWaveTileA: 6 + MIWaveTileB: 6 + MIWaveTileMXSA: 6 + MIWaveTileMXSB: 6 + MIWaveTileMetadata: 0 + MacroTile0: 96 + MacroTile1: 384 + MacroTileA: 96 + MacroTileB: 384 + MacroTileMXSA: 96 + MacroTileMXSB: 384 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + MinGRIncPerMfma: 3 + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonDTLTailLoopMXSA: true + NonDTLTailLoopMXSB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 4 + NonTemporalE: 0 + NonTemporalMXSA: 0 + NonTemporalMXSB: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 144 + NumGlobalWriteVectorsPerThread: 144 + NumLdsBlk: 2 + NumLoadsA: 3 + NumLoadsB: 12 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsCoalescedMXSA: 1 + NumLoadsCoalescedMXSB: 1 + NumLoadsMXSA: 1 + NumLoadsMXSB: 1 + NumLoadsPerpendicularA: 3 + NumLoadsPerpendicularB: 12 + NumLoadsPerpendicularMXSA: 1 + NumLoadsPerpendicularMXSB: 1 + NumThreads: 256 + NumTotalPackedLoadsA: -1 + NumTotalPackedLoadsB: -1 + NumTotalPackedLoadsMXSA: -1 + NumTotalPackedLoadsMXSB: -1 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGROverBarrier: 0 + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 51 + 
SolutionNameMin: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_BiasSB_HAS_SAB_SAV_UserArgs_MT96x384x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT6_6_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM16_WGMXCC2_WGMXCCGn1 + SourceSwap: false + SpaceFillingAlgo: [] + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 4 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 4 + SubGroup1: 64 + SubGroupA: 4 + SubGroupB: 64 + SubGroupMXSA: 4 + SubGroupMXSB: 64 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: 0 + TailloopInNll: false + ThreadTile: [1, 1] + ThreadTile0: 24 + ThreadTile1: 6 + ThreadTileA: 24 + ThreadTileB: 6 + ThreadTileMXSA: 24 + ThreadTileMXSB: 6 + TransposeLDS: 1 + TransposeLDSMetadata: -1 + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMXSA: true + UnrollMajorLDSMXSB: true + Use64bShadowLimit: false + Use64bShadowLimitMX: false + UseCustomMainLoopSchedule: 0 + UseDirect32XEmulation: false + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: false + UseGeneralizedNLCOneA: false + UseGeneralizedNLCOneB: false + UseGeneralizedNLCOneMXSA: false + UseGeneralizedNLCOneMXSB: false + UseGeneralizedNLCOneMetadata: false + UseInstOffsetForGRO: 0 + UseMFMAF32XEmulation: false + UsePLRPack: 0 + UseSgprForGRO: -1 + UseSubtileImpl: true + Valid: true + VectorStore: -1 + VectorWidthA: 1 + 
VectorWidthB: 1 + VectorWidthMXSA: 1 + VectorWidthMXSB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMXSA: 0 + WaveSeparateGlobalReadMXSB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 16 + WorkGroupMappingXCC: 2 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 256 + _DepthUA: 256 + _DepthUB: 256 + _DepthUMXSA: 8 + _DepthUMXSB: 8 + _DepthUMetadata: 256 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + enableLDSTrMXSA: false + enableLDSTrMXSB: false + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + tailLoopOptMXSA: false + tailLoopOptMXSB: false +- null +- null +- null +- null +- DeviceEfficiency +- Prediction diff --git a/projects/hipblaslt/library/src/amd_detail/rocblaslt/src/tensile_host.cpp b/projects/hipblaslt/library/src/amd_detail/rocblaslt/src/tensile_host.cpp index 8925ecdc769..270d48197c4 100644 --- a/projects/hipblaslt/library/src/amd_detail/rocblaslt/src/tensile_host.cpp +++ b/projects/hipblaslt/library/src/amd_detail/rocblaslt/src/tensile_host.cpp @@ -1833,6 +1833,13 @@ namespace tensileProblem.setSwizzleTensorA(prob.swizzleA); tensileProblem.setSwizzleTensorB(prob.swizzleB); + if(prob.scaleAType == RocblasltContractionProblem::ScalingFormat::Block_32_UE8M0 or + prob.scaleAType == RocblasltContractionProblem::ScalingFormat::Block_32_UE8M0_32_8_EXT) + tensileProblem.setMXScaleA(rocisa::DataType::E8, 32); + if(prob.scaleBType == RocblasltContractionProblem::ScalingFormat::Block_32_UE8M0 or + prob.scaleBType == RocblasltContractionProblem::ScalingFormat::Block_32_UE8M0_32_8_EXT) + tensileProblem.setMXScaleB(rocisa::DataType::E8, 
32); + return tensileProblem; } @@ -2837,6 +2844,17 @@ void initTensileGemmData(rocblaslt_handle handle, #ifdef HIPBLASLT_USE_ROCROLLER bool useRocRoller(rocblaslt_handle handle, const RocblasltContractionProblem& prob) { + // Do not use rocRoller for FP4 A + FP4 B with pre-swizzled (shuffled) scale layout + bool isFp4A = (prob.a_type == static_cast(HIP_R_4F_E2M1)); + bool isFp4B = (prob.b_type == static_cast(HIP_R_4F_E2M1)); + bool isShuffledScale + = (prob.scaleAType + == RocblasltContractionProblem::ScalingFormat::Block_32_UE8M0_32_8_EXT + && prob.scaleBType + == RocblasltContractionProblem::ScalingFormat::Block_32_UE8M0_32_8_EXT); + if(isFp4A && isFp4B && isShuffledScale) + return false; + return handle->useRocRoller == 1 || (handle->useRocRoller == -1 && (isBlockScaling(prob.scaleAType) || isBlockScaling(prob.scaleBType))); diff --git a/projects/hipblaslt/tensilelite/Tensile/AsmStoreState.py b/projects/hipblaslt/tensilelite/Tensile/AsmStoreState.py index f6b6b783d3a..545b9f45d3e 100644 --- a/projects/hipblaslt/tensilelite/Tensile/AsmStoreState.py +++ b/projects/hipblaslt/tensilelite/Tensile/AsmStoreState.py @@ -434,7 +434,9 @@ def getStoreElementsInfoForBatch(self, kernel, batchElements): coordOffset1 = eIdx1 * (self.kernel["WavefrontSize"] // matrixInstN) * MFMAContinuousOutputs coordOffset1 += bIdx1 * matrixInstN - coordOffset1 += wtIdex * matrixInstN * matrixInstBN * kernel["MIWaveGroup"][1] + # Subtile kernels: successive wave tiles step by MIBShape1 (not MIBShape1 * MIWaveGroup[1]). 
+ wtStep1 = matrixInstN * matrixInstBN if kernel.get("UseSubtileImpl") else matrixInstN * matrixInstBN * kernel["MIWaveGroup"][1] + coordOffset1 += wtIdex * wtStep1 coordOffset1 = coordOffset1 * vectorWidth + vc1 else: # mac instruction if kernel["LocalSplitU"] > 1: @@ -462,7 +464,10 @@ def getStoreElementsInfoForBatch(self, kernel, batchElements): coordOffset0 = eIdx0 * (self.kernel["WavefrontSize"] // matrixInstM) * MFMAContinuousOutputs coordOffset0 += bIdx0 * matrixInstM - coordOffset0 += wtIdex * matrixInstM * matrixInstBM * kernel["MIWaveGroup"][0] + # Subtile kernels: each wave owns a contiguous block of rows, so successive + # wave tiles step by MIBShape0 (not MIBShape0 * MIWaveGroup[0]). + wtStep = matrixInstM * matrixInstBM if kernel.get("UseSubtileImpl") else matrixInstM * matrixInstBM * kernel["MIWaveGroup"][0] + coordOffset0 += wtIdex * wtStep coordOffset0 = coordOffset0 * vectorWidth + vc0 else: # mac instruction coordOffset0 = d0 * kernel["SubGroup0"]*kernel["VectorWidthA"] + vc0 @@ -828,6 +833,8 @@ def setupStoreElementsForBatch(self, kernel, gwvw, batchElements, batchElementSg if kernel["EnableMatrixInstruction"]: alignment = self.cfg.numVgprPerValuC * self.cfg.gwvw + #print(self.cfg.numVgprPerValuC, self.cfg.gwvw) + #exit(1) sumIdx = kw.vgprPool.checkOutAligned(self.cfg.numVgprPerValuC*self.cfg.gwvw, alignment, "vgprValuC") // self.cfg.numVgprPerValuC else: sumIdx = kw.states.c.startVgprValu + vc0 + d0*kernel["VectorWidthA"] + vc1*kernel["ThreadTile0"] + d1*kernel["VectorWidthA"]*kernel["ThreadTile0"] diff --git a/projects/hipblaslt/tensilelite/Tensile/ClientWriter.py b/projects/hipblaslt/tensilelite/Tensile/ClientWriter.py index 48a151f19d6..c52863d3a0b 100644 --- a/projects/hipblaslt/tensilelite/Tensile/ClientWriter.py +++ b/projects/hipblaslt/tensilelite/Tensile/ClientWriter.py @@ -215,7 +215,7 @@ def runNewClient(scriptPath, clientParametersPath, cxxCompiler: str, cCompiler: # Add MX scale format if set if globalParameters["MXScaleFormat"]: - 
args.append("--mx-scale-format={}".format(globalParameters["MXScaleFormat"])) + args.extend(["--mx-scale-format", str(globalParameters["MXScaleFormat"])]) try: subprocess.run(args, check=True) @@ -332,7 +332,7 @@ def writeRunScript(path, forBenchmark, enableTileSelection, cxxCompiler: str, cC clientExe = getClientExecutablePath() timingFlag = " --timing-instrumentation" if globalParameters["TimingInstrumentation"] else "" - mxScaleFormatFlag = " --mx-scale-format={}".format(globalParameters["MXScaleFormat"]) if globalParameters["MXScaleFormat"] else "" + mxScaleFormatFlag = " --mx-scale-format {}".format(globalParameters["MXScaleFormat"]) if globalParameters["MXScaleFormat"] else "" for configFile in configPaths: runScriptFile.write("{} --config-file {}{}{}\n".format(clientExe, configFile, timingFlag, mxScaleFormatFlag)) runScriptFile.write("ERR2=$?\n\n") @@ -356,7 +356,7 @@ def writeRunScript(path, forBenchmark, enableTileSelection, cxxCompiler: str, cC runScriptFile.write("%s -d 0 --resetclocks\n" % globalParameters["ROCmSMIPath"]) runScriptFile.write("%s -d 0 --setfan 50\n" % globalParameters["ROCmSMIPath"]) else: - mxScaleFormatFlag = " --mx-scale-format={}".format(globalParameters["MXScaleFormat"]) if globalParameters["MXScaleFormat"] else "" + mxScaleFormatFlag = " --mx-scale-format {}".format(globalParameters["MXScaleFormat"]) if globalParameters["MXScaleFormat"] else "" for configFile in configPaths: runScriptFile.write("{} --config-file {} --best-solution 1{}\n".format(getClientExecutablePath(), configFile, mxScaleFormatFlag)) diff --git a/projects/hipblaslt/tensilelite/Tensile/Common/GlobalParameters.py b/projects/hipblaslt/tensilelite/Tensile/Common/GlobalParameters.py index 3844831d904..63e63f1e2a9 100644 --- a/projects/hipblaslt/tensilelite/Tensile/Common/GlobalParameters.py +++ b/projects/hipblaslt/tensilelite/Tensile/Common/GlobalParameters.py @@ -137,7 +137,7 @@ ) globalParameters["LogicFormat"] = "yaml" # set library backend (yaml, or json) 
globalParameters["LibraryFormat"] = "yaml" # set library backend (yaml, or msgpack) -globalParameters["MXScaleFormat"] = 0 # MX scale data format (0=none, 1=pre-swizzle for GPU kernel layout) +globalParameters["MXScaleFormat"] = 0 # MX scale data format (0=none, 1=pre-swizzle for GPU kernel layout). Only the gfx950 subtile MX kernels need the pre-swizzle; gfx1250 reads canonical scales. The two gfx950 yamls that need it set MXScaleFormat: 1 explicitly. # True/False: CSV will/won't export WinnerGFlops, WinnerTimeUS, WinnerIdx, WinnerName. # TODO - if no side-effect, we can set default to True. This can make analyzing "LibraryLogic" (AddFromCSV) faster @@ -431,6 +431,7 @@ {"BAddrInterleave": [False]}, {"KRingShift": [False]}, {"DirectToLds": [0]}, + {"UseSubtileImpl": [False]}, {"UseSgprForGRO": [-1]}, {"UseInstOffsetForGRO": [0]}, {"AssertSummationElementMultiple": [1]}, diff --git a/projects/hipblaslt/tensilelite/Tensile/Common/RequiredParameters.py b/projects/hipblaslt/tensilelite/Tensile/Common/RequiredParameters.py index b73211524c7..645f54474b6 100644 --- a/projects/hipblaslt/tensilelite/Tensile/Common/RequiredParameters.py +++ b/projects/hipblaslt/tensilelite/Tensile/Common/RequiredParameters.py @@ -114,5 +114,6 @@ def getRequiredParametersMin() -> set: 'WorkGroup', 'DtlPlusLdsBuf', 'MinGRIncPerMfma', - 'UsePLRPack' + 'UsePLRPack', + 'UseSubtileImpl' }) diff --git a/projects/hipblaslt/tensilelite/Tensile/Common/ValidParameters.py b/projects/hipblaslt/tensilelite/Tensile/Common/ValidParameters.py index a65649ca2b9..692bf835fd6 100644 --- a/projects/hipblaslt/tensilelite/Tensile/Common/ValidParameters.py +++ b/projects/hipblaslt/tensilelite/Tensile/Common/ValidParameters.py @@ -392,6 +392,10 @@ def makeValidMatrixInstructions(): # 2: DirectToLds A only (no DTLB) # 3: DirectToLds B only (no DTLA) "DirectToLds": [0, 1, 2, 3], + # Enable subtile-based kernel implementation for MX FP4 (gfx950 only). 
+ # When True, uses a subtile scheduling strategy with DTL global reads and + # an optimized storeD path. Automatically forced False on non-gfx950. + "UseSubtileImpl": [False, True], # Load options: # (GRO = Global Read Offset) # BufferLoad=0: diff --git a/projects/hipblaslt/tensilelite/Tensile/Components/ComputeStoreVgprs.py b/projects/hipblaslt/tensilelite/Tensile/Components/ComputeStoreVgprs.py index b19fa247a5d..ffd72a84144 100644 --- a/projects/hipblaslt/tensilelite/Tensile/Components/ComputeStoreVgprs.py +++ b/projects/hipblaslt/tensilelite/Tensile/Components/ComputeStoreVgprs.py @@ -188,7 +188,9 @@ def __call__(self, writer, kernel): module.add(vectorStaticDivide(tmpVgpr0, wave_id, kernel["MIWaveGroup"][0], tmpVgpr1Res)) if kernel["LocalSplitU"] > 1: module.add(vectorStaticRemainder(dummy, tmpVgpr0, tmpVgpr0, kernel["MIWaveGroup"][1], tmpVgpr1Res, tmpSgprInfo)) - module.add(VMulLOU32(dst=vgpr(tid1), src0=hex(MIBShape1), src1=vgpr(tmpVgpr0), comment="wave coordination offset 1")) + # Subtile kernels: each wave owns a contiguous block of MIWaveTile[1]*MIBShape1 cols. + waveBlockCols = MIBShape1 * kernel["MIWaveTile"][1] if kernel.get("UseSubtileImpl") else MIBShape1 + module.add(vectorStaticMultiply(vgpr(tid1), vgpr(tmpVgpr0), waveBlockCols, tmpSgprInfo, "wave coordination offset 1")) # coord 1 : thread part module.add(vectorStaticRemainder(dummy, tmpVgpr0, "Serial", matrixInstN, tmpVgpr1Res, tmpSgprInfo)) @@ -209,7 +211,10 @@ def __call__(self, writer, kernel): # coord 0 : wave part module.add(vectorStaticRemainder(dummy, tmpVgpr0, wave_id, kernel["MIWaveGroup"][0], tmpVgpr1Res, tmpSgprInfo)) - module.add(VMulLOU32(dst=vgpr(tmpVgpr0), src0=hex(MIBShape0), src1=vgpr(tmpVgpr0), comment="wave coordination offset 0")) + # Subtile kernels: each wave owns a contiguous block of MIWaveTile[0]*MIBShape0 rows. + # wave_id0 * MIWaveTile[0] * MIBShape0 gives the start row of wave's block. 
+ waveBlockRows = MIBShape0 * kernel["MIWaveTile"][0] if kernel.get("UseSubtileImpl") else MIBShape0 + module.add(vectorStaticMultiply(vgpr(tmpVgpr0), vgpr(tmpVgpr0), waveBlockRows, tmpSgprInfo, "wave coordination offset 0")) # coord 0 : thread part module.add(vectorStaticRemainder(dummy, tid0, "Serial", writer.states.kernel["WavefrontSize"], tmpVgpr1Res, tmpSgprInfo)) diff --git a/projects/hipblaslt/tensilelite/Tensile/Components/GlobalWriteBatch.py b/projects/hipblaslt/tensilelite/Tensile/Components/GlobalWriteBatch.py index 0b1a9edceee..ba51a80d989 100644 --- a/projects/hipblaslt/tensilelite/Tensile/Components/GlobalWriteBatch.py +++ b/projects/hipblaslt/tensilelite/Tensile/Components/GlobalWriteBatch.py @@ -20,22 +20,24 @@ # CTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. ################################################################################ -from rocisa.code import Label, Module, RegSet +from rocisa.code import Label, Module, RegSet, TextBlock from rocisa.container import SMEMModifiers, VOP3PModifiers, MUBUFModifiers, \ - SDWAModifiers, replaceHolder, EXEC, VCC, vgpr, sgpr, ContinuousRegister + SDWAModifiers, replaceHolder, EXEC, EXECLO, EXECHI, VCC, vgpr, sgpr, ContinuousRegister from rocisa.enum import CvtType, HighBitSel, RoundType, SaturateCastType, SelectBit from rocisa.instruction import BufferAtomicAddF32, BufferAtomicCmpswapB32, \ - BufferAtomicCmpswapB64, FlatAtomicCmpswapB32, SAddCU32, SAddU32, SAndB32, \ + BufferAtomicCmpswapB64, BufferStoreB16, BufferStoreB32, BufferStoreB64, BufferStoreB128, DSBPermuteB32, FlatAtomicCmpswapB32, \ + SAddCU32, SAddU32, SAndB32, \ SAndB64, SAtomicDec, SBarrier, SBranch, SCBranchExecNZ, SCBranchExecZ, \ - SCBranchSCC1, SCSelectB32, SCmpEQI32, SCmpEQU32, SCmpGtI32, SCmpLeI32, \ + SCBranchSCC0, SCBranchSCC1, SCmpKGtU32, SCSelectB32, SCmpEQI32, SCmpEQU32, SCmpGtI32, SCmpLeI32, \ SLShiftLeftB32, SLShiftLeftB64, SLShiftRightB32, SMovB32, SMovB64, SMulI32, \ SNop, SOrB32, SOrB64, 
SOrSaveExecB32, SOrSaveExecB64, SSleep, SSubI32, SSubU32, \ SSwapPCB64, SWaitCnt, SWaitAlu, VAShiftRightI32, VAddCCOU32, VAddCOU32, VAddF32, VAddF64, \ VAddI32, VAddPKF16, VAddPKF32, VAddU32, VBfeI32, VCmpEQU32, VCmpGEI32, VCmpGtU32, \ - VCmpNeU32, VCmpNeU64, VCndMaskB32, VCvtBF8toF32, VCvtF32toI32, \ - VCvtFP8toF32, VCvtI32toF32, VCvtPkBF8toF32, VCvtPkFP8toF32, VFmaF64, VFmaMixF32, \ + VCmpNeU32, VCmpNeU64, VCndMaskB32, VCvtBF8toF32, VCvtF16toF32, VCvtF32toF16, VCvtF32toI32, \ + VCvtFP8toF32, VCvtI32toF32, VCvtPkBF8toF32, VCvtPkF32toBF16, VCvtPkF32toFP16, VCvtPkFP8toF32, \ + VFmaF64, VFmaMixF32, VAndB32, VLShiftLeftB32, VPermlane16SwapB32, VPermlane32SwapB32, \ VLShiftRightB32, VMacF32, VMadMixF32, VMaxF32, VMovB32, VMovB64, VMulF32, VMulF64, \ - VMulLOU32, VMulPKF16, VMulPKF32, VPackF16toB32, VReadfirstlaneB32, VRndneF32, VCvtBF16toFP32 + VMulLOU32, VMulPKF16, VMulPKF32, VPackF16toB32, VReadfirstlaneB32, VRndneF32, VCvtBF16toFP32, VSubU32 from rocisa.functions import vectorStaticMultiply from ..Common import DataDirection, SemanticVersion @@ -49,7 +51,7 @@ from ..Components.PackData import formatting, PackData_F16, PackData_BF16, PackData_FLOAT8, PackData_FLOAT8_fnuz from rocisa.instruction import ECvtF16toF32, ECvtPkFP8toF32, ECvtPkBF8toF32 -from math import ceil +from math import ceil, log2 class GlobalWriteBatchComponent(GlobalWriteComponents): kernel = {"ProblemType": {"OperationType": "GEMM" }} @@ -110,6 +112,15 @@ def __init__(self, kernel: Solution, tPA, tPB, activation: ActivationModule, ss: self.factorDim = factorDim self.amdClangVersion = amdClangVersion + # Stateful tracking for N-group OOB guard deduplication (_emitSubtileOobGuard). + # The outer loop iterates N-outer / M-inner, so all M elements within a fixed N + # group share the same N guard result. We emit the N s_cmp/s_cbranch only once + # per N group and skip it for subsequent M elements in the same group. 
+ self._subtilePrevBlockIdxN = -1 # sentinel: no group seen yet + self._subtileNGroupSkipLabel = None # end-of-N-group label (M cbranch target) + self._subtileAllStoresEndLabel = None # end-of-all-stores label (N cbranch target) + self._subtileCloadPrevD1 = -1 # sentinel: last d1 group seen in C load guard + # Internal state for GlobalWriteBatch # 0 for None, 1 for WorkGroupReduction = False, 2 for WorkGroupReduction = True self.storeBiasD = 0 @@ -169,6 +180,13 @@ def emit(self) -> Module: module = Module(self.moduleName) self._prolog(module) self._emitAdd(module) + # UseSubtileImpl with bias/SAV: drain LDS reads and sync waves after alpha + # multiply to prevent cross-wave LDS corruption from ds_bpermute. + if self.kernel.get("UseSubtileImpl") and \ + (self.parentWriter.states.useBias != DataDirection.NONE or \ + self.kernel["ProblemType"].get("UseScaleAlphaVec", 0)): + module.add(SWaitCnt(dscnt=0, comment="drain bias/SAV LDS reads")) + module.add(SBarrier("sync waves before subtile paired stores")) self._epilog(module) return module @@ -190,7 +208,12 @@ def globalStoreWait(self, elementIdx, waitCnter, vlcntTotalIssued, dscntTotalIss waitLoadCnt += self.eLoadIssued[elementIdx] waitLoadCntStrList.append("%d (load E)"%self.eLoadIssued[elementIdx]) # Calculate local loads - if self.parentWriter.states.useBias == DataDirection.READ: + # UseSubtileImpl with bias/SAV: skip bias/SAV LDS loads from interleaved + # waitcnt and rely on the batch-start barrier for LDS synchronization. 
+ subtileBarrierDrains = self.kernel.get("UseSubtileImpl") and \ + (self.parentWriter.states.useBias != DataDirection.NONE or \ + self.kernel["ProblemType"].get("UseScaleAlphaVec", 0)) + if self.parentWriter.states.useBias == DataDirection.READ and not subtileBarrierDrains: waitLocalLoadCnt += self.biasLoadIssued[elementIdx] waitLocalLoadCntStrList.append("%d (bias)"%self.biasLoadIssued[elementIdx]) if (self.kernel["ProblemType"]["UseScaleAB"] == "Vector") and isSingleKernel: @@ -198,7 +221,8 @@ def globalStoreWait(self, elementIdx, waitCnter, vlcntTotalIssued, dscntTotalIss waitLocalLoadCntStrList.append("%d (scaleAVec)"%self.scaleAVecLoadIssued[elementIdx]) waitLocalLoadCnt += self.scaleBVecLoadIssued[elementIdx] waitLocalLoadCntStrList.append("%d (scaleBVec)"%self.scaleBVecLoadIssued[elementIdx]) - if self.kernel["ProblemType"]["UseScaleAlphaVec"] and isSingleKernel: + # Skip scaleAlphaVec when subtileBarrierDrains + if self.kernel["ProblemType"]["UseScaleAlphaVec"] and isSingleKernel and not subtileBarrierDrains: waitLocalLoadCnt += self.scaleAlphaVecLoadIssued[elementIdx] waitLocalLoadCntStrList.append("%d (scaleAlphaVec)"%self.scaleAlphaVecLoadIssued[elementIdx]) # Get vlcnt and dscnt @@ -372,6 +396,7 @@ def _prolog(self, module: Module): module.add(VCvtI32toF32(dst=vgpr(srcRegName), src=vgpr(srcRegName), comment="Convert MI out reg to fp32")) module.add(rh) + loadInputCode = Module("loadInputCode") self.betaLoadIssued = [] @@ -424,6 +449,37 @@ def _prolog(self, module: Module): if self.beta: module.add(addrCalc.emitLdChange(self.kernel, self.ss, 'C', self.edge, self.beta, mask, bufferOOB, (elementIdx == 0), self.tmpVgpr, self.tmpSgpr, addrCVgpr, self.addrC, 0)) if dataBeta not in loadedDataBeta: + # In the UseSubtileImpl NonEdge path the workgroup-level edge check is relaxed + # (subtile-aligned remainder is allowed into NonEdge), so individual waves may + # own rows/columns beyond the valid output region. 
Gate each C load by writing + # SrdC+2 (num_records): BufferOOB → normal load, 0 → hardware returns zero. + # + # element loop is N(d1)-outer / M(d0)-inner. + # d1 (N) check: emitted once per d1 group — sets SrdC+2 = BufferOOB if N valid, else 0. + # d0 (M) check: emitted per element — overwrites SrdC+2 = SrdC+2 if M valid, else 0. + # (AND semantics: SrdC+2 = BufferOOB only when both M and N are valid.) + # d0 is monotone within each d1 group: once OOB, remaining d0s are also OOB. + mGuardSgpr = self.parentWriter.states.subtileM32ValidBlocksSgpr + nGuardSgpr = self.parentWriter.states.subtileN16ValidBlocksSgpr + if not self.edge and (mGuardSgpr is not None or nGuardSgpr is not None): + d1, d0 = element[0], element[1] + # N guard: emit once per d1 group. + if nGuardSgpr is not None and d1 != self._subtileCloadPrevD1: + module.add(SCmpKGtU32(src=sgpr("SubtileNGuard"), simm16=d1, + comment="subtile C load: numNBlocks > d1=%d?" % d1)) + module.add(SCSelectB32(dst=sgpr("SrdC+2"), src0="BufferOOB", src1=0, + comment="SrdC+2 = BufferOOB if N valid, else 0")) + self._subtileCloadPrevD1 = d1 + # M guard: emit per element, AND into SrdC+2. + if mGuardSgpr is not None: + module.add(SCmpKGtU32(src=sgpr("SubtileMGuard"), simm16=d0, + comment="subtile C load: numMBlocks > d0=%d?" 
% d0)) + if nGuardSgpr is not None: + module.add(SCSelectB32(dst=sgpr("SrdC+2"), src0=sgpr("SrdC+2"), src1=0, + comment="SrdC+2 = SrdC+2 if M valid, else 0 (AND with N result)")) + else: + module.add(SCSelectB32(dst=sgpr("SrdC+2"), src0="BufferOOB", src1=0, + comment="SrdC+2 = BufferOOB if M valid, else 0")) if self.kernel["GroupLoadStore"]: loadInputCode.add(self.parentWriter.readInput(self.kernel, self.ss, 'C', self.kernel["ProblemType"]["DestDataType"], addrCalc, vc0, data, self.gwvw, addrCVgpr, self.tmpS01)) else: @@ -581,6 +637,14 @@ def addEpilogueLoad(modGwvw, ldName: str, addrVecVgpr, addrVec, dataVec, loadedD module.add(loadInputCode) + # Restore SrdC+2 = BufferOOB after subtile NonEdge C-load OOB gating (which may have set it to 0). + if self.beta and not self.edge: + mGuardSgpr = self.parentWriter.states.subtileM32ValidBlocksSgpr + nGuardSgpr = self.parentWriter.states.subtileN16ValidBlocksSgpr + if mGuardSgpr is not None or nGuardSgpr is not None: + module.add(SMovB32(dst=sgpr("SrdC+2"), src="BufferOOB", + comment="restore SrdC+2 after subtile NonEdge C-load OOB gating")) + if self.beta and self.kernel["StoreSyncOpt"]: self._storeSyncOpt(module) @@ -588,6 +652,7 @@ def addEpilogueLoad(modGwvw, ldName: str, addrVecVgpr, addrVec, dataVec, loadedD # AccVgpr read if self.codeAccVgprRead is not None and (self.kernel["LocalSplitU"] == 1 or self.kernel["_GlobalAccumulation"] == "MultipleBufferSingleKernel"): regsPerScalar = self.parentWriter.states.bpeCinternal // self.parentWriter.states.bpr # register per scalar + #TODOBS: Need to change this, for LSU>1 + subtile impl case if self.kernel["MIArchVgpr"] and self.kernel["LocalSplitU"] > 1: tmpStartVgprValuC = self.parentWriter.states.c.startVgprValu self.parentWriter.states.c.startVgprValu = 0 @@ -811,7 +876,59 @@ def _emitNonatomicAdd(self, module: Module): module.add(VMovB32(vgpr(self.cvtVgprStruct.vgprBf16Mask), "0xffff0000", comment="mask for pack two bfloat16 element to 32bit" )) 
module.add(VMovB32(vgpr(self.cvtVgprStruct.vgprFp32Nan), "0x7fff0000", comment="fp32 Nan" )) module.add(VMovB32(vgpr(self.cvtVgprStruct.vgprBf16Inc), "0x7fff", comment="rounding bias for bfloat16" )) - elif self.kernel["ProblemType"]["DestDataType"].isFloat8_fnuz() and self.kernel["ProblemType"]["HighPrecisionAccumulate"]: + # is16bitSubtile: controls partner-lane address setup for dwordx4 paired-subtile stores. + # Must match is16bitSubtilePaired (per-element store dispatch) exactly. + # Excluded for "MultipleBufferSingleKernel" and "MultipleBuffer" (StreamK partial-tile + # workspace path) — both write float32 to workspace, not 16bit to D output. + isSubtileNonEdge = ( + self.kernel.get("UseSubtileImpl") and not self.edge + and self.kernel["_GlobalAccumulation"] not in ("MultipleBufferSingleKernel", "MultipleBuffer") + ) + is16bitSubtile = ( + isSubtileNonEdge + and (self.kernel["ProblemType"]["DestDataType"].isBFloat16() or + self.kernel["ProblemType"]["DestDataType"].isHalf()) + and self.kernel["ProblemType"]["HighPrecisionAccumulate"] + ) + if is16bitSubtile: + assert self.kernel["BufferStore"], \ + "UseSubtileImpl 16bit optimized store requires BufferStore=1" + # Compute ds_permute partner-lane address for 16bit dwordx4 paired-subtile stores. + # vtmp1 = lane_id, vtmp2 = partner_lane_id (for ds_permute forward scatter) + # After v_permlane32_swap + v_permlane16_swap + exec masking: + # each lane ends up with the lane_id of the partner that will scatter data to it. 
+ # vPermAddr = partner_lane_id * 4 (byte address for ds_permute_b32) + vPermAddr = self.cvtVgprStruct.vgprPermAddr + vTmp = self.cvtVgprStruct.vgprBf16Temp # reuse scratch temp before it's used for mask init + module.addComment1("16bit dwordx4 UseSubtileImpl: compute ds_permute partner-lane address") + module.add(VAndB32(dst=vgpr(vTmp), src0=self.kernel["WavefrontSize"]-1, src1=vgpr("Serial"), comment="lane_id & (WS-1)")) + module.add(VAndB32(dst=vgpr(vPermAddr), src0=self.kernel["WavefrontSize"]-1, src1=vgpr("Serial"), comment="copy of lane_id")) + module.add(VPermlane32SwapB32(dst=vgpr(vTmp), src=vgpr(vTmp), comment="lane XOR 32 swap")) + module.add(SNop(waitState=0, comment="delay after v_permlane32_swap")) + module.add(VPermlane16SwapB32(dst=vgpr(vTmp), src=vgpr(vTmp), comment="lane XOR 16 swap")) + # Exec mask: lanes where both XOR swaps changed the value (i.e., the 'first' half of each pair) + # selects lanes 0-15 and 32-47 within the wave. + stmp = self.parentWriter.sgprPool.checkOutAligned(2,2) + module.add(SMovB32(dst=sgpr(stmp), src="0x0000ffff", comment="select lanes 0-15, 32-47")) + module.add(SMovB32(dst=sgpr(stmp+1), src="0xffff0000")) + module.add(VCndMaskB32(dst=vgpr(vTmp), src0=vgpr(vTmp), src1=vgpr(vPermAddr), src2=sgpr(stmp,2), comment="restore original lane_id for selected lanes")) + self.parentWriter.sgprPool.checkIn(stmp) + module.add(VLShiftLeftB32(dst=vgpr(vPermAddr), shiftHex=2, src=vgpr(vTmp), comment="partner_lane * 4 = ds_permute byte addr")) + # Pre-compute lane_group*8 once; reused as the row-byte address correction in every + # paired dwordx4 store (addrDVgpr encodes lane_group*8 but we need lane_group*16). 
+ vLGDelta = self.cvtVgprStruct.vgprLaneGroupDelta + module.addComment1("16bit dwordx4: pre-compute lane_group*8 row-byte correction") + module.add(VAndB32(dst=vgpr(vLGDelta), src0=self.kernel["WavefrontSize"]-1, src1=vgpr("Serial"), + comment="lane_id = Serial & (WS-1)")) + module.add(VLShiftRightB32(dst=vgpr(vLGDelta), shiftHex=4, src=vgpr(vLGDelta), + comment="lane_group = lane_id >> 4")) + module.add(VLShiftLeftB32(dst=vgpr(vLGDelta), shiftHex=3, src=vgpr(vLGDelta), + comment="vgprLaneGroupDelta = lane_group * 8")) + # Compute bpe scale shift once (compile-time constant); used inside + # _emit16bitSubtilePairedStore to adjust addrDVgpr inline without + # modifying it in place, so no restore loop is needed after the stores. + elif self.kernel["_GlobalAccumulation"] != 'MultipleBuffer': + if self.kernel["ProblemType"]["DestDataType"].isFloat8_fnuz() and self.kernel["ProblemType"]["HighPrecisionAccumulate"]: module.add(VMovB32(vgpr(self.cvtVgprStruct.vgprFp8NanInf), "0x207", comment="Nan and +/- inf" )) module.add(VMovB32(vgpr(self.cvtVgprStruct.vgprFp8Max), "0x43700000", comment="Fp8 Max value 240 as float32" )) module.add(VMovB32(vgpr(self.cvtVgprStruct.vgprFp8Min), "0xc3700000", comment="Fp8 Min value -240 as float32" )) @@ -1129,10 +1246,14 @@ def applyScaleVec(vecModule, addressStr, dataScaleVec, factorDim, isGlobal=True) else: destIdx = self.ss.elementSumIdx[elementIdx] if self.kernel["ProblemType"]["DestDataType"].isHalf(): - packModule = self.packdata(self.gwvw, destIdx, self.ss.elementSumIdx[elementIdx], inputPrefix="ValuC+", prefixOffset=self.parentWriter.states.c.startVgprValu) + # For UseSubtileImpl non-edge: paired dwordx4 path handles packing in _emit16bitSubtilePairedStore. 
+ if not is16bitSubtile: + packModule = self.packdata(self.gwvw, destIdx, self.ss.elementSumIdx[elementIdx], inputPrefix="ValuC+", prefixOffset=self.parentWriter.states.c.startVgprValu) elif self.kernel["ProblemType"]["DestDataType"].isBFloat16(): - packModule = self.packdata(self.gwvw, destIdx, self.ss.elementSumIdx[elementIdx], bf16CVTVgprStruct=self.cvtVgprStruct, - tmpS01=self.tmpS01, laneSGPRC=self.laneSGPRC, inputPrefix="ValuC+", prefixOffset=self.parentWriter.states.c.startVgprValu) + # For UseSubtileImpl non-edge: paired dwordx4 path handles packing in _emit16bitSubtilePairedStore. + if not is16bitSubtile: + packModule = self.packdata(self.gwvw, destIdx, self.ss.elementSumIdx[elementIdx], bf16CVTVgprStruct=self.cvtVgprStruct, + tmpS01=self.tmpS01, laneSGPRC=self.laneSGPRC, inputPrefix="ValuC+", prefixOffset=self.parentWriter.states.c.startVgprValu) elif self.kernel["ProblemType"]["DestDataType"].isAnyFloat8(): packModule = self.packdata(self.gwvw, destIdx, self.ss.elementSumIdx[elementIdx], fp8CVTVgprStruct=self.cvtVgprStruct, \ tmpS01=self.tmpS01, laneSGPRC=self.laneSGPRC, inputPrefix="ValuC+", prefixOffset=self.parentWriter.states.c.startVgprValu) @@ -1182,16 +1303,92 @@ def applyScaleVec(vecModule, addressStr, dataScaleVec, factorDim, isGlobal=True) module.add(packModule) if not self.kernel["StoreRemapVectorWidth"]: - if self.kernel["_GlobalAccumulation"] == "MultipleBufferSingleKernel":#GSUGSU + # 16bit UseSubtileImpl non-edge: emit paired dwordx4 stores combining sba=0 + # with sba=1 subtile data into one buffer_store_dwordx4. Works for both + # bf16 and fp16 HPA output types. + # + # UseSubtileImpl splits MIWaveTile[0] into two subtile groups: + # sba=0 owns even tt0 values (0, 2, 4, ...) + # sba=1 owns odd tt0 values (1, 3, 5, ...) + # The element list interleaves them as consecutive (even, odd) tt0 pairs: + # element 0: tt0=0 (sba=0) + # element 1: tt0=1 (sba=1) <- pair with element 0 + # element 2: tt0=2 (sba=0) (if MIWaveTile[0]>2) + # ... 
+ # Pairing key: tt0 % 2 — even tt0 is sba=0, odd tt0 is sba=1. + storeCodeModule = storeCode if self.kernel["GroupLoadStore"] else module + if is16bitSubtile: + tt0 = element[1] # d0: thread-tile index along M + # Epilogue (bias/activation) is applied per-element in iteration order. + # The paired store must be emitted AFTER both sba=0 and sba=1 elements have + # had their epilogue applied, so we defer it to the sba=1 (odd tt0) iteration. + if tt0 % 2 == 1: + # sba=1 element (odd tt0): both sba=0 and sba=1 epilogues are done — emit paired store. + # Find the sba=0 partner: the immediately preceding element with tt0-1. + partnerElementIdx = elementIdx - 1 + partnerExists = (partnerElementIdx >= 0 and + self.batchElements[partnerElementIdx][1] == tt0 - 1) + if partnerExists: + # Paired dwordx4 store for (sba=0 at tt0-1, sba=1 at tt0). + partnerAddrCalc: AddrCalculation = self.ss.elementAddr[partnerElementIdx] + sumIdx0 = self.ss.elementSumIdx[partnerElementIdx] + sumIdx1 = self.ss.elementSumIdx[elementIdx] + prefixOffset = self.parentWriter.states.c.startVgprValu + # blockIdxM = (tt0-1) // 2: each pair of tt0 values spans one 32-row block. + blockIdxM = (tt0 - 1) // 2 + blockIdxN = element[0] # d1 = tt1 + # Early exit: skip this paired store if the wave group is outside the valid M/N tile bounds. + skipLabel = self._emitSubtileOobGuard(storeCodeModule, blockIdxM, blockIdxN, + labelPrefix="subtile_skip_store") + tmpStoreCode = self._emit16bitSubtilePairedStore(partnerAddrCalc, sumIdx0, sumIdx1, prefixOffset, tt0 - 1) + storeCodeModule.add(tmpStoreCode) + if skipLabel is not None: + storeCodeModule.add(skipLabel) + self.storesIssued += 1 + # else: no partner — the sba=0 orphan was handled as a scalar store below + else: + # sba=0 element (even tt0): emit SRD row increment if needed; store deferred to sba=1. 
+ if self.ss.optSrdIncForRow and addrCalc.rowInc: + module.add(addrCalc.incrementToNextRow(self.kernel, "D", self.ss, self.tmpS01)) + partnerElementIdx = elementIdx + 1 + partnerExists = (partnerElementIdx < len(self.batchElements) and + self.batchElements[partnerElementIdx][1] == tt0 + 1) + if not partnerExists: + # Orphan element (no sba=1 partner in this batch): scalar 16bit store now. + # Guard against OOB wave groups (same as paired store path). + mBlockSize = self.parentWriter.states.subtileMBlockSize + blockIdxM = (tt0 * self.kernel["MatrixInstM"]) // mBlockSize + blockIdxN = element[0] + # Early exit: skip this orphan scalar store if the wave group is outside the valid M/N tile bounds. + orphanSkipLabel = self._emitSubtileOobGuard(storeCodeModule, blockIdxM, blockIdxN, + labelPrefix="subtile_skip_orphan") + sumIdx0 = self.ss.elementSumIdx[elementIdx] + prefixOffset = self.parentWriter.states.c.startVgprValu + tmpStoreCode = self._emit16bitSubtileScalarStore(addrCalc, sumIdx0, prefixOffset, tt0) + storeCodeModule.add(tmpStoreCode) + if orphanSkipLabel is not None: + storeCodeModule.add(orphanSkipLabel) + self.storesIssued += 1 + elif self.kernel["_GlobalAccumulation"] == "MultipleBufferSingleKernel":#GSUGSU tmpStoreCode = self.parentWriter.addStore(self.kernel, self.ss, 'TD', addrCalc, sumIdx, self.tmpS01, self.edge, comment="store TD not StoreRemapVectorWidth") + storeCodeModule.add(tmpStoreCode) + self.storesIssued += 1 else: + # Regular store path. If UseSubtileImpl NonEdge, guard against OOB wave groups. + skipLabel = None + if isSubtileNonEdge: + tt0 = element[1] + blockIdxM = tt0 # each tt0 maps to one mBlockSize-row block + blockIdxN = element[0] # tt1 + # Early exit: skip this store if the wave group is outside the valid M/N tile bounds. 
+ skipLabel = self._emitSubtileOobGuard(storeCodeModule, blockIdxM, blockIdxN, + labelPrefix="subtile_skip_store") tmpStoreCode = self.parentWriter.addStore(self.kernel, self.ss, 'D', addrCalc, sumIdx, self.tmpS01, self.edge, comment="store D") - if self.kernel["GroupLoadStore"]: - storeCode.add(tmpStoreCode) - else: - module.add(tmpStoreCode) + storeCodeModule.add(tmpStoreCode) + if skipLabel is not None: + storeCodeModule.add(skipLabel) + self.storesIssued += 1 - self.storesIssued += 1 if (self.kernel["ProblemType"]["UseE"] and not self.kernel["ProblemType"]["Gradient"]) and ((self.kernel["GlobalSplitU"] == 1 or self.kernel["GlobalSplitU"] == -1) or self.kernel["StreamK"] > 0): self.storesIssued += 1 if self.storeBiasD == 1: @@ -1217,6 +1414,9 @@ def applyScaleVec(vecModule, addressStr, dataScaleVec, factorDim, isGlobal=True) if self.storeBiasD == 1: self.storesIssued += 1 + # Close the last N-group OOB skip label (if any) opened by _emitSubtileOobGuard. + self._finalizeSubtileOobGuards(storeCode if self.kernel["GroupLoadStore"] else module) + module.add(storeCode) if self.parentWriter.db["CheckStoreC"]>=0: @@ -1273,6 +1473,421 @@ def applyScaleVec(vecModule, addressStr, dataScaleVec, factorDim, isGlobal=True) module.add(SWaitCnt(vscnt=0, comment="ConservativeWaitCnt")) module.add(SBarrier("debug")) + def _emitSubtilePackedPermute(self, vPack: int, vPermAddr: int, addrWhilePermuting=None) -> Module: + """Shuffle four packed dwords across wave halves for a subtile dwordx4 store. + + After the caller packs 8 f32 accumulator values into four 16bit dwords + (vPack+0..+3), this routine performs the two-step permute that assembles + eight consecutive M-rows owned by a pair of lane-groups into a contiguous + dwordx4 payload: + + Step 1 — ds_bpermute (in-place, 4×): each lane fetches vPack+k from its + partner lane l' (= the lane at LG±1 distance, pre-encoded as a + byte address in vPermAddr). 
The LDS pipe latches vPermAddr at + issue time, so vPermAddr can be repurposed as soon as all four + ds_bpermute instructions are issued. + + Step 2 — v_permlane32_swap_b32 (2×): exchange (vPack+0 ↔ vPack+2) and + (vPack+1 ↔ vPack+3) across the 32-lane boundary so that lanes + 0-31 end up with rows LG*8+0..LG*8+7 in ascending order. + + The caller may supply an optional `addrWhilePermuting` callable that adds + address-preparation instructions to the same module *between* the four + ds_bpermute issues and the SWaitCnt. This overlaps address arithmetic + with the LDS round-trip latency at no extra cost. + + Args: + vPack: Base VGPR index of the four packed dwords (must be + 2-aligned to satisfy dwordx4 store alignment). + vPermAddr: VGPR holding the partner-lane byte address (pre-computed + once per batch in the vgprPermAddr slot). + addrWhilePermuting: Optional callable() that appends address instructions + to `module` while the ds_bpermute results are in-flight. + + Returns: + Module containing ds_bpermute × 4, optional address code, SWaitCnt, + and v_permlane32_swap_b32 × 2. Leaves vPack+0..+3 holding the + correctly ordered dwords ready for buffer_store_dwordx4. 
+ """ + module = Module("SubtilePackedPermute") + + module.addComment1("ds_bpermute in-place: gather packed dwords from partner lane-group") + for k in range(4): + module.add(DSBPermuteB32(dst=vgpr(vPack+k), src0=vgpr(vPermAddr), src1=vgpr(vPack+k), + comment=f"perm dword {k}")) + + if addrWhilePermuting is not None: + addrWhilePermuting() + + module.add(SWaitCnt(dscnt=0, comment="wait for ds_bpermute (lgkmcnt=0)")) + + module.addComment1("v_permlane32_swap_b32: swap across lane-32 boundary") + module.add(VPermlane32SwapB32(dst=vgpr(vPack+0), src=vgpr(vPack+2), comment="swap dwords 0↔2")) + module.add(VPermlane32SwapB32(dst=vgpr(vPack+1), src=vgpr(vPack+3), comment="swap dwords 1↔3")) + + return module + + def _emitSubtileOobGuard(self, targetModule, blockIdxM: int, blockIdxN: int, labelPrefix: str = "subtile_skip_store"): + """Emit M/N OOB guard branches for UseSubtileImpl NonEdge stores. + + Background + ---------- + UseSubtileImpl assigns each wave a fixed subtile region of the output matrix. + In the NonEdge path the macro-tile fits entirely within the output bounds, but + individual wave groups within the macro-tile may still be out-of-bounds when + the problem size is not a multiple of the macro-tile. The SGPRs + subtileM32ValidBlocksSgpr and subtileN16ValidBlocksSgpr count how many M/N + blocks (in units of mBlockSize rows / nBlockSize columns) belong to valid + output for this wave, and are set to None when no guard is needed (edge path + or problem is tile-aligned). + + Logic + ----- + For a store at (blockIdxM, blockIdxN): + - If numValidNBlocks <= blockIdxN → N OOB: jump past ALL remaining stores. + Valid because N is monotone: subsequent N groups (blockIdxN+1, ...) are also OOB. + - If numValidMBlocks <= blockIdxM → M OOB: jump to the end of the current N group. + Valid because M is monotone: remaining M elements in this N group are also OOB. + + The N guard is emitted ONCE per N group (when blockIdxN changes). 
It branches to + _subtileAllStoresEndLabel (past all stores). _subtileNGroupSkipLabel marks the + boundary between N groups; M guards branch there to skip the rest of the current + N group without re-testing the remaining M elements. + Both labels are placed by _finalizeSubtileOobGuards (called after the element loop). + + Returns a per-element skip Label only when there is no N guard (M-only case); the + caller must add it after the store. Returns None in all other cases. + """ + guardMSgpr = self.parentWriter.states.subtileM32ValidBlocksSgpr + guardNSgpr = self.parentWriter.states.subtileN16ValidBlocksSgpr + # No guard SGPRs means the store is always in-bounds for this path; nothing to emit. + if guardMSgpr is None and guardNSgpr is None: + return None + + # --- N-group guard (emitted once per unique blockIdxN) --- + # Branches to _subtileAllStoresEndLabel when N OOB, skipping all remaining stores. + # Because N is monotone (blockIdxN increases each group), if this group is OOB + # then every subsequent group is also OOB — no need to test them. + if guardNSgpr is not None and blockIdxN != self._subtilePrevBlockIdxN: + # Place the previous N group's end label before starting a new group. + if self._subtileNGroupSkipLabel is not None: + targetModule.add(self._subtileNGroupSkipLabel) + self._subtileNGroupSkipLabel = None + # Create the single end-of-all-stores label on the first N group. + if self._subtileAllStoresEndLabel is None: + endLabelName = self.parentWriter.labels.getNameInc("subtile_all_stores_end") + self._subtileAllStoresEndLabel = Label(endLabelName, "end of all subtile NonEdge D stores") + nGroupEndLabelName = self.parentWriter.labels.getNameInc( + f"{labelPrefix}_N{blockIdxN}_end") + nGroupEndLabel = Label(nGroupEndLabelName, + f"end of N group blockIdxN={blockIdxN} (M cbranch target)") + targetModule.add(SCmpKGtU32(src=sgpr("SubtileNGuard"), simm16=blockIdxN, + comment=f"quick-exit: numValidNBlocks > {blockIdxN}? 
(OOB -> skip all stores)")) + targetModule.add(SCBranchSCC0(labelName=self._subtileAllStoresEndLabel.getLabelName(), + comment=f"quick-exit: N OOB at blockIdxN={blockIdxN}, skip all remaining stores")) + self._subtileNGroupSkipLabel = nGroupEndLabel + self._subtilePrevBlockIdxN = blockIdxN + + # --- M guard (emitted per element) --- + # Branches to end of current N group when M OOB, skipping remaining M elements. + # Because M is monotone (blockIdxM increases within the N group), if this element + # is OOB then all subsequent M elements in this N group are also OOB. + if guardMSgpr is None: + return None + targetModule.add(SCmpKGtU32(src=sgpr("SubtileMGuard"), simm16=blockIdxM, + comment=f"quick-exit: numValidMBlocks > {blockIdxM}? (OOB -> skip N group)")) + if guardNSgpr is not None and self._subtileNGroupSkipLabel is not None: + # M OOB → jump to end of this N group (no per-element label needed). + targetModule.add(SCBranchSCC0(labelName=self._subtileNGroupSkipLabel.getLabelName(), + comment=f"quick-exit: M OOB at blockIdxM={blockIdxM}, skip rest of N group")) + return None + else: + # No N guard → fall back to a per-element skip label (caller places it after the store). + skipLabelName = self.parentWriter.labels.getNameInc( + f"{labelPrefix}_M{blockIdxM}_N{blockIdxN}") + skipLabel = Label(skipLabelName, + f"skip OOB store blockIdxM={blockIdxM} blockIdxN={blockIdxN}") + targetModule.add(SCBranchSCC0(labelName=skipLabel.getLabelName(), + comment=f"quick-exit: M OOB at blockIdxM={blockIdxM}, skip store")) + return skipLabel + + def _finalizeSubtileOobGuards(self, targetModule): + """Place the pending N-group end label and end-of-all-stores label after the element loop. + + Must be called once after all elements have been emitted to close out the last + N group and anchor the N-cbranch target past all stores. 
+ """ + if self._subtileNGroupSkipLabel is not None: + targetModule.add(self._subtileNGroupSkipLabel) + self._subtileNGroupSkipLabel = None + if self._subtileAllStoresEndLabel is not None: + targetModule.add(self._subtileAllStoresEndLabel) + self._subtileAllStoresEndLabel = None + + def _emit16bitSubtilePairedStore(self, addrCalc, sumIdx0: int, sumIdx1: int, prefixOffset: int, tt0: int = 0) -> Module: + """Emit a paired 16bit store combining sba=0 and sba=1 subtile data. + + Works for both bf16 and fp16 HPA output types. + + sba = subtile block index along A (M dimension). UseSubtileImpl iterates over + two subtile groups (sba=0, sba=1) that share the same (tt1, tt0) element + coordinates but draw from different accumulator registers. The element list + therefore contains consecutive pairs with identical (tt1, tt0): sba=0 first + (even elementIdx), sba=1 second (odd elementIdx). + + Converts 8 f32 accvgprs (4 from sba=0, 4 from sba=1) to 16bit, shuffles them + across wave halves via ds_bpermute + v_permlane32_swap_b32, then issues + 1 × buffer_store_dwordx4 at the sba=0 element's address. The cvtVgpr block + is 2-aligned (64-bit) in KWA so vgprBf16Temp satisfies the dwordx4 alignment. + + Args: + addrCalc: AddrCalculation for the sba=0 element. + sumIdx0: elementSumIdx for the sba=0 element. + sumIdx1: elementSumIdx for the sba=1 element. + prefixOffset: parentWriter.states.c.startVgprValu (offset into ValuC). + tt0: thread-tile M index (same for both sba=0 and sba=1). + """ + module = Module("16bitSubtilePairedStore") + isFp16 = self.kernel["ProblemType"]["DestDataType"].isHalf() + + ntd = self.kernel["NonTemporalD"] + isGlc = bool(ntd & 0x1) + isSlc = bool(ntd & 0x2) + isNT = bool(ntd & 0x4) + + # Reuse cvtVgprStruct.vgprBf16Temp..vgprBf16Inc (+0..+3) as 4 scratch vgprs. + # The cvtVgpr block is allocated with 2-alignment (64-bit aligned) in KWA so that + # vgprBf16Temp is at an even VGPR index, satisfying buffer_store_dwordx4's + # alignment requirement. 
The +0..+3 slots are safely overwritten here as pack/perm + # staging for each pair. + vPack = self.cvtVgprStruct.vgprBf16Temp # +0..3: packed 16bit dwords, 2-aligned + + vPermAddr = self.cvtVgprStruct.vgprPermAddr + vLGDelta = self.cvtVgprStruct.vgprLaneGroupDelta + vAddrScratch = self.cvtVgprStruct.vgprAddrScratch + addrDVgpr = addrCalc.addrDVgpr + + typeStr = "fp16" if isFp16 else "bf16" + VCvtPkF32to16 = VCvtPkF32toFP16 if isFp16 else VCvtPkF32toBF16 + module.addComment1(f"{typeStr} paired dwordx4 store tt0={tt0} (sba=0+sba=1): pack 8 f32 accvgprs -> 4 {typeStr} dwords") + + # Pack sba=0 subtile: ValuC+sumIdx0+{0,1} → vPack+0; ValuC+sumIdx0+{2,3} → vPack+1 + # Pack sba=1 subtile: ValuC+sumIdx1+{0,1} → vPack+2; ValuC+sumIdx1+{2,3} → vPack+3 + def vc(sumIdx, vi): + idx = sumIdx + vi - prefixOffset + return vgpr("ValuC+" + str(idx)) + + def packF32pair(dst, src0, src1, comment): + """Pack two f32 VGPRs into one dword of two 16bit values.""" + module.add(VCvtPkF32to16(dst=vgpr(dst), src0=src0, src1=src1, comment=f"{comment} -> {typeStr}")) + + packF32pair(vPack+0, vc(sumIdx0, 0), vc(sumIdx0, 1), f"sba=0 tt0={tt0}[0:1]") + packF32pair(vPack+1, vc(sumIdx0, 2), vc(sumIdx0, 3), f"sba=0 tt0={tt0}[2:3]") + packF32pair(vPack+2, vc(sumIdx1, 0), vc(sumIdx1, 1), f"sba=1 tt0={tt0}[0:1]") + packF32pair(vPack+3, vc(sumIdx1, 2), vc(sumIdx1, 3), f"sba=1 tt0={tt0}[2:3]") + + # Compute adjusted D address into vgprAddrScratch while ds_bpermute results are in-flight. + # addrDVgpr holds the M-byte offset in bpeCexternal units; scale to bpeCexternalGSU1 + # (16bit=2 bytes) then add lane_group*8 so the dwordx4 store lands at the correct row. + # addrDVgpr and vgprPermAddr are left unchanged — vgprAddrScratch is dedicated scratch + # for this purpose so no restore is needed. 
+ bpeCurr = self.parentWriter.states.bpeCexternal + bpeDest = self.parentWriter.states.bpeCexternalGSU1 + globalOffset = addrCalc.globalOffset * bpeDest // bpeCurr + addrScaleShift = int(log2(bpeCurr // bpeDest)) if bpeCurr > bpeDest else 0 + + def emitAddrWhilePermuting(): + """Compute vAddrScratch overlapped with the in-flight ds_bpermute.""" + if addrScaleShift: + module.add(VLShiftRightB32(dst=vgpr(vAddrScratch), shiftHex=addrScaleShift, + src=vgpr(addrDVgpr), comment=f"scale addrDVgpr bpe {bpeCurr}->{bpeDest}")) + module.add(VAddU32(dst=vgpr(vAddrScratch), src0=vgpr(vAddrScratch), src1=vgpr(vLGDelta), + comment="adjusted D addr = scaled addrDVgpr + lane_group*8")) + else: + module.add(VAddU32(dst=vgpr(vAddrScratch), src0=vgpr(addrDVgpr), src1=vgpr(vLGDelta), + comment="adjusted D addr = addrDVgpr + lane_group*8")) + + module.add(self._emitSubtilePackedPermute(vPack, vPermAddr, addrWhilePermuting=emitAddrWhilePermuting)) + + module.addComment1("buffer_store_dwordx4: write 8 16bit values (4 dwords, 2-aligned src)") + module.add(BufferStoreB128( + src=vgpr(vPack, 4), + vaddr=vgpr(vAddrScratch), + saddr=sgpr("SrdD", 4), + soffset=0, + mubuf=MUBUFModifiers(offen=True, offset12=globalOffset, glc=isGlc, slc=isSlc, nt=isNT), + comment=f"16bit paired dwordx4 store tt0={tt0},{tt0+1}" + )) + + # WAR hazard: buffer_store_dwordx4 reads vPack[0:3] as source operands. + # The next paired store's v_cvt_pk_bf16_f32 will overwrite vPack. + # Insert nop to ensure the store has latched its source VGPRs. + module.add(SNop(waitState=0, comment="1 wait state: WAR hazard between store src and next pack dst")) + + return module + + def _emit16bitSubtileScalarStore(self, addrCalc, sumIdx0: int, prefixOffset: int, tt0: int = 0) -> Module: + """Emit a 16bit store for an orphan sba=0 subtile with no sba=1 partner. + + sba = subtile block index along A (M dimension). Used when MIWaveTile[0] is + odd and the last sba=0 element has no sba=1 partner. 
+ + The layout below is specific to the mfma instruction used here: lane l = LG*16 + r + owns 4 output values at M-rows (LG*4 + 0..3) and a single N-column + (l % 16 = r = lane_id & 15). In column-major (row-first in memory) layout + these 4 values ARE contiguous + in memory (consecutive M-rows at fixed N-col), so we use 2x buffer_store_dwordx2 + after packing all 4 16bit values into 2 dwords. + + The per-lane vaddr encodes: + vaddr = (lane_id & 15) * StrideD1J * bpe [N-col byte offset within wave tile] + + vLGDelta [LG*4 M-rows * bpe = LG*8 bytes] + + wg0*MT0*bpe [workgroup M byte base] + + waveId0 * waveM_stride * bpe [M-wave byte offset within WG] + + waveId1 * waveN_stride * StrideD1J * bpe [N-wave byte offset] + and a constant offset12 = globalOffset (encodes d0 M-tile position within wave). + + The SRD base encodes only wg1*MT1*StrideD1J*bpe (N workgroup offset). + The M workgroup offset (wg0*MT0*bpe) and wave-within-WG offsets must be + included in the vaddr explicitly. + + Args: + addrCalc: AddrCalculation for the element. + sumIdx0: elementSumIdx for the element. + prefixOffset: parentWriter.states.c.startVgprValu (offset into ValuC). 
+ """ + module = Module("16bitSubtileScalarStore") + isFp16 = self.kernel["ProblemType"]["DestDataType"].isHalf() + + ntd = self.kernel["NonTemporalD"] + isGlc = bool(ntd & 0x1) + isSlc = bool(ntd & 0x2) + isNT = bool(ntd & 0x4) + + # Scratch vgprs from the cvtVgprStruct block (overwritten each call): + # vPack+0 : 16bit packed dword (vc=0,1) + # vPack+1 : wave ID scratch / 16bit packed dword (vc=2,3) + # vPack+2 : per-lane vaddr (N-col byte offset + M offsets) + # vPack+3 : temp for N-col byte offset computation + vPack = self.cvtVgprStruct.vgprBf16Temp + vLGDelta = self.cvtVgprStruct.vgprLaneGroupDelta # LG*4*bpe = LG*8 bytes (pre-computed) + + # addrCalc.globalOffset was computed with bpeCexternal (may be 4 for _GlobalAccumulation kernels), + # but the 16bit orphan store always targets the final 16bit output (bpeCexternalGSU1=2). + bpeCurr = self.parentWriter.states.bpeCexternal + bpe = self.parentWriter.states.bpeCexternalGSU1 # always 2 for 16bit dest + globalOffset = addrCalc.globalOffset * bpe // bpeCurr + + def vc(vi): + idx = sumIdx0 + vi - prefixOffset + return vgpr("ValuC+" + str(idx)) + + # Derive the D-stride sgpr name (e.g. "StrideDJ") the same way incrementToNextRow does. 
+ packedC1 = self.kernel["PackedC1IndicesX"] + indexChar = self.parentWriter.states.indexChars[packedC1[0]] + strideD1J = "StrideD%s" % indexChar + + ws = self.kernel["WavefrontSize"] + miwg0 = self.kernel["MIWaveGroup"][0] + miwg1 = self.kernel["MIWaveGroup"][1] + matM = self.kernel["MatrixInstM"] + matN = self.kernel["MatrixInstN"] + + typeStr = "fp16" if isFp16 else "bf16" + VCvtPkF32to16 = VCvtPkF32toFP16 if isFp16 else VCvtPkF32toBF16 + module.addComment1(f"{typeStr} orphan subtile tt0={tt0}: pack 4 M-rows (vc=0..3) at fixed N-col, store as 2x dwordx2") + + # Build per-lane vaddr: + # vaddr = (lane_id & 15) * StrideD1J * bpe [N-col] + # + vLGDelta [LG*4 M-rows = LG*8 bytes] + # + wg0*MT0*bpe [M-WG base] + # + waveId0*waveM_stride*bpe [M-wave offset, if miwg0>1] + # + waveId1*waveN_stride*StrideD1J*bpe [N-wave offset, if miwg1>1] + # The SRD already encodes wg1*MT1*StrideD1J*bpe (N-WG offset). + tmpS = self.tmpS01 + mt0bpe = self.kernel["MacroTile0"] * bpe + + module.addComment1("compute per-lane orphan vaddr = N_col_off + LG_M_off + wg0_M_off [+ wave offsets]") + + # N-col byte offset: (lane_id & 15) * StrideD1J * bpe + module.add(VAndB32(dst=vgpr(vPack+2), src0=15, src1=vgpr("Serial"), + comment="col_in_wave = lane_id & 15 (N-column index)")) + module.add(VMulLOU32(dst=vgpr(vPack+3), src0=vgpr(vPack+2), src1=sgpr(strideD1J), + comment="col_in_wave * StrideD1J")) + if bpe == 2: + module.add(VLShiftLeftB32(dst=vgpr(vPack+2), shiftHex=1, src=vgpr(vPack+3), + comment="N_col_off = col_in_wave * StrideD1J * 2")) + else: + module.add(VMulLOU32(dst=vgpr(vPack+2), src0=vgpr(vPack+3), src1=bpe, + comment="N_col_off = col_in_wave * StrideD1J * bpe")) + + # Add LG M-row offset: vLGDelta = LG*4*bpe = LG*8 bytes (pre-computed at batch start) + module.add(VAddU32(dst=vgpr(vPack+2), src0=vgpr(vPack+2), src1=vgpr(vLGDelta), + comment="vaddr += LG_M_off (= vLGDelta = LG*4*bpe)")) + + # Add M-WG offset: wg0 * MT0 * bpe + module.add(SMulI32(dst=sgpr(tmpS), 
src0=sgpr("WorkGroup0"), src1=mt0bpe, + comment="wg0_M_off = WorkGroup0 * MT0 * bpe")) + module.add(VAddU32(dst=vgpr(vPack+2), src0=vgpr(vPack+2), src1=sgpr(tmpS), + comment="vaddr += wg0_M_off")) + + # Add M-wave offset: waveId0 * MIWaveTile[0] * matM * bpe. + if miwg0 > 1: + wsLog2 = int(log2(ws)) + waveM_stride_bpe = self.kernel["MIWaveTile"][0] * matM * bpe + module.add(VLShiftRightB32(dst=vgpr(vPack+3), shiftHex=wsLog2, src=vgpr("Serial"), + comment=f"waveId = Serial >> {wsLog2}")) + if miwg0 & (miwg0 - 1) == 0: # power of 2 — use AND mask + module.add(VAndB32(dst=vgpr(vPack+3), src0=miwg0 - 1, src1=vgpr(vPack+3), + comment=f"waveId0 = waveId & {miwg0-1}")) + else: + raise NotImplementedError(f"Non-power-of-2 MIWaveGroup[0]={miwg0} not supported in orphan store") + module.add(SMovB32(dst=sgpr(tmpS), src=waveM_stride_bpe, + comment=f"waveM_stride_bpe={waveM_stride_bpe}")) + module.add(VMulLOU32(dst=vgpr(vPack+3), src0=vgpr(vPack+3), src1=sgpr(tmpS), + comment=f"wave_M_off = waveId0 * {waveM_stride_bpe}")) + module.add(VAddU32(dst=vgpr(vPack+2), src0=vgpr(vPack+2), src1=vgpr(vPack+3), + comment="vaddr += wave_M_off")) + + # Add N-wave offset: waveId1 * MIWaveTile[1] * matN * StrideD1J * bpe. 
+ if miwg1 > 1: + wsLog2 = int(log2(ws)) + waveN_stride_bpe = self.kernel["MIWaveTile"][1] * matN * bpe + module.add(VLShiftRightB32(dst=vgpr(vPack+3), shiftHex=wsLog2, src=vgpr("Serial"), + comment=f"waveId = Serial >> {wsLog2}")) + if miwg0 & (miwg0 - 1) == 0: # miwg0 is power of 2 + module.add(VLShiftRightB32(dst=vgpr(vPack+3), shiftHex=int(log2(miwg0)), + src=vgpr(vPack+3), + comment=f"waveId1 = waveId / {miwg0}")) + else: + raise NotImplementedError(f"Non-power-of-2 MIWaveGroup[0]={miwg0} not supported in orphan store") + module.add(SMovB32(dst=sgpr(tmpS), src=waveN_stride_bpe, + comment=f"waveN_stride_bpe={waveN_stride_bpe}")) + module.add(VMulLOU32(dst=vgpr(vPack+3), src0=vgpr(vPack+3), src1=sgpr(tmpS), + comment=f"waveId1 * {waveN_stride_bpe}")) + module.add(VMulLOU32(dst=vgpr(vPack+3), src0=vgpr(vPack+3), src1=sgpr(strideD1J), + comment=f"wave_N_off = waveId1 * {waveN_stride_bpe} * StrideD1J")) + module.add(VAddU32(dst=vgpr(vPack+2), src0=vgpr(vPack+2), src1=vgpr(vPack+3), + comment="vaddr += wave_N_off")) + + # Pack all 4 16bit values (consecutive M-rows at fixed N-col) into 2 dwords. 
+ # vc=0 → M-row+0 (lo16 of dword0), vc=1 → M-row+1 (hi16 of dword0) + # vc=2 → M-row+2 (lo16 of dword1), vc=3 → M-row+3 (hi16 of dword1) + # + module.add(VCvtPkF32to16(dst=vgpr(vPack+0), src0=vc(0), src1=vc(1), comment=f"M-row+0/+1 -> {typeStr}")) + module.add(VCvtPkF32to16(dst=vgpr(vPack+1), src0=vc(2), src1=vc(3), comment=f"M-row+2/+3 -> {typeStr}")) + module.add(SNop(waitState=0, comment=f"delay after pk_{typeStr}")) + module.addComment1(f"buffer_store_b64: write 4 {typeStr} M-rows at fixed N-col (orphan subtile)") + module.add(BufferStoreB64( + src=vgpr(vPack+0, 2), + vaddr=vgpr(vPack+2), + saddr=sgpr("SrdD", 4), + soffset=0, + mubuf=MUBUFModifiers(offen=True, offset12=globalOffset, glc=isGlc, slc=isSlc, nt=isNT), + comment=f"orphan tt0={tt0} vc=0..3: 4 consecutive M-rows at fixed N-col" + )) + return module + def _emitAtomicAdd(self, module: Module): ######################################## # first attempt write diff --git a/projects/hipblaslt/tensilelite/Tensile/Components/LSU.py b/projects/hipblaslt/tensilelite/Tensile/Components/LSU.py index b843f39bd47..68416f22bd4 100644 --- a/projects/hipblaslt/tensilelite/Tensile/Components/LSU.py +++ b/projects/hipblaslt/tensilelite/Tensile/Components/LSU.py @@ -594,7 +594,8 @@ def globalWrite(self, writer, kernel, tPA, tPB): noGSUBranch = (kernel["GlobalSplitU"] == 0 and kernel["StreamK"] != 3) module = Module("localSplitUGlobalWrite") - module.add(writer.globalWriteElements(kernel, tPA, tPB, vectorWidths, vectorWidths_1, elements_f0, elements_f1, noGSUBranch=noGSUBranch)) + storeModule, _ = writer.globalWriteElements(kernel, tPA, tPB, vectorWidths, vectorWidths_1, elements_f0, elements_f1, noGSUBranch=noGSUBranch) + module.add(storeModule) writer.cleanupGlobalWrite(kernel) writer.vgprPool.checkIn(self.accVgprLdsReduction) return module diff --git a/projects/hipblaslt/tensilelite/Tensile/Components/StreamK.py b/projects/hipblaslt/tensilelite/Tensile/Components/StreamK.py index 7c4c4b53cf2..7f02916c89e 100644 --- 
a/projects/hipblaslt/tensilelite/Tensile/Components/StreamK.py +++ b/projects/hipblaslt/tensilelite/Tensile/Components/StreamK.py @@ -29,10 +29,12 @@ SCmpGtU32, SCmpLeU32, SCmpLtU32, SLShiftLeftB32, SLShiftLeftB64, SLShiftRightB32, SLoadB32, \ SMaxI32, SMinU32, SMovB32, SMovB64, SMulI32, SNop, SOrB32, SSleep, SStoreB32, SSubU32, \ SWaitCnt, VAddF32, VAddF64, VAddPKF16, VAddU32, VLShiftRightB32, VMovB32, \ - VReadfirstlaneB32, VCvtBF16toFP32, BufferLoadB32, BufferStoreB32 + VReadfirstlaneB32, VCvtBF16toFP32, BufferLoadB32, BufferStoreB32, \ + SLongBranch, SLongBranchPositive from rocisa.functions import scalarStaticDivideAndRemainder, sMagicDiv2, \ vectorStaticMultiply, BranchIfNotZero, scalarUInt24DivideAndRemainder, scalarUInt32DivideAndRemainder +from .SubtileBasedKernel import localReadResetOffsetsSubtile from ..Common import print2, ceilDivide, log2 from ..Component import Component @@ -257,10 +259,18 @@ def _depthUForTc(kernel, tc): For MX scale tensors, DepthU is divided by the MX block size because there is one scale element per MXBlock data elements. + For MXSA/MXSB (MX swizzled/pre-shuffle case), the swizzled block size + is 32 * 256 so an additional *32 multiplier is needed. """ key = "_DepthU%s" % tc if key in kernel: - return kernel[key] + _DepthU = kernel[key] + if tc in ("MXSA", "MXSB") and kernel.get("UseSubtileImpl"): + # UseSubtileImpl MX swizzled(pre shuffle) case: swizzled block size is 32 * 256, + # so the effective K stride for the scale tensor is DepthU * 32. + # Non-subtile MX kernels use the raw _DepthU (scale elements per tile in K). 
+ _DepthU = (_DepthU * 32) + return _DepthU return kernel["DepthU"] def shiftSrd(self, writer, srdIdx) -> Module: @@ -314,12 +324,15 @@ def skTileIndex(self, writer, kernel, sTmp, tPA, tPB): # Always reset pointers to handle odd-exit case which moves LRO to the upper bank if kernel["PrefetchGlobalRead"]: # not self.prefetchAcrossPersistent - module.add(writer.localReadResetOffsets(kernel, tPA)) - if kernel["ProblemType"]["MXBlockA"] and "MX" in tPA: - module.add(writer.localReadResetOffsets(kernel, tPA["MX"])) - if kernel["ProblemType"]["MXBlockB"] and "MX" in tPB: - module.add(writer.localReadResetOffsets(kernel, tPB["MX"])) - module.add(writer.localReadResetOffsets(kernel, tPB)) + if not kernel["UseSubtileImpl"]: + module.add(writer.localReadResetOffsets(kernel, tPA)) + if kernel["ProblemType"]["MXBlockA"] and "MX" in tPA: + module.add(writer.localReadResetOffsets(kernel, tPA["MX"])) + if kernel["ProblemType"]["MXBlockB"] and "MX" in tPB: + module.add(writer.localReadResetOffsets(kernel, tPB["MX"])) + module.add(writer.localReadResetOffsets(kernel, tPB)) + else: + module.add(localReadResetOffsetsSubtile(writer, kernel)) module.addComment0("StreamK calculate tile idx and map to WG") @@ -841,7 +854,34 @@ def storeBranchesCommon(self, writer, kernel, skPartialsLabel, vectorWidths, ele writer.sgprPool.checkIn(tmpSgpr) fixupEdge = [False] # Test no edge variant - module.add(self.fixupStep(writer, kernel, vectorWidths, elements, fixupEdge, tmpVgpr, cvtVgprStruct, sCtaIdx)) + # Fixup writes to workspace (no bias LDS barriers), safe to defer. 
+ deferFixup = ( + kernel.get("UseSubtileImpl") + ) + if deferFixup: + fixupDeferredLabel = Label(label=writer.labels.getNameInc("Fixup_E0_Deferred"), comment="") + fixupReturnLabel = Label(label=writer.labels.getNameInc("Fixup_E0_Deferred_Return"), comment="") + # Keep original Fixup_E0 label inline as a stub + fixupInlineLabel = Label(label=writer.labels.getNameInc("Fixup_E%u" % 0), comment="") + module.add(fixupInlineLabel) + with writer.allocTmpSgpr(3) as tmpSgprInfo: + module.add(SLongBranchPositive(fixupDeferredLabel, tmpSgprInfo, comment="jump to deferred fixup block")) + module.addComment0("=" * 60) + module.addComment0(" Fixup block deferred to after persistent loop") + module.addComment0(" (would have been inline here in non-deferred version)") + module.addComment0("=" * 60) + module.add(fixupReturnLabel) + # Collect fixup code in deferred module + fixupModule = Module("Fixup_DeferredBlock") + fixupModule.add(fixupDeferredLabel) + fixupModule.add(self.fixupStep(writer, kernel, vectorWidths, elements, fixupEdge, tmpVgpr, cvtVgprStruct, sCtaIdx)) + with writer.allocTmpSgpr(3) as tmpSgprInfo: + posLabel = writer.labels.getNameInc("FixupDeferredReturnDir") + fixupModule.add(SLongBranch(fixupReturnLabel, tmpSgprInfo, posLabel, comment="return from deferred fixup block")) + writer.states.deferredFixupModule = fixupModule + else: + fixupModule = None + module.add(self.fixupStep(writer, kernel, vectorWidths, elements, fixupEdge, tmpVgpr, cvtVgprStruct, sCtaIdx)) if kernel["StreamK"] >= 2: sSkExtraIters = writer.sgprPool.checkOut(1, "extraIters") @@ -904,14 +944,44 @@ def writePartialsCommon(self, writer, kernel, skPartialsLabel, vectorWidths, ele with self.allocTmpSgpr(4) as tmpSgprInfo: module.add(writer.checkIsEdge(kernel, tmpSgprInfo, partialsLabels[True], partialsLabels[True])) - for edge in edges: - module.add(partialsLabels[edge]) - sIdx = writer.acquireStreamKConstSgpr(kernel, "StreamKIdx") - if writer.isStreamKConstantsToVgprEnabled(kernel): - 
module.add(VReadfirstlaneB32(dst=sgpr(sIdx), src=vgpr(writer.states.skConstVgprs["StreamKIdx"]))) - module.add(self.computeWorkspaceSrd(writer, kernel, sgpr(sIdx))) - writer.releaseStreamKConstSgpr(sIdx) - module.add(self.partialsWriteProcedure(writer, kernel, vectorWidths, elements, False, False, edge, tmpVgpr, cvtVgprStruct, endLabel)) + # WritePartials writes to workspace (no bias LDS barriers), safe to defer. + deferPartials = ( + kernel.get("UseSubtileImpl") + ) + if deferPartials: + partialsDeferredLabel = Label(label=writer.labels.getNameInc("GW_Partials_E0_Deferred"), comment="") + partialsReturnLabel = Label(label=writer.labels.getNameInc("GW_Partials_E0_Deferred_Return"), comment="") + # Inline stub + for edge in edges: + module.add(partialsLabels[edge]) + with writer.allocTmpSgpr(3) as tmpSgprInfo: + module.add(SLongBranchPositive(partialsDeferredLabel, tmpSgprInfo, comment="writePartials (deferred)")) + module.addComment0("=" * 60) + module.addComment0(" WritePartials block deferred to after persistent loop") + module.addComment0(" (would have been inline here in non-deferred version)") + module.addComment0("=" * 60) + module.add(partialsReturnLabel) + module.add(SBranch(labelName=endLabel.getLabelName(), comment="jump to end")) + # Deferred block + partialsModule = Module("Partials_DeferredBlock") + partialsModule.add(partialsDeferredLabel) + for edge in edges: + sIdx = writer.acquireStreamKConstSgpr(kernel, "StreamKIdx") + if writer.isStreamKConstantsToVgprEnabled(kernel): + partialsModule.add(VReadfirstlaneB32(dst=sgpr(sIdx), src=vgpr(writer.states.skConstVgprs["StreamKIdx"]))) + partialsModule.add(self.computeWorkspaceSrd(writer, kernel, sgpr(sIdx))) + writer.releaseStreamKConstSgpr(sIdx) + partialsModule.add(self.partialsWriteProcedure(writer, kernel, vectorWidths, elements, False, False, edge, tmpVgpr, cvtVgprStruct, partialsReturnLabel)) + writer.states.deferredPartialsModule = partialsModule + else: + for edge in edges: + 
module.add(partialsLabels[edge]) + sIdx = writer.acquireStreamKConstSgpr(kernel, "StreamKIdx") + if writer.isStreamKConstantsToVgprEnabled(kernel): + module.add(VReadfirstlaneB32(dst=sgpr(sIdx), src=vgpr(writer.states.skConstVgprs["StreamKIdx"]))) + module.add(self.computeWorkspaceSrd(writer, kernel, sgpr(sIdx))) + writer.releaseStreamKConstSgpr(sIdx) + module.add(self.partialsWriteProcedure(writer, kernel, vectorWidths, elements, False, False, edge, tmpVgpr, cvtVgprStruct, endLabel)) return module @@ -1064,6 +1134,15 @@ def partialsWriteProcedure(self, writer, kernel, vectorWidths, elements, alpha, else: numElementsPerBatch = len(elements[edgeI]) # max, do 'em all + # Cap batch size to align on MIWaveTile[0] boundaries (see refineOccupancy). + if kernel.get("UseSubtileImpl") and kernel.get("EnableMatrixInstruction"): + miwt0 = kernel["MIWaveTile"][0] + totalElems = kernel["MIWaveTile"][0] * kernel["MIWaveTile"][1] + if numElementsPerBatch >= totalElems: + numElementsPerBatch = totalElems + elif miwt0 > 1 and numElementsPerBatch >= miwt0: + numElementsPerBatch = (numElementsPerBatch // miwt0) * miwt0 + # assert(writer.states.numVgprValuC % gwvw == 0) # sanity check numElementsPerBatch = numElementsPerBatch if not kernel["NumElementsPerBatchStore"] else min(kernel["NumElementsPerBatchStore"],numElementsPerBatch) @@ -1170,7 +1249,12 @@ def partialsWriteProcedure(self, writer, kernel, vectorWidths, elements, alpha, module.add(skipFlagSet) module.add(SWaitCnt(kmcnt=0, comment="wait for flag")) # TODO just for testing - module.add(SBranch(labelName=endLabel.getLabelName(), comment="jump to end")) + if "Deferred" in endLabel.getLabelName(): + posLabel = writer.labels.getNameInc("PartialsDeferredReturnDir") + with writer.allocTmpSgpr(3) as tmpSgprInfo: + module.add(SLongBranch(endLabel, tmpSgprInfo, posLabel, comment="jump to end")) + else: + module.add(SBranch(labelName=endLabel.getLabelName(), comment="jump to end")) # Finish one write path, reset currPreLoopVmcntCase 
to Undefined # self.currPreLoopVmcntCase = PreLoopVmcntCase.Undefined @@ -1286,7 +1370,8 @@ def partialsWriteBatch(self, writer, kernel, ss, batchIdx, applyAlpha, beta, edg for vi in range(0, gwvw): # loop over registers within one scalar for rIdx in range(0, regsPerScalar): - module.add(replaceHolder(codeAccVgprRead.popFirstItem(), ss.elementSumIdx[elementIdx]*regsPerScalar + regsPerScalar*vi + rIdx - writer.states.c.startVgprValu)) + startVgprValuOffset = 0 if kernel.get("UseSubtileImpl") else writer.states.c.startVgprValu + module.add(replaceHolder(codeAccVgprRead.popFirstItem(), ss.elementSumIdx[elementIdx]*regsPerScalar + regsPerScalar*vi + rIdx - startVgprValuOffset)) # if kernel["StoreCInUnroll"] and not edge: # tempStr = tempStr.replace("__placeholder__",str(elementIdx*gwvw*regsPerScalar + regsPerScalar*vi + rIdx)) # accVgprRead.addCode(tempStr.replace("ValuC","L2GC")) @@ -1354,8 +1439,14 @@ def partialsWriteBatch(self, writer, kernel, ss, batchIdx, applyAlpha, beta, edg element = batchElements[elementIdx] addrCalc: AddrCalculation = ss.elementAddr[elementIdx] addr = addrCalc.addrDVgpr - sumIdx = ss.elementSumIdx[elementIdx] - + # For UseSubtileImpl, vgprValuC is remapped; add the base offset so the + # WS store reads from the correct accumulator VGPRs. For the regular path + # (non-subtile), startVgprValu is already accounted for by the vgprValuC + # assembler macro, so no offset is needed (matches rebase behaviour). 
+ if kernel.get("UseSubtileImpl"): + sumIdx = ss.elementSumIdx[elementIdx] + writer.states.c.startVgprValu + else: + sumIdx = ss.elementSumIdx[elementIdx] storeWidth = kernel["StoreVectorWidth"] # storeWidth = 2 if batchIdx == 0 and elementIdx == 0: diff --git a/projects/hipblaslt/tensilelite/Tensile/Components/SubtileBasedInstructionEmitter.py b/projects/hipblaslt/tensilelite/Tensile/Components/SubtileBasedInstructionEmitter.py new file mode 100644 index 00000000000..05e3556af79 --- /dev/null +++ b/projects/hipblaslt/tensilelite/Tensile/Components/SubtileBasedInstructionEmitter.py @@ -0,0 +1,216 @@ +"""Instruction emitter for LogicalScheduler. + +Converts the logical schedule (EmittedModule chains) into concrete GPU +instructions by dispatching each opType to its emit method. +""" + +from __future__ import annotations + +from Tensile.Components.SubtileBasedKernel import ( + emitMfmaInstruction, emitSingleDsRead, emitSingleBufferLoad, + globalReadPtrUpdates, globalReadLDSBufferSwap, + localReadLDSBufferSwap, + globalReadDoScaleSubtile, globalReadScalePtrUpdates, +) +from rocisa.code import Module +from rocisa.instruction import SWaitCnt, SBarrier, DSLoadB32, SCmpEQU32, SCmpLeU32, SCBranchSCC1 +from rocisa.container import vgpr, sgpr, DSModifiers +from rocisa.code import Label + + +class InstructionEmitter: + """Emits GPU instructions for each opType in the LogicalScheduler output. + + VGPR tile indexing uses placement-level tile maps (tileId → vgprTileId) + set by assign_vgpr_tiles(). Per-tensor VGPR tile lists are indexed by + vgprTileId. All tensors (A, B, SA, SB) use the same tile-map approach. 
+ """ + + def __init__(self, writer, kernel, config, + tileInfoA, tileInfoB, dtileInfo, + vgprTilesA, vgprTilesB, + scaleTileInfoA=None, scaleTileInfoB=None, + vgprTilesSA=None, vgprTilesSB=None): + self.writer = writer + self.kernel = kernel + self.config = config + self.tileInfoA = tileInfoA + self.tileInfoB = tileInfoB + self.dtileInfo = dtileInfo + self.vgprTilesA = vgprTilesA + self.vgprTilesB = vgprTilesB + self.vgprTilesSA = vgprTilesSA or [] + self.vgprTilesSB = vgprTilesSB or [] + + # Derived state + self.hasScale = scaleTileInfoA is not None and scaleTileInfoB is not None + self.subtileShapeK = tileInfoA.subtileShape[1] + self.tileInfoMap = {'A': tileInfoA, 'B': tileInfoB} + if self.hasScale: + self.tileInfoMap['SA'] = scaleTileInfoA + self.tileInfoMap['SB'] = scaleTileInfoB + + # Dispatch table — unroll_iter is passed for mfma/lr + self._dispatch = { + 'mfma': lambda em, ui: self.emit_mfma(em.source, ui), + 'lr': lambda em, ui: self.emit_lr(em.source, ui), + 'gr': lambda em, ui: self.emit_gr(em.source), + 'wait_gr': lambda em, ui: self.emit_wait_gr(em.source), + 'wait_lr': lambda em, ui: self.emit_wait_lr(), + 'sync': lambda em, ui: self.emit_sync(), + 'lr_inc': lambda em, ui: self.emit_lr_inc(em.source), + 'gr_inc': lambda em, ui: self.emit_gr_inc(em.source), + 'skip': lambda em, ui: self.emit_skip(em.source), + } + + def emit_mfma(self, placement, unroll_iter=0): + """Emit MFMA instructions from MFMAPlacement.""" + module = Module() + subIterK = placement.subIterK + tile_maps = {t: placement.vgpr_tile_maps[t][unroll_iter] + for t in placement.vgpr_tile_maps} + + for a in placement.tileA.tileId_list: + for b in placement.tileB.tileId_list: + groupA = (a // self.config.lrA.mn) * self.config.lrA.mn + groupB = (b // self.config.lrB.mn) * self.config.lrB.mn + aTile = self.vgprTilesA[tile_maps['A'][groupA]] + bTile = self.vgprTilesB[tile_maps['B'][groupB]] + dTile = self.dtileInfo.vgprTiles[a + b * self.dtileInfo.localMMATileGrid[0]] + + if self.hasScale: + 
scaleGroupA = (a // self.config.lrSA.mn) * self.config.lrSA.mn + scaleGroupB = (b // self.config.lrSB.mn) * self.config.lrSB.mn + scaleATile = self.vgprTilesSA[tile_maps['SA'][scaleGroupA]] + scaleBTile = self.vgprTilesSB[tile_maps['SB'][scaleGroupB]] + scaleAVgpr = next(iter(scaleATile)) + scaleBVgpr = next(iter(scaleBTile)) + sAsel = (a % 2) + 2 * subIterK + sBsel = (b % 2) + 2 * subIterK + else: + scaleAVgpr = scaleBVgpr = -1 + sAsel = sBsel = 0 + + module.add(emitMfmaInstruction( + self.writer, self.kernel, aTile, bTile, dTile, dTile, + scaleAVgpr=scaleAVgpr, scaleBVgpr=scaleBVgpr, + scaleAsel=sAsel, scaleBsel=sBsel, + comment=f"MFMA C[{a},{b}] += A[{a},K={subIterK}] * B[{b},K={subIterK}]")) + return list(module.flatitems()) + + def emit_lr(self, placement, unroll_iter=0): + """Emit LR (ds_read) instructions from LRPlacement.""" + module = Module() + tensor = placement.tensor + tile_map = placement.vgpr_tile_map[unroll_iter] if placement.vgpr_tile_map else {} + + if tensor in ('A', 'B'): + ti = self.tileInfoMap[tensor] + vgprTiles = self.vgprTilesA if tensor == 'A' else self.vgprTilesB + lrGran = self.config.lrA if tensor == 'A' else self.config.lrB + for tileId in range(placement.tiles.tileId_start, placement.tiles.tileId_end, lrGran.mn): + for k in range(placement.tiles.subIterK_start, placement.tiles.subIterK_end, lrGran.k): + subtileK = k // self.subtileShapeK + subIterK_within = k % self.subtileShapeK + dstTile = vgprTiles[tile_map[tileId]] + module.add(emitSingleDsRead( + ti, tileId, subtileK, subIterK_within, dstTile)) + elif tensor in ('SA', 'SB'): + tc = 'MXSA' if tensor == 'SA' else 'MXSB' + ti = self.tileInfoMap[tensor] + lrGran = self.config.lrSA if tensor == 'SA' else self.config.lrSB + vgprTilesScale = self.vgprTilesSA if tensor == 'SA' else self.vgprTilesSB + groupStride = lrGran.mn * ti.subtileSize + subtileK = placement.tiles.subIterK_start // self.subtileShapeK + for tileId in range(placement.tiles.tileId_start, placement.tiles.tileId_end, 
lrGran.mn): + scaleGroupIdx = tileId // lrGran.mn + groupKey = scaleGroupIdx * lrGran.mn + dsOffset = groupStride * (scaleGroupIdx * (self.config.numSubIterK // self.subtileShapeK) + subtileK) + vdst = next(iter(vgprTilesScale[tile_map[groupKey]])) + module.add(DSLoadB32( + dst=vgpr(vdst), + src=vgpr(ti.sharedVgprLROffset[0]), + ds=DSModifiers(offset=dsOffset), + comment=f"scale{tc}[group{scaleGroupIdx},K={placement.tiles.subIterK_start}]: load 4B from LDS")) + return list(module.flatitems()) + + def emit_gr(self, placement): + """Emit GR (buffer_load) instructions from GRPlacement.""" + module = Module() + tensor = placement.tensor + if tensor in ('A', 'B'): + ti = self.tileInfoMap[tensor] + grGran = self.config.grA if tensor == 'A' else self.config.grB + for tileId in range(placement.tiles.tileId_start, placement.tiles.tileId_end, grGran.mn): + for k in range(placement.tiles.subIterK_start, placement.tiles.subIterK_end, grGran.k): + subtileK = k // self.subtileShapeK + module.add(emitSingleBufferLoad(ti, self.kernel, tileId, subtileK)) + elif tensor in ('SA', 'SB'): + tc = 'MXSA' if tensor == 'SA' else 'MXSB' + module.add(globalReadDoScaleSubtile(tc, self.writer, self.kernel)) + return list(module.flatitems()) + + def emit_wait_gr(self, source): + """Emit SWaitCnt for wait_gr from BaseOp with wait_gr_counts.""" + counts = source.wait_gr_counts + if counts is None: + return [] + + # TODO. Hardcoded for now, but we should just get this from atomic emit codes (emitSingleBufferLoad, ...) 
+ grMap = {'A': max(1,int(1.0/self.tileInfoA.loadRatioGR)), + 'B': max(1,int(1.0/self.tileInfoB.loadRatioGR)), + 'SA': 1, + 'SB': 1} + grCnt = (counts.A * grMap['A'] + + counts.B * grMap['B'] + + counts.SA * grMap['SA'] + + counts.SB * grMap['SB']) + return [SWaitCnt(vlcnt=grCnt, vscnt=-1, + comment=f"Wait GR (per-subIterK): A={counts.A} B={counts.B} SA={counts.SA} SB={counts.SB}")] + + def emit_wait_lr(self): + return [SWaitCnt(dscnt=0, vlcnt=-1, vscnt=-1, + comment="Wait for LR to complete")] + + def emit_sync(self): + return [SBarrier(comment="Barrier")] + + def emit_lr_inc(self, source): + """Emit localReadLDSBufferSwap for a single tensor.""" + tensor = source.tensor + tc = {'A': 'A', 'B': 'B', 'SA': 'MXSA', 'SB': 'MXSB'}.get(tensor, tensor) + module = Module() + module.add(localReadLDSBufferSwap(tc, self.writer, self.kernel)) + return list(module.flatitems()) + + def emit_gr_inc(self, source): + """Emit globalReadPtrUpdates + globalReadLDSBufferSwap for a single tensor.""" + tensor = source.tensor + tc = {'A': 'A', 'B': 'B', 'SA': 'MXSA', 'SB': 'MXSB'}.get(tensor, tensor) + module = Module() + if tensor in ('SA', 'SB'): + module.add(globalReadScalePtrUpdates(tc, self.writer, self.kernel)) + else: + module.add(globalReadPtrUpdates(tc, self.writer, self.kernel)) + module.add(globalReadLDSBufferSwap(tc, self.writer, self.kernel)) + return list(module.flatitems()) + + def emit_skip(self, source): + """Emit skip guard: compare LoopCounterL and branch.""" + skipLabel = Label(f"SkipTo{source.target}", "") + cmpMap = {"EQ": SCmpEQU32, "LE": SCmpLeU32} + return [ + cmpMap[source.compare](src0=sgpr("LoopCounterL"), src1=source.value, + comment=f"LoopCounter {source.compare} {source.value}?"), + SCBranchSCC1(labelName=skipLabel.getLabelName(), + comment=f"skip to {source.target}"), + ] + + def populate(self, emitted, unroll_iter=0): + """Walk emitted partitions and fill em.instructions.""" + for partition_emitted in emitted: + for emitted_group in partition_emitted: + 
for em in emitted_group: + handler = self._dispatch.get(em.opType) + if handler: + em.instructions = handler(em, unroll_iter) diff --git a/projects/hipblaslt/tensilelite/Tensile/Components/SubtileBasedInstructionScheduler.py b/projects/hipblaslt/tensilelite/Tensile/Components/SubtileBasedInstructionScheduler.py new file mode 100644 index 00000000000..028e55186f0 --- /dev/null +++ b/projects/hipblaslt/tensilelite/Tensile/Components/SubtileBasedInstructionScheduler.py @@ -0,0 +1,433 @@ +"""Instruction scheduler for subtile-based mainloop. + +Interleaves non-MFMA instructions between MFMAs using a slot-based placer +with pluggable scheduling rules. +""" + +from typing import List, Tuple, Optional +from rocisa.code import Module +from rocisa.instruction import SWaitCnt, MFMAInstruction, MXMFMAInstruction, \ + LocalReadInstruction, GlobalReadInstruction, CommonInstruction + + +class _SlotPlacer: + """Generic slot placement engine for interleaving instructions between MFMAs. + + Each interval (pair of adjacent MFMAs) has 2 placement slots. 
+ Rules are injected via callbacks: + - validators: (placer, pos, inst) -> bool — reject invalid slots + - adjusters: (placer, limit, inst) -> limit — shift search start + - onPlace: (placer, pos, inst) -> None — update rule state after placement + """ + + def __init__(self, intervals: int, numModules: int, + pathOrders: List[List[int]], + validators=None, adjusters=None, onPlace=None): + self.totalSlots = intervals * 2 + self._n = numModules + self._prevInPath: List[int] = [-1] * numModules + self._nextInPath: List[int] = [-1] * numModules + for order in pathOrders: + for a, b in zip(order, order[1:]): + self._prevInPath[b] = a + self._nextInPath[a] = b + self._validators = validators or [] + self._adjusters = adjusters or [] + self._onPlace = onPlace + + self._placed: List[List[Tuple[int, object]]] = [[] for _ in range(self.totalSlots)] + self._firstPos: List[Optional[int]] = [None] * numModules + self._lastPos: List[Optional[int]] = [None] * numModules + self.leftovers: List[Tuple[int, object]] = [] + + # ── Placement ── + + def _canPlace(self, pos: int, inst) -> bool: + if pos < 0 or pos >= self.totalSlots or len(self._placed[pos]) >= 2: + return False + return all(v(self, pos, inst) for v in self._validators) + + def adjustLimit(self, limit: int, inst) -> int: + for adj in self._adjusters: + limit = adj(self, limit, inst) + return limit + + def bounds(self, mid: int) -> Tuple[int, int]: + lo = 0 + pred = self._prevInPath[mid] + if 0 <= pred < self._n and self._lastPos[pred] is not None: + lo = self._lastPos[pred] + 1 + hi = self.totalSlots - 1 + succ = self._nextInPath[mid] + if 0 <= succ < self._n and self._firstPos[succ] is not None: + hi = self._firstPos[succ] - 1 + return lo, hi + + def findSlot(self, mid: int, inst, limit: int, reverse: bool = False) -> Optional[int]: + lo, hi = self.bounds(mid) + if reverse: + hi = min(hi, limit) + else: + lo = max(lo, limit) + if hi < lo: + return None + for pos in (range(hi, lo - 1, -1) if reverse else range(lo, hi + 
1)): + if self._canPlace(pos, inst): + return pos + return None + + def _forceSlot(self, mid: int, limit: int, reverse: bool) -> int: + """Find the closest valid slot respecting dependencies, allowing >2 items per slot.""" + lo, hi = self.bounds(mid) + if reverse: + hi = min(hi, limit) + lo = max(lo, 0) + if hi < lo: + hi = lo + return hi + else: + lo = max(lo, limit) + hi = min(hi, self.totalSlots - 1) + if lo > hi: + lo = hi + return lo + + def place(self, pos: int, item: Tuple[int, object], reverse: bool = False): + mid = item[0] + if reverse: + self._placed[pos].insert(0, item) + else: + self._placed[pos].append(item) + if self._firstPos[mid] is None or pos < self._firstPos[mid]: + self._firstPos[mid] = pos + if self._lastPos[mid] is None or pos > self._lastPos[mid]: + self._lastPos[mid] = pos + if self._onPlace: + self._onPlace(self, pos, item[1]) + + def placePath(self, pathInsts: List[Tuple[int, object]], reverse: bool = False): + """Place a sequence of (moduleId, instruction) items into slots. + + Walks pathInsts in order, applying adjusters (forward only) and + finding valid slots. When no empty slot is found, force-places at + the closest valid position respecting dependencies (allowing >2 + items per slot). 
+ """ + limit = (self.totalSlots - 1) if reverse else 0 + for idx, item in enumerate(pathInsts): + mid, inst = item + if not reverse: + limit = self.adjustLimit(limit, inst) + pos = self.findSlot(mid, inst, limit, reverse=reverse) + if pos is None: + pos = self._forceSlot(mid, limit, reverse) + self.place(pos, item, reverse=reverse) + limit = (pos - 1) if reverse else (pos + 1) + + # ── Assembly ── + + def assemble(self, mfmas) -> Module: + intervals = len(mfmas) - 1 + result = Module() + result.add(mfmas[0]) + for i in range(intervals): + for slot in (2 * i, 2 * i + 1): + for item in self._placed[slot]: + result.add(item[1]) + result.add(mfmas[i + 1]) + for _, inst in self.leftovers: + result.add(inst) + return result + + +# ── Scheduling rules ── + +# Hardcoded gap to hide ds_read latency. TODO: compute this more accurately. +_MIN_MFMA_GAP_DS_READ_TO_WAIT = 4 + +_isDsRead = lambda x: isinstance(x, LocalReadInstruction) +_isBufferLoad = lambda x: isinstance(x, GlobalReadInstruction) +_isWaitCnt = lambda x: isinstance(x, SWaitCnt) +_isM0Update = lambda x: isinstance(x, CommonInstruction) and hasattr(x, 'dst') and hasattr(x.dst, 'regType') and x.dst.regType == 'm' + + +class _SchedulingRules: + """Scheduling rules for slot placement: validators, adjusters, and placement hooks. + + Owns all rule state (ds_read/waitcnt tracking, buffer-load spreading). + Bound methods are passed as callbacks to _SlotPlacer. 
+ """ + + def __init__(self, totalSlots: int): + # Cross-path state + self.lastDsReadPos = -1 + self.earliestWaitCntPos = totalSlots + # Per-path state + self._resetPath() + + def _resetPath(self): + self.firstBufLoadPos: Optional[int] = None + self.bufLoadIdx = 0 + self.bufLoadMaxSlot = 0 + self.numBufLoads = 0 + + # ── Validators: (placer, pos, inst) -> bool ── + + def oneDsReadPerInterval(self, placer, pos, inst): + """At most one ds_read per interval (pair of slots) to avoid same SIMD pair stalls as we have a single codepath""" + if not _isDsRead(inst): + return True + peer = pos ^ 1 + return not (0 <= peer < placer.totalSlots + and any(_isDsRead(item[1]) for item in placer._placed[peer])) + + def minGapDsReadBeforeWait(self, placer, pos, inst): + """Reject ds_read too close to an already-placed waitcnt ahead.""" + if not _isDsRead(inst): + return True + gap = _MIN_MFMA_GAP_DS_READ_TO_WAIT * 2 + return self.earliestWaitCntPos - pos >= gap + + def minGapDsReadToWait(self, placer, pos, inst): + """Reject waitcnt too close to the last placed ds_read.""" + if not _isWaitCnt(inst) or self.lastDsReadPos < 0: + return True + gap = _MIN_MFMA_GAP_DS_READ_TO_WAIT * 2 + return pos - self.lastDsReadPos >= gap + + def noM0WithBufferLoad(self, placer, pos, inst): + """Avoid placing M0 updates and buffer_loads in the same MFMA interval.""" + if not _isM0Update(inst) and not _isBufferLoad(inst): + return True + peer = pos ^ 1 + slots = [pos] + if 0 <= peer < placer.totalSlots: + slots.append(peer) + if _isM0Update(inst): + return not any(_isBufferLoad(item[1]) for s in slots for item in placer._placed[s]) + return not any(_isM0Update(item[1]) for s in slots for item in placer._placed[s]) + + # ── Adjusters: (placer, limit, inst) -> limit ── + + def spreadBufferLoads(self, placer, limit, inst): + """Spread buffer_load instructions evenly across available range.""" + if not _isBufferLoad(inst) or self.bufLoadMaxSlot <= 0: + return limit + if self.firstBufLoadPos is not None: + 
stride = max(1, (self.bufLoadMaxSlot - self.firstBufLoadPos) + // self.numBufLoads) + limit = max(limit, self.firstBufLoadPos + + self.bufLoadIdx * stride) + self.bufLoadIdx += 1 + return limit + + # ── Placement hook: (placer, pos, inst) -> None ── + + def trackPlacement(self, placer, pos, inst): + """Update rule state after a successful placement.""" + if _isDsRead(inst): + self.lastDsReadPos = max(self.lastDsReadPos, pos) + if _isWaitCnt(inst): + self.earliestWaitCntPos = min(self.earliestWaitCntPos, pos) + if _isBufferLoad(inst) and self.firstBufLoadPos is None: + self.firstBufLoadPos = pos + + # ── Per-path setup ── + + def resetPath(self):  # public wrapper that delegates to _resetPath + self._resetPath() + + def setupBufLoadSpreading(self, placer, pathInsts, order): + """Compute buffer-load spreading bounds for a forward path. + + Reserves tail slots for non-buffer-load instructions in modules that + follow the last GR module (e.g. GR_INC SRD updates, LDS buffer swaps). + """ + self.numBufLoads = sum(1 for _, inst in pathInsts if _isBufferLoad(inst)) + if self.numBufLoads > 1: + _, rawMax = placer.bounds(pathInsts[-1][0]) + grModuleIds = {mid for mid, inst in pathInsts if _isBufferLoad(inst)} + lastGrIdx = max(order.index(m) for m in grModuleIds if m in order) + tailModuleIds = set(order[lastGrIdx + 1:]) + numTailInsts = sum(1 for mid, _ in pathInsts if mid in tailModuleIds) + # This is an approximation: we don't know yet exactly how many slots will be used by the modules after the last GR module (in this codepath) + self.bufLoadMaxSlot = max(0, rawMax - numTailInsts) + + +def _classifyPaths(pathOrders, emittedModules): + """Classify paths by wait_gr presence, sorted: wait_gr paths first, then by the first module index of each path (empty paths sort last).""" + paths = [] + for order in pathOrders: + hasWaitGR = any(emittedModules[i].opType == "wait_gr" for i in order) + paths.append((order, hasWaitGR)) + paths.sort(key=lambda p: (0 if p[1] else 1, p[0][0] if p[0] else 10**9)) + return paths + + +def _flattenPath(order, emittedModules, reverse=False): + """Flatten a path of
def extractPathsFromBeforeDeps(emittedModules) -> Tuple[int, List[List[int]], List[List[int]]]:
    """Extract non-MFMA dependency paths using only EmittedModule.before links.

    Each module's ``before`` attribute is looked up as the ``moduleId`` of its
    predecessor. Links that are ``None``, unknown, self-referential, or that
    point at the MFMA module are ignored when building chains.

    Args:
        emittedModules: sequence of objects exposing ``moduleId``, ``opType``
            and ``before``. Exactly one of them must have ``opType == "mfma"``.

    Returns:
        (mfmaIdx, paths, preMfmaPaths)
        - mfmaIdx: index of the MFMA emitted module in emittedModules
        - paths: list of non-MFMA module-index paths to interleave between MFMAs
        - preMfmaPaths: paths that must be emitted before the first MFMA
          (the chain containing the module named by the MFMA's before link)

    Raises:
        AssertionError: if there is not exactly one MFMA module, or if two
            modules name the same predecessor.
    """
    idToIdx = {em.moduleId: i for i, em in enumerate(emittedModules)}
    n = len(emittedModules)

    mfmaModuleIds = [i for i, em in enumerate(emittedModules) if em.opType == "mfma"]
    assert len(mfmaModuleIds) == 1, "extractPathsFromBeforeDeps expects exactly one MFMA emitted module"
    mfmaIdx = mfmaModuleIds[0]
    nonMfmaIds = [i for i in range(n) if i != mfmaIdx]

    def _resolveNonMfma(moduleId) -> int:
        """Map a ``before`` id to a non-MFMA module index, or -1 if it has none."""
        if moduleId is None:
            return -1
        idx = idToIdx.get(moduleId)
        if idx is None or idx == mfmaIdx:
            return -1
        return idx

    # Identify the non-MFMA module the MFMA depends on (if any).
    preMfmaTarget = _resolveNonMfma(emittedModules[mfmaIdx].before)

    # Each non-MFMA module has at most one predecessor, and each predecessor
    # has at most one child, so paths are simple chains.
    pred: List[int] = [-1] * n
    child: List[int] = [-1] * n
    for i in nonMfmaIds:
        parent = _resolveNonMfma(emittedModules[i].before)
        if parent == i:
            # A module naming itself has no usable predecessor.
            parent = -1
        pred[i] = parent
        if parent != -1:
            assert child[parent] == -1, \
                f"extractPathsFromBeforeDeps expects unique child per predecessor, got {child[parent]} and {i} for {parent}"
            child[parent] = i

    def _findHead(mid: int) -> int:
        """Follow pred links up to the chain head; stop if a cycle is detected."""
        cur = mid
        # A set sized by the chain actually walked avoids allocating an
        # n-element scratch list for every chain.
        visited = set()
        while pred[cur] != -1 and cur not in visited:
            visited.add(cur)
            cur = pred[cur]
        return cur

    used = [False] * n
    paths: List[List[int]] = []
    for mid in nonMfmaIds:
        if used[mid]:
            continue
        order: List[int] = []
        cur = _findHead(mid)
        # Marking modules as used while walking also terminates cyclic chains,
        # so no separate per-walk "seen" array is required.
        while cur != -1 and not used[cur]:
            used[cur] = True
            order.append(cur)
            cur = child[cur]
        assert order, f"extractPathsFromBeforeDeps produced empty path for module {mid}"
        paths.append(order)

    # Separate paths that the MFMA depends on (must go before first MFMA).
    preMfmaPaths: List[List[int]] = []
    regularPaths: List[List[int]] = []
    for path in paths:
        if preMfmaTarget != -1 and preMfmaTarget in path:
            preMfmaPaths.append(path)
        else:
            regularPaths.append(path)

    return mfmaIdx, regularPaths, preMfmaPaths
+ - Minimm distance between ds_read and it waitcnt (hardcoded for now) + - Module-internal instruction order is preserved. + - LR path containing a WAIT_GR is packed from the end backwards. We want WAIT_GR to be done as late as possible. + - GR path is spread as much as possible across remaining valid slots. No backwards here as we want GRs to be done as early as possible. + + TODO : To be tested on multi-partition setup. + """ + if not emittedModules: + return Module() + + isMFMA = lambda x: isinstance(x, (MFMAInstruction, MXMFMAInstruction)) + n = len(emittedModules) + + mfmaIdx, pathOrders, preMfmaOrders = extractPathsFromBeforeDeps(emittedModules) + mfmas = [x for x in emittedModules[mfmaIdx].instructions if isMFMA(x)] + + def _emitPreMfma(result): + for order in preMfmaOrders: + for mid in order: + for inst in emittedModules[mid].instructions: + result.add(inst) + + # Single MFMA: no slots to interleave into — emit preMfma, MFMA, then paths. + if len(mfmas) < 2: + result = Module() + _emitPreMfma(result) + for m in mfmas: + result.add(m) + for order in pathOrders: + for mid in order: + for inst in emittedModules[mid].instructions: + result.add(inst) + return result + + paths = _classifyPaths(pathOrders, emittedModules) + rules = _SchedulingRules(totalSlots=(len(mfmas) - 1) * 2) + placer = _SlotPlacer( + len(mfmas) - 1, n, pathOrders, + validators=[rules.oneDsReadPerInterval, rules.minGapDsReadBeforeWait, rules.minGapDsReadToWait, rules.noM0WithBufferLoad], + adjusters=[rules.spreadBufferLoads], + onPlace=rules.trackPlacement) + + for order, hasWaitGR in paths: + if not order: + continue + pathInsts = _flattenPath(order, emittedModules, reverse=hasWaitGR) + rules.resetPath() + if not hasWaitGR: + rules.setupBufLoadSpreading(placer, pathInsts, order) + placer.placePath(pathInsts, reverse=hasWaitGR) + + scheduled = Module() + _emitPreMfma(scheduled) + scheduled.add(placer.assemble(mfmas)) + + # Post-pass: adjust vmcnt of any SWaitCnt to account for buffer_loads + 
# that the scheduler placed before it within this subIterK. + bufLoadCount = 0 + for inst in scheduled.flatitems(): + if _isBufferLoad(inst): + bufLoadCount += 1 + elif _isWaitCnt(inst) and inst.vlcnt >= 0: + inst.vlcnt += bufLoadCount + + return scheduled diff --git a/projects/hipblaslt/tensilelite/Tensile/Components/SubtileBasedKernel.py b/projects/hipblaslt/tensilelite/Tensile/Components/SubtileBasedKernel.py new file mode 100644 index 00000000000..19f1dbfa7dc --- /dev/null +++ b/projects/hipblaslt/tensilelite/Tensile/Components/SubtileBasedKernel.py @@ -0,0 +1,1850 @@ +import math +from collections import deque +from contextlib import contextmanager +from copy import deepcopy +from dataclasses import dataclass, field +from typing import Dict, List, NamedTuple, Optional, Tuple, Type + +from ..Common import printWarning, roundUp, print2, DebugConfig, DataDirection, \ + INDEX_CHARS, IsaVersion + + +from rocisa.code import Module, TextBlock, StructuredModule, KernelBody, Label +from rocisa.label import LabelManager + +from rocisa.container import MUBUFModifiers, vgpr, sgpr, accvgpr, mgpr +from rocisa.enum import InstType, SelectBit, CacheScope +from rocisa.instruction import MFMAInstruction + +import math +from copy import deepcopy +from dataclasses import dataclass, field +from typing import Dict, List, NamedTuple, Optional, Tuple, Type +from contextlib import contextmanager +from collections import deque +from rocisa import rocIsa, countInstruction, countGlobalRead, \ + countLocalRead, countLocalWrite, countDSStoreB256, getMFMAs +from rocisa.asmpass import rocIsaPass, rocIsaPassOption +from rocisa.code import KernelBody, Label, Module, StructuredModule, TextBlock +from rocisa.container import ( + DPPModifiers, DSModifiers, EXEC, HWRegContainer, MUBUFModifiers, + RegisterContainer, VCC, VOP3PModifiers, + accvgpr, mgpr, replaceHolder, sgpr, vgpr, +) +from rocisa.enum import CacheScope, DataTypeEnum, InstType, RegisterType, SelectBit +from rocisa.instruction import 
class ScheduleInfo:
    """Per-schedule bookkeeping of VGPR tile slots for the A and B operands.

    After construction, ``availableVgpr{A,B}Tiles`` are deques holding every
    index into the corresponding ``tileInfo.vgprTiles`` list in ascending
    order, and ``usedVgpr{A,B}Tiles`` are empty dicts.
    NOTE(review): what the "used" dicts are keyed by is decided by callers that
    are not visible here.
    """

    # Class-level placeholders; every instance overwrites them in __init__.
    availableVgprATiles = field(init=False)
    availableVgprBTiles = field(init=False)

    usedVgprATiles = field(init=False)
    usedVgprBTiles = field(init=False)

    def __init__(self, aTileInfo, bTileInfo):
        """Start with all VGPR tile indices of A and B available and none in use.

        Args:
            aTileInfo: object whose ``vgprTiles`` list is already populated.
            bTileInfo: same, for the B operand.
        """
        # TODOBS: check that vgpr tiles are init first before calling these
        self.availableVgprATiles = deque(range(len(aTileInfo.vgprTiles)))
        self.availableVgprBTiles = deque(range(len(bTileInfo.vgprTiles)))

        self.usedVgprATiles = {}
        self.usedVgprBTiles = {}
+ yield vals + + def __len__(self): + return len(self.regValues) + + def __str__(self): + return str(self.regValues) + + class RegisterTileInfo: + tileSize: int = 0 + regList = field(init=False) + + def __init__(self, pool): + self.regList = TileInfo.RegisterList(pool) + + def append(self, val): + self.regList.append(val) + + def index(self, val): + return self.regList.index(val) + + def __iter__(self): + # The generator automatically handles the iteration logic + for vals in self.regList: + yield vals + + def __str__(self): + return str(self.regList) + + class SubtileInfo: + tc: str = field(init=False) + subtileId: List[int] = field(init=False) + + # List of GR that loads this subtile + globalReadMap: List[int] = field(init=False) + # List of LR that loads this subtile + localReadMap: List[int] = field(init=False) + + # Store registers used for constant offsets + useSgpr = field(init=False) + regListId: int = -1 + + def __init__(self, tc, subtileId): + self.tc = tc + self.subtileId = subtileId + self.globalReadMap = [] + self.localReadMap = [] + + tc: str = field(init=False) + bpe: float = 0 + depthUBytes: int = 0 # Num bytes in K dim for all subtiles + subIterKBytes: int = 0 # Num bytes in K dim for one subtile + loadWidthGR: int = 16 # Always assume widest load width for global reads + loadWidthLR: int = 0 # load width in bytes for local reads + isSwizzled: bool = False + + # MMA Shape is w.r.t to data element (not size in bytes) + # + mmaTileShape: List[int] = field(init=False) + mmaTileSize: int = 0 # subtile size in bytes + mmaTileLocalTotalCount: int = 0 # total number of mmaTiles + mmaTileRegCount: float = 0 # number of registers needed for per mma tile for specific A/B matrix + + subtileShape: List[int] = field(init=False) + subtileSize: int = 0 # subtile size in bytes + subtileLocalTotalCount: int = 0 + + globalMMATileGrid: List[int] = field(init=False) + globalSubtileGrid: List[int] = field(init=False) + + localMMATileGrid: List[int] = field(init=False) 
+ localSubtileGrid: List[int] = field(init=False) + + localSubtiles: List[SubtileInfo] = field(init=False) + localSubtilesRegister: List[RegisterList] = field(init=False) + + loadRatioGR: int = 0 + numGRPerSubtile: int = 0 # may not be needed + numGRTotal: int = 0 + + loadRatioLR: int = 0 + numLRPerSubtile: int = 0 # may not be needed + numLRTotal: int = 0 + + sharedVgprGROffset: List[int] = field(init=False) + sharedVgprLROffset: List[int] = field(init=False) + sharedVgprLROffsetSwap: List[int] = field(init=False) + + vgprTileFactor: float = 1.0 + # VGPR buffers available for this tile + vgprTiles: List[RegisterTileInfo] = field(init=False) + + # MX scale fields (set for A/B when mxBlock > 0, else 0) + mxBlock: int = 0 + + def __init__(self, tc, kernel): + isAB = tc in ['A', 'B'] + isMXSAB = tc in ['MXSA', 'MXSB'] + + self.subtileShape = [1, 2] + + self.tc = tc + self.isSwizzled = isMXSAB + + isA = tc in ['A', 'MXSA'] + _tc = 'A' if isA else 'B' + + if isAB or isMXSAB: + + # TODO query vgpr factors from kernel + self.vgprTileFactor = 1.0 if tc == 'A' else 1.0 + miWaveGroupSize0 = kernel["MIWaveGroup"][0 if isA else 1] + miWaveGroupSize1 = 1 + + macroTile = kernel["MacroTileA"] if isA else kernel["MacroTileB"] + depthU = kernel["_DepthU%s"%tc] + # TODO: Need to update ProblemType to query scale size? 
+ bpe = kernel["ProblemType"]["DataType%s"%tc].numBytes() if isAB else 1 + self.bpe = bpe + self.depthUBytes = int(depthU * bpe) + + numWaves = kernel["MIWaveGroup"][0] * kernel["MIWaveGroup"][1] + + # Always assumes widest load is used + loadWidth = 128 + # Number of bytes loaded per wave with widest load width, global + numBytesGRPerWave = (loadWidth // 8) * kernel["WavefrontSize"] + # Number of bytes loaded per wave with widest load width, local + numBytesLRPerWave = (loadWidth // 8) * kernel["WavefrontSize"] + + # MMA Tile Shape is based on matrix instruction + mmaTileShape0 = kernel["MatrixInstM"] + mmaTileShape1 = kernel["MatrixInstK"] + if isMXSAB: + mmaTileShape1 //= kernel["ProblemType"].get("MXBlock%s"%_tc) + self.mmaTileShape = [mmaTileShape0, mmaTileShape1] + mmaTileGrid0 = macroTile // mmaTileShape0 + mmaTileGrid1 = depthU // mmaTileShape1 + + subtileShape0 = self.subtileShape[0] + subtileShape1 = self.subtileShape[1] + else: # Tile info for C matrix + # TODOBS: check if 'C' or 'D'.. 
decide which to use + self.vgprTileFactor = 1.0 + miWaveGroupSize0 = kernel["MIWaveGroup"][0] + miWaveGroupSize1 = kernel["MIWaveGroup"][1] + + macroTile = kernel["MacroTile0"] + depthU = kernel["MacroTile1"] + bpe = kernel["ProblemType"]["ComputeDataType"].numBytes() + self.bpe = bpe + self.depthUBytes = int(depthU * bpe) + + numWaves = kernel["MIWaveGroup"][0] * kernel["MIWaveGroup"][1] + + # Always assumes widest load is used + loadWidth = 128 + # Number of bytes loaded per wave with widest load width, global + numBytesGRPerWave = (loadWidth // 8) * kernel["WavefrontSize"] + # Number of bytes loaded per wave with widest load width, local + numBytesLRPerWave = (loadWidth // 8) * kernel["WavefrontSize"] + + # MMA Tile Shape is based on matrix instruction + mmaTileShape0 = kernel["MatrixInstM"] + mmaTileShape1 = kernel["MatrixInstM"] + self.mmaTileShape = [mmaTileShape0, mmaTileShape1] + mmaTileGrid0 = macroTile // mmaTileShape0 + mmaTileGrid1 = depthU // mmaTileShape1 + + subtileShape0 = self.subtileShape[0] + subtileShape1 = self.subtileShape[0] + + assert kernel["MatrixInstM"] == 16, \ + "SubtileBasedKernel only supports MatrixInstM=16, got %u" % kernel["MatrixInstM"] + assert kernel["MatrixInstK"] in (32, 128), \ + "SubtileBasedKernel only supports MatrixInstK=32 (bf16) or MatrixInstK=128 (mxfp4), got %u" % kernel["MatrixInstK"] + + self.mmaTileSize = int(mmaTileShape0 * mmaTileShape1 * bpe) + self.loadWidthLR = self.mmaTileSize // kernel["WavefrontSize"] + # Number of registers needed for one tile, count w.r.t dword + self.mmaTileRegCount = (self.mmaTileSize // kernel["WavefrontSize"]) / 4 + # Number of mma tiles for each wave + self.mmaTileLocalTotalCount = (mmaTileGrid0 // miWaveGroupSize0) * (mmaTileGrid1 // miWaveGroupSize1) + + # Subtile Shape is w.r.t of units of mma tile + + self.subtileSize = subtileShape0 * subtileShape1 * self.mmaTileSize + + # TODO: This won't be needed if we assume all loads are split + # Compute number of mfma tiles globally in 
MT0 dim + + self.globalMMATileGrid = [mmaTileGrid0, mmaTileGrid1] + self.globalSubtileGrid = [mmaTileGrid0 // subtileShape0, mmaTileGrid1 // subtileShape1] + + # Compute number of mfma tiles locally (wave pov) + self.localMMATileGrid = deepcopy(self.globalMMATileGrid) + self.localSubtileGrid = deepcopy(self.globalSubtileGrid) + self.localMMATileGrid[0] //= miWaveGroupSize0 + self.localSubtileGrid[0] //= miWaveGroupSize0 + self.localMMATileGrid[1] //= miWaveGroupSize1 + self.localSubtileGrid[1] //= miWaveGroupSize1 + + self.subIterKBytes = self.depthUBytes // self.localSubtileGrid[1] + + self.subtileLocalTotalCount = self.localSubtileGrid[0] * self.localSubtileGrid[1] + + # Allocate subtileInfo structs + self.localSubtiles = [] + self.localSubtilesRegister = [] + for sId0 in range(self.localSubtileGrid[0]): + for sId1 in range(self.localSubtileGrid[1]): + self.localSubtiles.append(TileInfo.SubtileInfo(tc, [sId0, sId1])) + + if isAB or isMXSAB: + # Compute load ratio + # Represents the amount of subtiles fetched by a single global load across all waves + # < 1 means a global load fetches multiple subtiles + self.loadRatioGR = (numBytesGRPerWave * numWaves) / self.subtileSize / miWaveGroupSize0 + self.numGRPerSubtile = int(math.ceil(1/self.loadRatioGR)) + self.numGRTotal = int((self.localSubtileGrid[0] * self.localSubtileGrid[1]) / self.loadRatioGR) + + # Compute load ratio + # Represents the amount of subtiles fetched by a single ds_read + # < 1 means a global load fetches multiple subtiles + self.loadRatioLR = (numBytesLRPerWave) / self.subtileSize + self.numLRPerSubtile = int(math.ceil(1/self.loadRatioLR)) + self.numLRTotal = int((self.localSubtileGrid[0] * self.localSubtileGrid[1]) / self.loadRatioLR) + + # Scale tensor geometry (MX block scaling) + mxBlockKey = "MXBlock%s"%_tc + self.mxBlock = kernel["ProblemType"].get(mxBlockKey, 0) + + # Map subtiles to GR + for sId0 in range(self.localSubtileGrid[0]): + for sId1 in range(self.localSubtileGrid[1]): + linearId = 
self.getLocalSubtileLinearId(sId0, sId1) + subtileInfo = self.localSubtiles[linearId] + baseGR = math.floor(linearId / self.loadRatioGR) + for nGL in range(self.numGRPerSubtile): + subtileInfo.globalReadMap.append(baseGR + nGL) + baseLR = math.floor(linearId / self.loadRatioLR) + for nLL in range(self.numLRPerSubtile): + subtileInfo.localReadMap.append(baseLR + nLL) + # print("GR map", sId0, sId1, subtileInfo.globalReadMap) + # print("LR map", sId0, sId1, subtileInfo.localReadMap) + + + + def __str__(self): + lines = [ + f"TileInfo(tc={self.tc})", + f" mmaTileShape: {self.mmaTileShape if isinstance(getattr(self, 'mmaTileShape', None), list) else 'not set'}", + f" mmaTileSize: {self.mmaTileSize} bytes", + f" mmaTileRegCount: {self.mmaTileRegCount}", + f" mmaTileLocalTotalCount: {self.mmaTileLocalTotalCount}", + f" subtileShape: {self.subtileShape}", + f" subtileSize: {self.subtileSize} bytes", + f" subtileLocalTotalCount: {self.subtileLocalTotalCount}", + f" globalMMATileGrid: {self.globalMMATileGrid}", + f" globalSubtileGrid: {self.globalSubtileGrid}", + f" localMMATileGrid: {self.localMMATileGrid}", + f" localSubtileGrid: {self.localSubtileGrid}", + f" mxBlock: {self.mxBlock}", + f" loadRatioGR: {self.loadRatioGR}", + f" numGRPerSubtile: {self.numGRPerSubtile}", + f" numGRTotal: {self.numGRTotal}", + f" loadRatioLR: {self.loadRatioLR}", + f" numLRPerSubtile: {self.numLRPerSubtile}", + f" numLRTotal: {self.numLRTotal}", + f" vgprTileFactor: {self.vgprTileFactor}", + f" vgprTiles: {[str(t) for t in self.vgprTiles] if isinstance(getattr(self, 'vgprTiles', None), list) else 'not allocated'}", + f" sharedVgprGROffset: {self.sharedVgprGROffset if isinstance(getattr(self, 'sharedVgprGROffset', None), list) else 'not allocated'}", + f" sharedVgprLROffset: {self.sharedVgprLROffset if isinstance(getattr(self, 'sharedVgprLROffset', None), list) else 'not allocated'}", + ] + return "\n".join(lines) + + #################################### + # Given 2d local mma tile Id, 
return 2d id for local subtile containing that tile + def getLocalSubtileIdFromMMATile(self, mmaId0, mmaId1): + return [mmaId0 // self.subtileShape[0], mmaId1 // self.subtileShape[1]] + + def getLocalSubtileLinearId(self, sId0, sId1): + # Returns linear id for subtiles assumes block col major format + return sId1 * self.localSubtileGrid[0] + sId0 + + def getLocalSubtileIdFromLinearId(self, linearId): + sId0 = linearId % self.localSubtileGrid[0] + sId1 = linearId // self.localSubtileGrid[0] + return [sId0, sId1] + + def getSubtileShapeLinearId(self, k0, k1): + # Returns linear id within a subtile, col major + return k1 * self.subtileShape[0] + k0 + + def getLocalMMATileLinearId(self, mmaId0, mmaId1): + # Returns linear id for subtiles assumes block col major format + return mmaId1 * self.localMMATileGrid[0] + mmaId0 + + def allocOffsetRegisters(self, writer, kernel): + self.sharedVgprGROffset = [] + self.sharedVgprLROffset = [] + self.sharedVgprLROffsetSwap = [] + + isSwizzledScales = self.isSwizzled and self.tc in ['MXSA', 'MXSB'] + + # Allocate share vgprs for GR + for i in range(self.numGRPerSubtile): + self.sharedVgprGROffset.append(writer.vgprPool.checkOut(1)) + + # Allocate shared vgprs for LR + for i in range(self.numLRPerSubtile): + self.sharedVgprLROffset.append(writer.vgprPool.checkOut(1)) + self.sharedVgprLROffsetSwap.append(writer.vgprPool.checkOut(1)) + + # For swizzled scale layout, we assume we can stream. 
+ # So only need shared vgprs for GR + if isSwizzledScales: + return + + # Allocate registers for each subtile + # TODOBS: Check TLU instead of hardcoding False + perpDimSize = (self.localSubtileGrid[1] if False else self.localSubtileGrid[0]) + if self.loadRatioGR == 2.0: + perpDimSize = math.ceil(perpDimSize / self.loadRatioGR) + + # TOBODS: Can this be done better + for reg in range(perpDimSize): + tmpSgprBuffer = 3 # Hardcoded for now, the amount of sgprs to use for temps + sgprLimit = writer.states.regCaps["MaxSgpr"] - tmpSgprBuffer + regPool = writer.sgprPool if writer.sgprPool.size() < sgprLimit else writer.vgprPool + self.localSubtilesRegister.append(TileInfo.RegisterList(regPool)) + # No registers needed for perp 0 + if reg == 0: + continue + if regPool == writer.sgprPool: + # TODOBS: Need to prevent overflow here, better way to do it? + self.localSubtilesRegister[-1].append(regPool.checkOut(1, preventOverflow=False)) + else: + for i in range(self.numGRPerSubtile): + self.localSubtilesRegister[-1].append(regPool.checkOut(1, preventOverflow=False)) + # Iterate through subtiles and allocate vgpr/sgpr if needed + linearId = 0 + for st in self.localSubtiles: + + # Get 2D Id for subtile + sId0, sId1 = self.getLocalSubtileIdFromLinearId(linearId) + linearId += 1 + + # TODOBS: Check TLU instead of hardcoding + slowId = sId1 if False else sId0 + # Only associate a SGPR to 1 other subtile when loadRatioGR == 2.0 + if self.loadRatioGR == 2.0: + slowId = int(slowId // self.loadRatioGR) + st.regListId = slowId + st.useSgpr = self.localSubtilesRegister[slowId].regPool == writer.sgprPool + + def allocVgprTileRegisters(self, writer, kernel, schedulerManaged=False): + self.vgprTiles = [] + + numMMATiles = self.localMMATileGrid[0] * self.localMMATileGrid[1] + numMMATilesPerReg = max(1, int(1//self.mmaTileRegCount)) + + for i in range(int(self.vgprTileFactor * numMMATiles)): + # Determine which pool to allocate registers from + if self.tc in ['A', 'B', 'MXSA', 'MXSB']: + 
self.vgprTiles.append(TileInfo.RegisterTileInfo(writer.vgprPool)) + else: + useAgpr = True + if useAgpr: + maxAgpr = writer.states.regCaps["PhysicalMaxVgpr"] - writer.states.regCaps["MaxVgpr"] + # TODOBS: agpr limit is hardcoded here.. fix + if writer.agprPool.size() < maxAgpr: + self.vgprTiles.append(TileInfo.RegisterTileInfo(writer.agprPool)) + else: + self.vgprTiles.append(TileInfo.RegisterTileInfo(writer.vgprPool)) + + if i % numMMATilesPerReg != 0: + continue + # TODOBS: Hard code this block for now? + numDword = int(math.ceil(self.mmaTileRegCount)); + for j in range(0, numDword, numDword): + pool = self.vgprTiles[-1].regList.regPool + vstart = pool.checkOutAligned(numDword,numDword) + for k in range(numDword): + self.vgprTiles[-1].append(vstart + k) + + def allocScaleVgprTiles(self, writer, kernel): + if self.mxBlock == 0: + return + numScaleVgprs = math.ceil(self.localMMATileGrid[0] / 2) + self.scaleVgprTiles = [] + for i in range(numScaleVgprs): + self.scaleVgprTiles.append(writer.vgprPool.checkOut(1)) + + def deallocOffsetRegisters(self, writer, kernel): + # checkin GR registers + for voff in self.sharedVgprGROffset: + writer.vgprPool.checkIn(voff) + # checkin LR registers + for voff in self.sharedVgprLROffset: + writer.vgprPool.checkIn(voff) + # checkin LR registers + for voff in self.sharedVgprLROffsetSwap: + writer.vgprPool.checkIn(voff) + + for reg in self.localSubtilesRegister: + regPool = reg.regPool + for val in reg.regValues: + regPool.checkIn(val) + + def deallocScaleVgprTiles(self, writer, kernel): + for sv in self.scaleVgprTiles: + writer.vgprPool.checkIn(sv) + self.scaleVgprTiles = [] + + def deallocVgprTileRegisters(self, writer, kernel): + numMMATilesPerReg = max(1, int(1 // self.mmaTileRegCount)) + for i, vtiles in enumerate(self.vgprTiles): + if i % numMMATilesPerReg != 0: + continue + pool = vtiles.regList.regPool + if vtiles.regList.regValues: + pool.checkIn(vtiles.regList.regValues[0]) + +def _computeLROffset(module, kernel, tileInfo, 
colOffset, rowOffset): + tc = tileInfo.tc + wavesize = kernel["WavefrontSize"] + subIterKBytes = tileInfo.subIterKBytes + loadWidth = tileInfo.loadWidthLR + numMFMACols = int(tileInfo.mmaTileShape[1] * tileInfo.bpe) // loadWidth # TN case only + blockSize = subIterKBytes // loadWidth + + module.add(VMovB32(dst=vgpr(tileInfo.sharedVgprLROffset[0]), src=vgpr(colOffset), comment="%s: laneId"%tc)) + for vgprId in range(1, len(tileInfo.sharedVgprLROffset)): + module.add(VAddU32(dst=vgpr(tileInfo.sharedVgprLROffset[vgprId]), src0=vgpr(tileInfo.sharedVgprLROffset[vgprId-1]), src1=hex(numMFMACols), comment="%s: colOffset for MFMA %u of subtile"%(tc, vgprId))) + module.add(VAndB32(dst=vgpr(tileInfo.sharedVgprLROffset[vgprId]), src0=vgpr(tileInfo.sharedVgprLROffset[vgprId]), src1=hex(blockSize-1), comment="%s: colOffset = colOffset %% block_size"%tc)) + + for vgprId in range(0, len(tileInfo.sharedVgprLROffset)): + module.add(VLShiftLeftB32(dst=vgpr(tileInfo.sharedVgprLROffset[vgprId]), shiftHex=hex(loadWidth.bit_length()-1), src=vgpr(tileInfo.sharedVgprLROffset[vgprId]), comment="%s: colOffset*loadWidth"%tc)) + module.add(VAddU32(dst=vgpr(tileInfo.sharedVgprLROffset[vgprId]), src0=vgpr(tileInfo.sharedVgprLROffset[vgprId]), src1=vgpr(rowOffset), comment="%s: row + col"%tc)) + +def _applyWavePartitionLROffset(module, writer, kernel, tileInfo): + """Apply wave-based partition offset to LR offsets. + + loadRatioGR >= 2.0: no partition needed, contiguous subtiles (1x4 for A , 4x1 for B) + loadRatioGR == 1.0: 2x2 config, each wave loads half of the subtile + loadRatioGR == 0.5: 4x1 for A , 1x4 for B. 
def _lraWavePartitioning(module, writer, kernel):
    """Apply the wave partition LR offset to the A tile, then to the B tile.

    Both tile descriptors are read from ``writer.states`` and handed, in that
    order, to ``_applyWavePartitionLROffset``, which appends to ``module``.
    """
    tileInfos = (writer.states.a.tileInfo, writer.states.b.tileInfo)
    for tileInfo in tileInfos:
        _applyWavePartitionLROffset(module, writer, kernel, tileInfo)
src=hex(maskLo), comment="exec mask lo")) + module.add(SMovB32(dst=sgpr(tmpSgpr+1), src=hex(maskHi), comment="exec mask hi")) + module.add(SMovB64(dst=EXEC(), src=sgpr(tmpSgpr, 2), comment="Set exec mask")) + writer.sgprPool.checkIn(tmpSgpr) + +################################################## +# Subroutine to generate LR offset calculation code +# +def lraTileAssignment(writer, kernel): + module = Module() + module.addComment0("LR Offset Calculation for Subtile Based Tiling") + + tileInfoA = writer.states.a.tileInfo + tileInfoB = writer.states.b.tileInfo + + # Input Parameters. + subIterKBytes = tileInfoA.subIterKBytes + wavesize = kernel["WavefrontSize"] + + mi_m = tileInfoA.mmaTileShape[0] + loadWidth = tileInfoA.loadWidthLR + ldsRowBankSize = 64*4 # 64 banks, 4 bytes per bank + numRowsPerLDSBanks = ldsRowBankSize // subIterKBytes + assert tileInfoA.mmaTileShape == tileInfoB.mmaTileShape, "Expect same MMA tile shape for A and B" + + blockSize = subIterKBytes // loadWidth + + tmpVgpr = writer.vgprPool.checkOut(6) + lane16, lane16Group, rotation, rowOffset, colOffset = range(tmpVgpr, tmpVgpr + 5) + + # Calculate lane16 and lane16Group for current wave (used by MFMA layout) + module.add(VAndB32(dst=vgpr(lane16Group), src0=vgpr("Serial"), src1=wavesize-1, comment="laneId")) + module.add(VLShiftRightB32(dst=vgpr(lane16Group), shiftHex=hex(mi_m.bit_length()-1), src=vgpr(lane16Group), comment="lane16Group")) + module.add(VAndB32(dst=vgpr(lane16), src0=vgpr("Serial"), src1=mi_m-1, comment="laneId % 16")) + + swizzling = True + if swizzling: + # Get lds row id + module.add(VLShiftRightB32(dst=vgpr(rotation), shiftHex=hex(numRowsPerLDSBanks.bit_length()-1), src=vgpr(lane16), comment="lds_row_id")) + module.add(VLShiftRightB32(dst=vgpr(rotation), shiftHex=hex(1), src=vgpr(rotation), comment="(lds_row_id //2 )")) + # Calculate rotation + module.add(VLShiftLeftB32(dst=vgpr(rotation), shiftHex=hex(1), src=vgpr(rotation), comment="rotation=(lds_row_id //2) * 2")) + # Apply 
rotation on Col + module.add(VAddU32(dst=vgpr(colOffset), src0=vgpr(rotation), src1=vgpr(lane16Group), comment="colOffset = rotation + lane16Group")) + module.add(VAndB32(dst=vgpr(colOffset), src0=vgpr(colOffset), src1=hex(blockSize-1), comment="colOffset = colOffset % blockSize")) + # Swizzle col + setExecMask(module, writer, 0x33333333, 0x33333333) + module.add(VPermlane16SwapB32(dst=vgpr(colOffset), src=vgpr(colOffset), comment="apply swizzling")) + setExecMask(module, writer, -1, -1) + else: + module.add(VMovB32(dst=vgpr(colOffset), src=vgpr(lane16Group), comment="colOffset = lane16Group")) + + # Row + module.add(VLShiftLeftB32(dst=vgpr(rowOffset), shiftHex=hex(subIterKBytes.bit_length()-1), src=vgpr(lane16), comment="offsetRow = subIterKBytes*lane16")) + + # Calculate LR offset for A and B + _computeLROffset(module, kernel, tileInfoA, colOffset, rowOffset) + _computeLROffset(module, kernel, tileInfoB, colOffset, rowOffset) + + writer.vgprPool.checkIn(tmpVgpr) + + # Wave partitioning (e.g. 
def _zeroRegRange(module, writer, tileInfo, firstReg, totalRegs, isAgpr):
    """Emit instructions that zero ``totalRegs`` registers starting at ``firstReg``.

    Full groups of 16 registers are cleared with one MFMA each, fed from a
    zeroed VGPR pair; the remaining registers are cleared one by one.

    Args:
        module: instruction container the generated code is appended to.
        writer: kernel writer; its ``vgprPool`` supplies the temporary pair.
        tileInfo: only ``tileInfo.tc`` is read, for the instruction comments.
        firstReg: index of the first register in the range.
        totalRegs: number of consecutive registers to clear.
        isAgpr: True to address accumulation registers, False for VGPRs.
    """
    if isAgpr:
        regAlias, scalarZeroInst = accvgpr, VAccvgprWrite
    else:
        regAlias, scalarZeroInst = vgpr, VMovB32

    regsPerMfma = 16
    numMfma = totalRegs // regsPerMfma
    bulkRegs = numMfma * regsPerMfma  # registers covered by the MFMA groups

    if numMfma > 0:
        # Temporary 64-bit zero operand shared by every MFMA below.
        zeroPair = writer.vgprPool.checkOutAligned(2, 2)
        module.add(VMovB64(dst=vgpr(zeroPair, 2), src=0, comment=""))
        module.add(SNop(waitState=1, comment="wait for vgpr to be ready before MFMA"))
        for blockStart in range(firstReg, firstReg + bulkRegs, regsPerMfma):
            blockEnd = blockStart + regsPerMfma - 1
            module.add(MFMAInstruction(instType=InstType.INST_I8, accType=InstType.INST_I32,
                                       variant=[32, 32, 16, 1], mfma1k=False,
                                       acc=regAlias(blockStart, regsPerMfma),
                                       a=vgpr(zeroPair, 2), b=vgpr(zeroPair, 2),
                                       acc2=0,
                                       comment="init%s: [%u:%u]"%(tileInfo.tc, blockStart, blockEnd)))
        writer.vgprPool.checkIn(zeroPair)

    # Registers that do not fill a whole group of 16 are written individually.
    for reg in range(firstReg + bulkRegs, firstReg + totalRegs):
        module.add(scalarZeroInst(dst=regAlias(reg), src=0, comment="init%s"%(tileInfo.tc)))
tileInfo.vgprTiles: + pool = tile.regList.regPool + numRegs = len(tile.regList.regValues) + if pool != curPool: + _zeroRegRange(module, writer, tileInfo, firstReg, totalRegs, curPool == writer.agprPool) + firstReg = tile.regList.regValues[0] + totalRegs = numRegs + curPool = pool + else: + totalRegs += numRegs + + _zeroRegRange(module, writer, tileInfo, firstReg, totalRegs, curPool == writer.agprPool) + + return module + + +def localReadResetOffsetsSubtile(writer, kernel): + module = Module() + module.addComment0("REMOVE WHEN IMPLEMNTED: Placeholder for subtile based LR offset reset code") + for i in range(8): + module.addComment("") + + return module + + +################################################## +# Subroutine to generate GR offset calculation code +# +def graInitPointer(writer, kernel): + module = Module() + module.addComment0("REMOVE WHEN IMPLEMNTED: Placeholder for GR base pointer init") + for i in range(8): + module.addComment("") + + return module + + +################################################## +# Compute GR offset for a single matrix (A or B) +# +def _grComputeOffset(module, writer, tileInfo, colId, rowId, output): + tc = tileInfo.tc + bpeBits = int(8*tileInfo.bpe) + + tmpVgpr = writer.vgprPool.checkOut(2) + colBytes = tmpVgpr + 1 + loadWidth = tileInfo.loadWidthGR + + module.add(VLShiftLeftB32(dst=vgpr(colBytes), shiftHex=hex(loadWidth.bit_length()-1), src=vgpr(colId), comment="scale col_id by load_width")) + MT0 = tileInfo.globalMMATileGrid[0] * tileInfo.mmaTileShape[0] + subtileSize = tileInfo.subtileShape[0]*tileInfo.mmaTileShape[0] + strideRef = "StrideA0I" if tc == 'A' else "StrideB1J" + module.add(VMulLOU32(dst=vgpr(tmpVgpr), src0=sgpr(strideRef), src1=vgpr(rowId), comment="%s: rowId * stride"%tc)) + module.add(VLShiftLeftB32(dst=vgpr(tmpVgpr), shiftHex=hex(bpeBits.bit_length()-1), src=vgpr(tmpVgpr), comment="%s: rowId*stride*bpe"%tc)) + module.add(VLShiftRightB32(dst=vgpr(tmpVgpr), shiftHex=hex(3), src=vgpr(tmpVgpr), comment="to 
bytes")) + module.add(VAddU32(dst=vgpr(output), src0=vgpr(colBytes), src1=vgpr(tmpVgpr), comment="%s: GR row_offset"%tc)) + writer.vgprPool.checkIn(tmpVgpr) + +################################################## +# Compute subtile perpendicular offsets for a single matrix +# +# TODO: need to generalize this to support TLU=1 +def _grComputeSubtileOffsets(writer, module, tileInfo): + tc = tileInfo.tc + strideRef = "StrideA0I" if tc == 'A' else "StrideB1J" + subtile_size = tileInfo.subtileShape[0]*tileInfo.mmaTileShape[0] + # rowOffset between 2 subtiles offset, ie how many consecutive subtile covered by a single subtileOffset. + # rowOffset = numGRPerSubtile * (local load ratio * subtile size) + rowOffset = math.ceil(tileInfo.numGRPerSubtile*tileInfo.loadRatioGR*subtile_size) + s_stride = int(rowOffset * tileInfo.bpe) + + for regId in range(len(tileInfo.localSubtilesRegister)): + regPool = tileInfo.localSubtilesRegister[regId].regPool + for reg in tileInfo.localSubtilesRegister[regId]: + if regPool == writer.sgprPool: + module.add(SMulI32(dst=sgpr(reg), src0=hex(s_stride * regId), src1=sgpr(strideRef), comment="%s: %u rows offset, stride %u, %u"%(tc, rowOffset, s_stride, regId))) + else: + stmp = writer.sgprPool.checkOut(1) + idx = tileInfo.localSubtilesRegister[regId].index(reg) + module.add(SMulI32(dst=sgpr(stmp), src0=hex(s_stride * regId), src1=sgpr(strideRef), comment="%s: %u rows offset, stride %u, %u"%(tc, rowOffset, s_stride, regId))) + module.add(VAddU32(dst=vgpr(reg), src0=vgpr(tileInfo.sharedVgprGROffset[idx]), src1=sgpr(stmp))) + writer.sgprPool.checkIn(stmp) + +# Compute wave partition offset for a single tile (A or B) +# +def _grComputeRowPartition(module, kernel, writer, tileInfo, waveId, rowOffset): + subIterKBytes = tileInfo.subIterKBytes + wavesize = kernel["WavefrontSize"] + loadWidth = tileInfo.loadWidthGR + numRowsPerWave = wavesize // (subIterKBytes // loadWidth) + tc = tileInfo.tc + tmpVgpr = writer.vgprPool.checkOut(2) + tmpSgpr = 
writer.sgprPool.checkOut(1, preventOverflow=False) + localRow = tmpVgpr + partitionRow = tmpVgpr+1 + partitionOffset = tileInfo.mmaTileShape[0]*tileInfo.localSubtileGrid[0] + module.add(SMovB32(dst=sgpr(tmpSgpr), src=partitionOffset, comment="%s: row offset"%tc)) + + if tileInfo.loadRatioGR == 1.0: + module.add(VAndB32(dst=vgpr(localRow), src0=hex(1), src1=vgpr(waveId), comment="%s: waveId %% 2"%tc)) + module.add(VLShiftRightB32(dst=vgpr(partitionRow), shiftHex=hex(1), src=vgpr(waveId), comment="%s: waveId / 2"%tc)) + elif tileInfo.loadRatioGR == 0.5: + module.add(VMovB32(dst=vgpr(localRow), src=0, comment="%s"%tc)) + module.add(VMovB32(dst=vgpr(partitionRow), src=vgpr(waveId), comment="%s"%tc)) + elif tileInfo.loadRatioGR == 2.0: + module.add(VMovB32(dst=vgpr(localRow), src=vgpr(waveId), comment="%s"%tc)) + module.add(VMovB32(dst=vgpr(partitionRow), src=0, comment="%s"%tc)) + else: + raise NotImplementedError("Unsupported loadRatioGR for wave partition: %s"%str(tileInfo.loadRatioGR)) + + module.add(VLShiftLeftB32(dst=vgpr(localRow), shiftHex=hex(numRowsPerWave.bit_length()-1), src=vgpr(localRow), comment="%s: local row offset"%tc)) + module.add(VMulLOU32(dst=vgpr(partitionRow), src0=sgpr(tmpSgpr), src1=vgpr(partitionRow), comment="%s: wave row offset"%tc)) + module.add(VAddU32(dst=vgpr(rowOffset), src0=vgpr(localRow), src1=vgpr(partitionRow), comment="%s: row offset"%tc)) + + + writer.vgprPool.checkIn(tmpVgpr) + writer.sgprPool.checkIn(tmpSgpr) + +################################################## +# Compute GR offsets for all subtiles of a single matrix (A or B) +# +def _grComputeAllOffsets(module, writer, tileInfo, colId, rowId, rowOffset): + module.add(VAddU32(dst=vgpr(rowOffset), src0=vgpr(rowId), src1=vgpr(rowOffset), comment="%s: row offset"%tileInfo.tc)) + _grComputeOffset(module, writer, tileInfo, colId, rowOffset, tileInfo.sharedVgprGROffset[0]) + for i in range(1, len(tileInfo.sharedVgprGROffset)): + subtileSize = tileInfo.subtileShape[0] * 
tileInfo.mmaTileShape[0] + offset = math.ceil(subtileSize * tileInfo.loadRatioGR) + module.add(VAddU32(dst=vgpr(rowOffset), src0=offset, src1=vgpr(rowOffset), comment="%s: advance row for GR offset %u"%(tileInfo.tc, i))) + + # Apply Rotation on entire wave. Only applies to 4x case as a subtile is loaded by a single wave in 2 steps. (waveId rotation not applied) + rotatedcolId = writer.vgprPool.checkOut(1) + loadWidth = tileInfo.loadWidthGR + if tileInfo.loadRatioGR == 0.5: + blockSize = tileInfo.subIterKBytes // loadWidth + module.add(VAddU32(dst=vgpr(rotatedcolId), src0=4, src1=vgpr(colId), comment="%s: advance row for GR offset %u"%(tileInfo.tc, i))) + module.add(VAndB32(dst=vgpr(rotatedcolId), src0=vgpr(rotatedcolId), src1=hex(blockSize-1), comment="(col + offset) % block_size")) + else: + module.add(VMovB32(dst=vgpr(rotatedcolId), src=vgpr(colId), comment="")) + + _grComputeOffset(module, writer, tileInfo, rotatedcolId, rowOffset, tileInfo.sharedVgprGROffset[i]) + writer.vgprPool.checkIn(rotatedcolId) + +################################################## +# Apply swizzling and rotation to col IDs for GR offset calculation. +# +# Swizzling reorders column indices to avoid LDS bank conflicts. +# Two levels of rotation are applied to the column IDs: +# 1. Intra-wave rotation: rotates colId based on the LDS row id within +# a single wave. The rotation offset is: blockSize - (ldsRowId // 2) * 2. +# This ensures consecutive rows access different LDS banks. +# 2. Inter-wave rotation: an additional per-wave offset derived from waveId +# shifts the column further so that different waves also avoid bank +# conflicts with each other. Only applied when loadRatioGR != 0.5 +# (i.e. when multiple waves share the same subtile region). 
#
def _grSwizzleColIds(module, writer, tileInfoA, tileInfoB, blockSize, numRowsPerLDSBanks,
                     laneId, colIdA, colIdB, waveId):
    """Emit the column-id swizzle and rotation used by the GR offset calculation.

    colIdA is swizzled in place; colIdB is then overwritten with a copy of the
    swizzled colIdA.  Afterwards each id receives its rotation and is wrapped
    with a (blockSize - 1) mask.

    Args:
        module: Module that receives the emitted instructions.
        writer: kernel writer; only its vgprPool is used (3 scratch VGPRs).
        tileInfoA, tileInfoB: tile infos; only loadRatioGR is read here.
        blockSize: columns per row; used as a shift amount and as a mask, so a
            power of two is expected.
        numRowsPerLDSBanks: used as a shift amount (power of two expected).
        laneId, colIdA, colIdB, waveId: VGPR indices.
    """
    emit = module.add
    scratch = writer.vgprPool.checkOut(3)
    bankRowId = scratch
    rotation = scratch + 1
    waveShift = scratch + 2

    module.addComment0("Swizzling")
    # bankRowId = (laneId >> log2(blockSize)) >> log2(numRowsPerLDSBanks)
    emit(VLShiftRightB32(dst=vgpr(bankRowId),
                         shiftHex=hex(blockSize.bit_length()-1),
                         src=vgpr(laneId),
                         comment="row id within wave"))
    emit(VLShiftRightB32(dst=vgpr(bankRowId),
                         shiftHex=hex(numRowsPerLDSBanks.bit_length()-1),
                         src=vgpr(bankRowId),
                         comment="lds row id"))
    emit(VAndB32(dst=vgpr(rotation),
                 src0=vgpr(bankRowId),
                 src1=hex(1),
                 comment="lds row id % 2"))
    # NOTE(review): the compare presumably narrows EXEC to the lanes whose lds
    # row id is even, so the DPP move below only swaps pairs on those lanes;
    # EXEC is set back to all ones right after.
    emit(VCmpXEqU32(dst=VCC(),
                    src0=0,
                    src1=vgpr(rotation),
                    comment="lds row id % 2 == 0 ?"))
    emit(VMovB32(dst=vgpr(colIdA),
                 src=vgpr(colIdA),
                 dpp=DPPModifiers(quad_perm=[1,0,3,2]),
                 comment="swap colId pairs for swizzling"))
    emit(SMovB64(dst=EXEC(), src=-1))
    emit(VMovB32(dst=vgpr(colIdB), src=vgpr(colIdA), comment=""))

    module.addComment0("Rotation within a single wave")
    # rotation = blockSize - ((bankRowId >> 1) << 1)
    emit(VLShiftRightB32(dst=vgpr(rotation),
                         shiftHex=hex(1),
                         src=vgpr(bankRowId),
                         comment=""))
    emit(VLShiftLeftB32(dst=vgpr(rotation),
                        shiftHex=hex(1),
                        src=vgpr(rotation),
                        comment="(ldsRowId //2) * 2"))
    emit(VSubU32(dst=vgpr(rotation),
                 src0=hex(blockSize),
                 src1=vgpr(rotation),
                 comment="rotation offset : blockSize - (ldsRowId//2)*2"))

    for info, colId in ((tileInfoA, colIdA), (tileInfoB, colIdB)):
        if info.loadRatioGR == 0.5:
            # No per-wave term for this tensor: apply the intra-wave rotation only.
            emit(VAddU32(dst=vgpr(colId), src0=vgpr(rotation), src1=vgpr(colId), comment=""))
            continue

        module.addComment0("Rotation per wave")
        # waveShift = rotation - ((waveId & 1) << log2(2 * numRowsPerLDSBanks))
        emit(VAndB32(dst=vgpr(waveShift), src0=vgpr(waveId), src1=hex(1), comment=""))
        emit(VLShiftLeftB32(dst=vgpr(waveShift),
                            shiftHex=hex((2*numRowsPerLDSBanks).bit_length() - 1),
                            src=vgpr(waveShift),
                            comment=""))
        emit(VSubU32(dst=vgpr(waveShift), src0=vgpr(rotation), src1=vgpr(waveShift), comment=""))
        emit(VAddU32(dst=vgpr(colId), src0=vgpr(waveShift), src1=vgpr(colId), comment=""))

    # Wrap both ids back into [0, blockSize).
    for colId in (colIdA, colIdB):
        emit(VAndB32(dst=vgpr(colId),
                     src0=vgpr(colId),
                     src1=hex(blockSize-1),
                     comment="(col + offset) % block_size"))

    writer.vgprPool.checkIn(scratch)

##################################################
# Subroutine to generate GR offset calculation code
#
def graTileAssignment(writer, kernel, useSwizzling=True):
    """Build the module that computes the global-read offsets for tiles A and B.

    Args:
        writer: kernel writer (states.a / states.b tile infos, vgprPool).
        kernel: kernel dict; "WavefrontSize" is read here.
        useSwizzling: accepted for interface compatibility; it is not
            referenced in this body.

    Returns:
        Module holding the emitted offset-calculation code.
    """
    module = Module()
    module.addComment0("GR Offset Calculation for Subtile Based Tiling")

    infoA = writer.states.a.tileInfo
    infoB = writer.states.b.tileInfo

    # Input parameters.
    kBytes = infoA.subIterKBytes
    waveLanes = kernel["WavefrontSize"]
    bankRowBytes = 64 * 4  # 64 banks, 4 bytes per bank.

    grWidth = infoA.loadWidthGR  # the A load width is used for both tiles
    assert kBytes % grWidth == 0, "subIterKBytes (%u) must be a multiple of loadWidth (%u)" % (kBytes, grWidth)
    assert kBytes <= bankRowBytes, "Only support subIterKBytes smaller than %u (lds row bank size) for now"%bankRowBytes
    colsPerRow = kBytes // grWidth
    rowsPerBankRow = bankRowBytes // kBytes

    # Seven consecutive scratch VGPRs, released before the subtile offsets are emitted.
    base = writer.vgprPool.checkOut(7)
    colIdA = base
    colIdB, rowId, rowOffA, rowOffB, waveId, laneId = (base + k for k in range(1, 7))

    # waveId = Serial >> log2(wavesize), laneId = Serial & (wavesize - 1)
    module.add(VLShiftRightB32(dst=vgpr(waveId),
                               shiftHex=hex(waveLanes.bit_length()-1),
                               src=vgpr("Serial"),
                               comment="Wave Id"))
    module.add(VAndB32(dst=vgpr(laneId),
                       src0=vgpr("Serial"),
                       src1=waveLanes-1,
                       comment=""))
    # Shared by A and B: column / row id of this lane within its wave.
    module.add(VAndB32(dst=vgpr(colIdA),
                       src0=vgpr("Serial"),
                       src1=(colsPerRow-1),
                       comment="get col_id in wave for %uB load"%grWidth))
    module.add(VLShiftRightB32(dst=vgpr(rowId),
                               shiftHex=hex(colsPerRow.bit_length()-1),
                               src=vgpr(laneId),
                               comment="row id within wave"))

    # Swizzle / rotate the column ids (this also fills colIdB).
    _grSwizzleColIds(module, writer, infoA, infoB, colsPerRow, rowsPerBankRow,
                     laneId, colIdA, colIdB, waveId)

    # Row offsets from the wave partitioning (e.g. 2x2, 4x1/1x4).
    _grComputeRowPartition(module, kernel, writer, infoA, waveId, rowOffA)
    _grComputeRowPartition(module, kernel, writer, infoB, waveId, rowOffB)

    # Per-tile GR offsets.
    _grComputeAllOffsets(module, writer, infoA, colIdA, rowId, rowOffA)
    _grComputeAllOffsets(module, writer, infoB, colIdB, rowId, rowOffB)

    writer.vgprPool.checkIn(base)

    # Per-subtile offsets.
    _grComputeSubtileOffsets(writer, module, infoA)
    _grComputeSubtileOffsets(writer, module, infoB)

    return module

def _getScaleTileInfo(tc, writer, kernel):
    """Return the scale tileInfo for matrix tc, or None when ProblemType["MXBlock<tc>"] is missing or falsy.

    tc == 'A' selects writer.states.mxsa; any other value selects writer.states.mxsb.
    """
    if kernel["ProblemType"].get("MXBlock%s" % tc, 0):
        scaleState = writer.states.mxsa if tc == 'A' else writer.states.mxsb
        return scaleState.tileInfo
    return None

##################################################
# Compute the per-thread global-read (DTL) vaddr for scale tensor tc.
#
# With DTL (buffer_load lds=True) the same vaddr serves as:
#   - global byte offset from the SRD base (where to read from global memory)
#   - LDS byte offset from M0 (where to write in LDS)
#
# Threads within a wave are split into groups of numThreadsPerGroup.
#
# Each group loads one contiguous subtile-column worth of scale bytes:
#
#   groupId  = serial / numThreadsPerGroup   (which scale column)
#   threadId = serial % numThreadsPerGroup   (position within group)
#
#   grOffset = groupId * stride_bpe          (column byte offset via tensor stride)
#            + threadId * loadWidth          (byte offset within column)
#
# Output: sharedVgprGROffset[0] = grOffset  (used as vaddr in DTL load)
#
def _graTileAssignmentScaleSwizzledCommon(tc, writer, kernel):
    """Emit the GR offset computation for one scale tensor.

    Args:
        tc: 'MXSA' selects writer.states.mxsa; any other value selects mxsb.
        writer: kernel writer (scale tile infos, vgprPool, sgprPool).
        kernel: kernel dict (not read in this body).

    Returns:
        Module that leaves
            (serial / numThreadsPerGroup) * (Strides<tc> << log2(bpe))
          + (serial % numThreadsPerGroup) * loadWidth
        in tileInfo.sharedVgprGROffset[0].

    Raises:
        AssertionError: if numThreadsPerGroup is not a positive power of two.
            The division and modulo are emitted as a shift and a mask, which
            would silently produce wrong offsets otherwise.
    """
    module = Module()

    module.addComment("Computing GR Offset for %s"%tc)

    tileInfo = writer.states.mxsa.tileInfo if tc == 'MXSA' else writer.states.mxsb.tileInfo
    loadWidth = tileInfo.loadWidthGR
    loadWidthShift = loadWidth.bit_length() - 1

    # TODO: this logic assumes scales are in block TLU=0 format.
    # Scale groups span 2 M-adjacent subtiles (matching the physical 32-row scale blocks),
    # so multiply subtileSize by 2 to get the actual bytes per group.
    scaleGroupSize = 2 * tileInfo.subtileSize # bytes per scale group (2 subtiles in dim0)
    # number of consecutive threads needed to load all subtiles in contiguous dim
    numThreadsPerGroup = (scaleGroupSize * tileInfo.localSubtileGrid[1]) // loadWidth
    # Checked before any register is checked out so a failure leaks nothing.
    assert numThreadsPerGroup > 0 and (numThreadsPerGroup & (numThreadsPerGroup - 1)) == 0, \
        "%s: numThreadsPerGroup (%u) must be a power of two" % (tc, numThreadsPerGroup)
    groupShift = numThreadsPerGroup.bit_length() - 1

    vtmp = writer.vgprPool.checkOut(1)

    stmp = writer.sgprPool.checkOut(1)

    # groupId = serial / numThreadsPerGroup
    module.add(VLShiftRightB32(dst=vgpr(vtmp),
                shiftHex=hex(groupShift), src=vgpr("Serial"),
                comment="%s: groupId = serial / %d" % (tc, numThreadsPerGroup)))
    # stride in bytes = Strides<tc> << log2(bpe)
    module.add(SLShiftLeftB32(sgpr(stmp), int(math.log2(tileInfo.bpe)), sgpr("Strides%s"%tc), comment="*= bpe (%d)"%(tileInfo.bpe)))

    module.add(VMulLOU32(dst=vgpr(vtmp), src1=vgpr(vtmp), src0=sgpr(stmp), comment="Apply scale%s stride to each group"%tc))
    # threadId = serial % numThreadsPerGroup
    module.add(VAndB32(dst=vgpr(tileInfo.sharedVgprGROffset[0]),
                src0=hex(numThreadsPerGroup - 1), src1=vgpr("Serial"),
                comment="%s: threadId = serial %% %d" % (tc, numThreadsPerGroup)))
    module.add(VLShiftLeftB32(dst=vgpr(tileInfo.sharedVgprGROffset[0]),
                shiftHex=hex(loadWidthShift), src=vgpr(tileInfo.sharedVgprGROffset[0]),
                comment="Scale by load width for each thread in group"))
    module.add(VAddU32(dst=vgpr(tileInfo.sharedVgprGROffset[0]), src0=vgpr(tileInfo.sharedVgprGROffset[0]), src1=vgpr(vtmp), comment="Final offset calc"))
    writer.vgprPool.checkIn(vtmp)
    writer.sgprPool.checkIn(stmp)

    return module

##################################################
# Generate GR offset calculation for scaleA/B (DTL).
#
# With DTL, vaddr serves as both the global read offset (from SRD)
# and the LDS write offset (from M0). The per-thread offset is
#   grOffset = groupId * stride * bpe + threadId * scaleLoadWidth
# as computed by _graTileAssignmentScaleSwizzledCommon.
#
def graTileAssignmentScaleSwizzled(writer, kernel):
    """Emit the scale GR offsets for MXSA and MXSB.

    When neither ProblemType["MXBlockA"] nor ProblemType["MXBlockB"] is set,
    the returned module contains only a "skipped" comment.
    """
    module = Module()

    if not kernel["ProblemType"].get("MXBlockA", 0) and not kernel["ProblemType"].get("MXBlockB", 0):
        module.addComment0("Scale GR tile assignment: skipped (no MX block scaling)")
        return module

    # Both scale tensors go through the same group/thread offset computation.
    module.add(_graTileAssignmentScaleSwizzledCommon('MXSA', writer, kernel))
    module.add(_graTileAssignmentScaleSwizzledCommon('MXSB', writer, kernel))

    return module


##################################################
# Apply wave partition offset for scale LR.
#
# Each wave reads from its assigned LDS partition for scale A or B.
#
# MXSA: partition index = waveId % MIWaveGroup[0]  (M-direction wave index)
# MXSB: partition index = waveId / MIWaveGroup[0]  (N-direction wave index)
#   Using MIWaveGroup[0] (not [1]) correctly handles asymmetric configs
#   (e.g. 4x1: all 4 M-waves share the same N partition → index = 0).
#
# Output: sharedVgprLROffset[0] = partitionIndex * totalScaleBytes
#
def _applyScaleWavePartitionLROffset(module, writer, kernel, tileInfo, waveId):
    """Emit sharedVgprLROffset[0] = partitionIndex * totalScaleBytes for one scale tensor.

    Args:
        module: Module that receives the emitted instructions.
        writer: kernel writer (vgprPool / sgprPool for one scratch register each).
        kernel: kernel dict; "MIWaveGroup" is read.
        tileInfo: scale tile info; tileInfo.tc == 'MXSA' selects the M-direction
            index, any other value the N-direction index.
        waveId: VGPR index holding the wave id.
    """
    tc = tileInfo.tc

    # Partition stride is based on actual scale data size, not GR load capacity.
    # Mirrors data tile partition (_applyWavePartitionLROffset) which uses
    # MT * depthUBytes // numPartitions.
    # TODO: Calculate num of rows in subtile instead of hardcoding
    scaleSubtileBytes = tileInfo.subtileSize * tileInfo.bpe
    # Note MMATile format is always [NonK dim, K dim]
    MT = tileInfo.globalMMATileGrid[0] // tileInfo.subtileShape[0]
    index = 0 if tc == 'MXSA' else 1
    totalScaleBytes = (MT // kernel["MIWaveGroup"][index]) * (tileInfo.localSubtileGrid[1]) * scaleSubtileBytes
    numWavesM = kernel["MIWaveGroup"][0]

    tmpSgpr = writer.sgprPool.checkOut(1)
    # Only one scratch VGPR is ever referenced, so only one is checked out.
    tmp = writer.vgprPool.checkOut(1)

    if tc == 'MXSA':
        # M-direction wave index; the mask form relies on numWavesM being a power of two.
        module.add(VAndB32(dst=vgpr(tmp), src0=numWavesM-1, src1=vgpr(waveId), comment="scale%s: waveId %% %u"%(tc, numWavesM)))
    else:
        # N-direction wave index = waveId / numWavesInM (MIWaveGroup[0])
        # Using MIWaveGroup[0] (not [1]) correctly handles asymmetric configs like 4x1
        # where log2(MIWaveGroup[1])=0 would give waveId unchanged instead of waveId/4.
        module.add(VLShiftRightB32(dst=vgpr(tmp), shiftHex=int(math.log2(numWavesM)), src=vgpr(waveId), comment="scale%s: waveId / numWavesM"%tc))

    module.add(SMovB32(dst=sgpr(tmpSgpr), src=totalScaleBytes, comment="scale%s: scale region"%tc))
    module.add(VMulLOU32(dst=vgpr(tileInfo.sharedVgprLROffset[0]), src0=sgpr(tmpSgpr), src1=vgpr(tmp), comment="scale%s: partition offset"%tc))

    writer.vgprPool.checkIn(tmp)
    writer.sgprPool.checkIn(tmpSgpr)


##################################################
# Generate LR offset calculation for scaleA/B.
#
# Computes the per-lane LDS read offset for scale tensors. Called once
# during kernel setup; the resulting VGPRs are used every loop iteration.
#
# Final LR offset per lane:
#   lrOffset[lane] = wavePartitionOffset + laneId * 4 + ldsStartOffset
#
# where:
#   wavePartitionOffset = partitionIndex * totalScaleBytes
#     MXSA partitionIndex = waveId % MIWaveGroup[0]  (M-direction)
#     MXSB partitionIndex = waveId / MIWaveGroup[0]  (N-direction)
#   laneId = serial & (wavesize - 1)
#   ldsStartOffset = writer.ldsStartOffsetMXSA/B
#
# LDS layout (double-buffered, one buffer shown):
#   [ DataA | DataB | ScaleA | ScaleB ]
#   ScaleA starts at ldsStartOffsetMXSA, ScaleB at ldsStartOffsetMXSB.
#
# After the LR offset is fully computed, the double-buffer swap VGPR is
# initialised here (not in localReadDTLInitCommonSwapVgpr, which runs
# before this function and would use uninitialised values):
#   swapVgpr = lrOffset XOR (lrOffset + ldsTotalSize)
# This lets localReadLDSBufferSwap toggle between buffer 0 and buffer 1.
#
def lraTileAssignmentScaleSwizzled(writer, kernel):
    """Emit the scale LR offsets and their buffer-swap masks.

    Returns a module containing only a "skipped" comment when neither
    ProblemType["MXBlockA"] nor ProblemType["MXBlockB"] is set.
    """
    module = Module()

    problemType = kernel["ProblemType"]
    if not problemType.get("MXBlockA", 0) and not problemType.get("MXBlockB", 0):
        module.addComment0("Scale LR tile assignment: skipped (no MX block scaling)")
        return module

    scaleA = writer.states.mxsa.tileInfo
    scaleB = writer.states.mxsb.tileInfo

    module.addComment0("LR Offset Calculation for Scale Tensors")

    waveLanes = kernel["WavefrontSize"]

    # Step 1: wave partition offset -> sharedVgprLROffset[0] of each tensor.
    waveId = writer.vgprPool.checkOut(1)
    module.add(VLShiftRightB32(dst=vgpr(waveId),
                               shiftHex=hex(waveLanes.bit_length()-1),
                               src=vgpr("Serial"),
                               comment="scale: waveId"))
    for scaleInfo in (scaleA, scaleB):
        _applyScaleWavePartitionLROffset(module, writer, kernel, scaleInfo, waveId)
    writer.vgprPool.checkIn(waveId)

    # Step 2: add the per-lane term, (serial & (wavesize-1)) << 2.
    laneBytes = writer.vgprPool.checkOut(1)
    module.add(VAndB32(dst=vgpr(laneBytes),
                       src0=vgpr("Serial"),
                       src1=waveLanes-1,
                       comment="scale: laneId"))
    module.add(VLShiftLeftB32(dst=vgpr(laneBytes),
                              shiftHex=hex(2),
                              src=vgpr(laneBytes),
                              comment="scale: laneId * 4"))
    for side, scaleInfo in (("A", scaleA), ("B", scaleB)):
        lrVgpr = scaleInfo.sharedVgprLROffset[0]
        module.add(VAddU32(dst=vgpr(lrVgpr),
                           src0=vgpr(laneBytes),
                           src1=vgpr(lrVgpr),
                           comment="scale%s: lrOffset = laneId * 4" % side))
    writer.vgprPool.checkIn(laneBytes)

    # Step 3: add each tensor's LDS start offset (scale A follows data A+B in LDS).
    sTmp = writer.sgprPool.checkOut(1)
    for side, scaleInfo, ldsStart in (("A", scaleA, writer.ldsStartOffsetMXSA),
                                      ("B", scaleB, writer.ldsStartOffsetMXSB)):
        lrVgpr = scaleInfo.sharedVgprLROffset[0]
        module.add(SMovB32(dst=sgpr(sTmp),
                           src=hex(ldsStart),
                           comment="scale: LDS offset for %s scale" % side))
        module.add(VAddU32(dst=vgpr(lrVgpr),
                           src0=vgpr(lrVgpr),
                           src1=sgpr(sTmp),
                           comment="scale%s: +=LDS offset" % side))

    # Step 4: swap masks, swap = lr XOR (lr + ldsTotalSize). This has to run
    # after the LR offsets above are final.
    module.add(SMovB32(dst=sgpr(sTmp), src=writer.ldsTotalSize, comment="scale: total LDS size for swap"))
    for scaleInfo in (scaleA, scaleB):
        for slot, lrVgpr in enumerate(scaleInfo.sharedVgprLROffset):
            swapVgpr = scaleInfo.sharedVgprLROffsetSwap[slot]
            swapNote = "scale%s: LR swap"%scaleInfo.tc
            module.add(VAddU32(dst=vgpr(swapVgpr), src0=vgpr(lrVgpr), src1=sgpr(sTmp), comment=swapNote))
            module.add(VXorB32(dst=vgpr(swapVgpr), src0=vgpr(lrVgpr), src1=vgpr(swapVgpr), comment=swapNote))

    writer.sgprPool.checkIn(sTmp)

    return module

##################################################
# Scale GR: Load scale bytes from global memory directly to LDS (DTL).
#
# Uses BufferLoadB128 with lds=True. M0 is set to scaleLdsBase, and
# sharedVgprGROffset[0] = serial * scaleLoadWidth serves as both the
# global read offset (from SRD) and the LDS write offset (from M0).
def globalReadDoScaleSubtile(tc, writer, kernel):
    """Emit the M0 setup and one lds=True BufferLoadB128 for scale tensor tc.

    Returns an empty module when neither MXBlockA nor MXBlockB is set.
    tc == 'MXSA' selects writer.states.mxsa; any other value selects mxsb.
    """
    module = Module()

    problemType = kernel["ProblemType"]
    if not problemType.get("MXBlockA", 0) and not problemType.get("MXBlockB", 0):
        return module

    if tc == 'MXSA':
        tileInfo = writer.states.mxsa.tileInfo
    else:
        tileInfo = writer.states.mxsb.tileInfo

    # NonTemporal<tc> bit 0 -> glc, bit 1 -> slc, bit 2 -> nt.
    ntBits = kernel["NonTemporal%s"%tc]
    cacheFlags = dict(glc=bool(ntBits & 0x1), slc=bool(ntBits & 0x2), nt=bool(ntBits & 0x4))

    assert len(tileInfo.sharedVgprGROffset) > 0, "Scale GR requires at least 1 GR offset VGPR"

    module.addComment0("Scale GR: %s (DTL: BufferLoadB128 -> LDS)" % tc)

    # M0 carries the LDS destination base for the direct-to-LDS load.
    module.add(SMovB32(dst=mgpr(0),
                       src=sgpr("LocalWriteBaseAddr%s"%tc),
                       comment="scale%s: M0 = scaleLdsBase" % tc))

    # No destination VGPR: dst is None and the load is issued with lds=True.
    module.add(BufferLoadB128(dst=None,
                              vaddr=vgpr(tileInfo.sharedVgprGROffset[0]),
                              saddr=sgpr("Srd%s" % tc, 4),
                              soffset=0,
                              mubuf=MUBUFModifiers(offen=True, offset12=0, lds=True, **cacheFlags),
                              comment="scale%s: DTL b128 load" % tc))

    return module

##################################################
# Scale LR: Read scale data from LDS into scale VGPRs (DSLoadB32).
#
# Each lane reads 4 bytes from LDS using ds_read_b32. The base address
# is sharedVgprLROffset[0] (computed by lraTileAssignmentScaleSwizzled).
# MMA tile and subtile selection is done via constant ds_offset at emit time.
#
# Each 32-bit VGPR holds 4 E8M0 scale bytes; opsel/opsel_hi selects
# the correct byte per MFMA invocation.
#
def emitSubtileScaleDsRead(tc, writer, kernel, scaleGroupIdx):
    """Emit one DSLoadB32 for scale group scaleGroupIdx of tensor tc.

    The load reads from sharedVgprLROffset[0] with a constant ds offset of
    2 * subtileSize * scaleGroupIdx and writes the first register of
    vgprTiles[4 * scaleGroupIdx].  Returns an empty module when
    tileInfo.mxBlock == 0.
    """
    module = Module()
    scaleState = writer.states.mxsa if tc == 'MXSA' else writer.states.mxsb
    tileInfo = scaleState.tileInfo

    if tileInfo.mxBlock != 0:
        # One scale group spans two M-adjacent subtiles, hence the factor 2.
        byteOffset = (2 * tileInfo.subtileSize) * scaleGroupIdx
        dstReg = tileInfo.vgprTiles[4 * scaleGroupIdx].regList.regValues[0]
        module.add(DSLoadB32(dst=vgpr(dstReg),
                             src=vgpr(tileInfo.sharedVgprLROffset[0]),
                             ds=DSModifiers(offset=byteOffset),
                             comment="scale%s[group%u]: load 4B from LDS" % (tc, scaleGroupIdx)))

    return module

def localReadDoScaleSubtile(tc, writer, kernel):
    """Emit the scale ds_reads for every scale group of tensor tc (PGR=0 path).

    Returns an empty module when neither MXBlockA nor MXBlockB is set.
    """
    module = Module()

    problemType = kernel["ProblemType"]
    if not problemType.get("MXBlockA", 0) and not problemType.get("MXBlockB", 0):
        return module

    scaleState = writer.states.mxsa if tc == 'MXSA' else writer.states.mxsb
    grid = scaleState.tileInfo.localSubtileGrid

    # One read per pair of M-adjacent subtiles, for every subtile along dim 1.
    groupCount = math.ceil(grid[0] / 2) * grid[1]
    groupIdx = 0
    while groupIdx < groupCount:
        module.add(emitSubtileScaleDsRead(tc, writer, kernel, groupIdx))
        groupIdx += 1

    return module

##################################################
# Scale SRD pointer update: advance scale SRD by scaleDepthU * scaleBpe bytes.
#
def globalReadScalePtrUpdates(tc, writer, kernel):
    """Emit the 64-bit add that advances Srd<tc> by one iteration's scale bytes.

    The increment is 2 * subtileSize * localSubtileGrid[1].  Returns an empty
    module when tileInfo.mxBlock == 0.
    """
    module = Module()
    scaleState = writer.states.mxsa if tc == 'MXSA' else writer.states.mxsb
    tileInfo = scaleState.tileInfo

    if tileInfo.mxBlock == 0:
        return module

    stepBytes = 2 * tileInfo.subtileSize * tileInfo.localSubtileGrid[1]
    srdLo = "Srd%s" % tc
    srdHi = "Srd%s+1" % tc
    module.addComment0("Scale SRD update: %s += %u" % (tc, stepBytes))
    # Low dword add followed by carry propagation into the high dword.
    module.add(SAddU32(dst=sgpr(srdLo), src0=sgpr(srdLo), src1=stepBytes))
    module.add(SAddCU32(dst=sgpr(srdHi), src0=sgpr(srdHi), src1=0))

    # Srd+2 is left alone: a fixed value is used for it, so nothing is decremented.

    return module

##################################################
# Subroutine to generate GR load code
#
def emitSingleBufferLoad(tileInfo, kernel, sId0, sId1):
    """Emit the direct-to-LDS buffer loads for local subtile (sId0, sId1).

    With loadRatioGR > 1 several local subtiles are served by one global read;
    the load is emitted only for the subtile whose linear id equals
    int(grBaseId * loadRatioGR), and an empty module is returned for the rest.

    Args:
        tileInfo: TileInfo of the tensor component (A or B).
        kernel: kernel dict; "NonTemporal<tc>" is read for the cache bits.
        sId0: subtile row index.
        sId1: subtile column index (K-dimension).

    Returns:
        Module with numGRPerSubtile (M0 setup, BufferLoadB128) pairs, or an
        empty module when this subtile does not own the load.
    """
    module = Module()

    linearId = tileInfo.getLocalSubtileLinearId(sId0, sId1)
    subtileInfo = tileInfo.localSubtiles[linearId]
    grBaseId = subtileInfo.globalReadMap[0]

    # TODO: Still needed for PGR=0 path but not needed by scheduler
    # Several subtiles share one global read when the ratio is above 1; only
    # the first subtile of each group issues it.
    if tileInfo.loadRatioGR > 1 and linearId != int(grBaseId * tileInfo.loadRatioGR):
        return module

    tc = tileInfo.tc
    # NonTemporal<tc> bit 0 -> glc, bit 1 -> slc, bit 2 -> nt.
    ntBits = kernel["NonTemporal%s"%tc]
    isGlc = bool(ntBits & 0x1)
    isSlc = bool(ntBits & 0x2)
    isNT = bool(ntBits & 0x4)

    regList = tileInfo.localSubtilesRegister[subtileInfo.regListId]

    # Byte offset along K; carried in offset12 and subtracted from M0 below.
    offsetK = sId1 * int(tileInfo.mmaTileShape[1] * tileInfo.subtileShape[1] * tileInfo.bpe)

    subtileOffset = math.ceil(tileInfo.loadRatioGR*tileInfo.subtileSize)
    ldsBaseName = "LocalWriteBaseAddr%s"%tc

    # One (M0 setup, buffer load) pair per global read needed by the subtile.
    for i in range(tileInfo.numGRPerSubtile):
        m0Offset = i * subtileOffset + (sId0 + sId1 * tileInfo.globalSubtileGrid[0]) * tileInfo.subtileSize
        module.add(SAddU32(dst=mgpr(0), src0=sgpr(ldsBaseName), src1=(m0Offset - offsetK)))
        mubuf = MUBUFModifiers(offen=True, offset12=offsetK, glc=isGlc, slc=isSlc, nt=isNT, lds=True)

        # Subtile-specific registers are either SGPRs or VGPRs:
        #  - SGPR: keep the shared VGPR offset and pass the SGPR through soffset.
        #  - VGPR: the subtile's own VGPR replaces the shared one and soffset is 0.
        # With no subtile registers at all, the shared VGPR is used with soffset 0.
        useSgpr = subtileInfo.useSgpr
        if len(regList) > 0 and useSgpr:
            soffset = sgpr(regList.regValues[0])
        else:
            soffset = 0
        if useSgpr or len(regList) == 0:
            voff = tileInfo.sharedVgprGROffset[i]
        else:
            voff = regList.regValues[i]
        module.add(BufferLoadB128(dst=None,
                                  vaddr=vgpr(voff),
                                  saddr=sgpr("Srd%s"%tc, 4),
                                  soffset=soffset,
                                  mubuf=mubuf,
                                  comment="grBaseId = %u, i= %u"%(grBaseId , i)))

    return module


def emitSubtileBufferLoad(tc, writer, kernel, subtileId):
    """Resolve the A/B tileInfo from tc and emit the loads for subtileId = [sId0, sId1]."""
    if tc == 'A':
        tileInfo = writer.states.a.tileInfo
    else:
        tileInfo = writer.states.b.tileInfo
    sId0, sId1 = subtileId[0], subtileId[1]
    return emitSingleBufferLoad(tileInfo, kernel, sId0, sId1)

##################################################
# Subroutine to generate GR load code
# Initial idea: maybe store asm in modules in a separate obj?
#
def globalReadDoSubtile(tc, writer, kernel):
    """Emit the buffer loads of every local subtile of tensor tc (dim 1 outer, dim 0 inner)."""
    module = Module()

    dataState = writer.states.a if tc == 'A' else writer.states.b
    rows, cols = dataState.tileInfo.localSubtileGrid[0], dataState.tileInfo.localSubtileGrid[1]

    for col in range(cols):
        for row in range(rows):
            module.addComment0("Emit load for %s subtile: [%u, %u]"%(tc, row, col))
            module.add(emitSubtileBufferLoad(tc, writer, kernel, [row, col]))

    return module

def emitSingleDsRead(tileInfo, sId0, sId1, subIterK, dstTile):
    """Build one DSLoadB128 for a single MMA tile of subtile (sId0, sId1).

    Args:
        tileInfo: TileInfo (subtileSize, globalSubtileGrid, sharedVgprLROffset, tc).
        sId0: subtile row index; contributes sId0 * subtileSize to the ds offset.
        sId1: subtile column index; contributes
            sId1 * globalSubtileGrid[0] * subtileSize to the ds offset.
        subIterK: subIterK index inside the subtile; selects the address VGPR.
        dstTile: RegisterTileInfo whose registers receive the loaded data.

    Returns:
        The DSLoadB128 instruction (it is not added to any module here).
    """
    # The second index is fixed at 0 (the original note gives subtileShape[0]=1
    # as the reason), so subIterK alone selects the LR address VGPR.
    addrVgpr = tileInfo.sharedVgprLROffset[tileInfo.getSubtileShapeLinearId(subIterK, 0)]

    stride = tileInfo.subtileSize
    ldsOffset = sId0 * stride + sId1 * tileInfo.globalSubtileGrid[0] * stride

    dstRegs = dstTile.regList.regValues
    return DSLoadB128(
        dst=vgpr(dstRegs[0], len(dstRegs)),
        src=vgpr(addrVgpr),
        ds=DSModifiers(offset=ldsOffset),
        comment="Subtile%s[%u, %u] subIterK=%u" % (tileInfo.tc, sId0, sId1, subIterK))


def emitSubtileDsRead(writer, kernel, tileInfo, subtileId):
    """Emit one DSLoadB128 per subtileShape[1] step of subtile subtileId = [sId0, sId1].

    writer and kernel are accepted but not read in this body.
    """
    module = Module()
    sId0, sId1 = subtileId[0], subtileId[1]

    subtileInfo = tileInfo.localSubtiles[tileInfo.getLocalSubtileLinearId(sId0, sId1)]

    for du in range(tileInfo.subtileShape[1]):
        # localReadMap maps the MMA-tile id to the index of its destination vgpr tile.
        tileIdx = subtileInfo.localReadMap[tileInfo.getSubtileShapeLinearId(du, 0)]
        module.add(emitSingleDsRead(tileInfo, sId0, sId1, du, tileInfo.vgprTiles[tileIdx]))

    return module

##################################################
# Subroutine to generate LR load code
# Initial idea: maybe store asm in modules in a separate obj?
+# +def localReadDoSubtile(tc, writer, kernel): + module = Module() + + tileInfo = writer.states.a.tileInfo if tc == 'A' else writer.states.b.tileInfo + + for i in range(tileInfo.localSubtileGrid[0]): + for j in range(tileInfo.localSubtileGrid[1]): + module.add(emitSubtileDsRead(writer, kernel, tileInfo, [i, j])) + + return module + +################################################## +# Subroutine to generate DTL M0 LDS buffer swap +# +def globalReadDTLInitCommonSgpr(writer, kernel): + module = Module() + + wavesize = kernel["WavefrontSize"] + vgprWaveId = writer.vgprPool.checkOut(1) + module.addComment0("Compute shared offsets used by m0 in DTL loads") + module.add(VLShiftRightB32(dst=vgpr(vgprWaveId), shiftHex=hex(wavesize.bit_length()-1), src=vgpr("Serial"), comment="Wave Id")) + + atile = writer.states.a.tileInfo + btile = writer.states.b.tileInfo + + tmpVgpr = writer.vgprPool.checkOut(2) + rowOffsetA = tmpVgpr + rowOffsetB = tmpVgpr + 1 + + _grComputeRowPartition(module, kernel, writer, atile, vgprWaveId, rowOffsetA) + _grComputeRowPartition(module, kernel, writer, btile, vgprWaveId, rowOffsetB) + + subIterKBytes = atile.subIterKBytes + + module.add(VLShiftLeftB32(dst=vgpr(rowOffsetA), shiftHex=hex((subIterKBytes).bit_length()-1), src=vgpr(rowOffsetA), comment="Apply wave-specific offset for A")) + module.add(VLShiftLeftB32(dst=vgpr(rowOffsetB), shiftHex=hex((subIterKBytes).bit_length()-1), src=vgpr(rowOffsetB), comment="Apply wave-specific offset for B")) + + module.add(SNop(waitState=0, comment="Wait for VGPR to be ready")) + module.add(VReadfirstlaneB32(dst=sgpr("LocalWriteBaseAddrA"), src=vgpr(rowOffsetA), comment="Store base LDS offset, will be modified")) + module.add(VReadfirstlaneB32(dst=sgpr("LocalWriteBaseAddrB"), src=vgpr(rowOffsetB), comment="Store base LDS offset, will be modified")) + module.add(SAddU32(dst=sgpr("LocalWriteBaseAddrB"), src0=sgpr("LocalWriteBaseAddrB"), src1=hex(writer.ldsStartOffsetB), comment="")) + + 
module.add(SAddU32(dst=sgpr("SwapA"), src0=sgpr("LocalWriteBaseAddrA"), src1=writer.ldsTotalSize, comment="")) + module.add(SXorB32(dst=sgpr("SwapA"), src0=sgpr("LocalWriteBaseAddrA"), src1=sgpr("SwapA"), comment="")) + module.add(SAddU32(dst=sgpr("SwapB"), src0=sgpr("LocalWriteBaseAddrB"), src1=writer.ldsTotalSize, comment="")) + module.add(SXorB32(dst=sgpr("SwapB"), src0=sgpr("LocalWriteBaseAddrB"), src1=sgpr("SwapB"), comment="")) + + writer.vgprPool.checkIn(vgprWaveId) + writer.vgprPool.checkIn(tmpVgpr) + + + return module + +################################################## +# Subroutine to generate DTL M0 LDS buffer swap +# +# For Swizzled Scales each wave will collectively stream +# the scale values +# +def globalReadScaleSwizzledDTLInitCommonSgpr(writer, kernel): + module = Module() + + + wavesize = kernel["WavefrontSize"] + vgprWaveId = writer.vgprPool.checkOut(1) + module.addComment0("Compute shared offsets used by m0 in DTL loads") + module.add(VLShiftRightB32(dst=vgpr(vgprWaveId), shiftHex=hex(wavesize.bit_length()-1), src=vgpr("Serial"), comment="Wave Id")) + + mxsatile = writer.states.mxsa.tileInfo + mxsbtile = writer.states.mxsb.tileInfo + + loadWidth = mxsatile.loadWidthGR # Assumes load width for scaleA/B are the same + + bytesPerLoad = loadWidth * wavesize + module.add(VLShiftLeftB32(dst=vgpr(vgprWaveId), shiftHex=hex((bytesPerLoad).bit_length()-1), src=vgpr(vgprWaveId), comment="Apply wave-specific common offset (%u) for A/B"%bytesPerLoad)) + + module.add(SNop(waitState=0, comment="Wait for VGPR to be ready")) + module.add(VReadfirstlaneB32(dst=sgpr("LocalWriteBaseAddrMXSA"), src=vgpr(vgprWaveId), comment="Store base LDS offset, will be modified")) + module.add(VReadfirstlaneB32(dst=sgpr("LocalWriteBaseAddrMXSB"), src=vgpr(vgprWaveId), comment="Store base LDS offset, will be modified")) + module.add(SAddU32(dst=sgpr("LocalWriteBaseAddrMXSA"), src0=sgpr("LocalWriteBaseAddrMXSA"), src1=hex(writer.ldsStartOffsetMXSA), comment="")) + 
module.add(SAddU32(dst=sgpr("LocalWriteBaseAddrMXSB"), src0=sgpr("LocalWriteBaseAddrMXSB"), src1=hex(writer.ldsStartOffsetMXSB), comment="")) + + module.add(SAddU32(dst=sgpr("SwapMXSA"), src0=sgpr("LocalWriteBaseAddrMXSA"), src1=writer.ldsTotalSize, comment="")) + module.add(SXorB32(dst=sgpr("SwapMXSA"), src0=sgpr("LocalWriteBaseAddrMXSA"), src1=sgpr("SwapMXSA"), comment="")) + module.add(SAddU32(dst=sgpr("SwapMXSB"), src0=sgpr("LocalWriteBaseAddrMXSB"), src1=writer.ldsTotalSize, comment="")) + module.add(SXorB32(dst=sgpr("SwapMXSB"), src0=sgpr("LocalWriteBaseAddrMXSB"), src1=sgpr("SwapMXSB"), comment="")) + + writer.vgprPool.checkIn(vgprWaveId) + return module + + + + +def localReadDTLInitCommonSwapVgpr(writer, kernel): + module = Module() + + atile = writer.states.a.tileInfo + btile = writer.states.b.tileInfo + + stmp = writer.sgprPool.checkOut(1) + module.add(SMovB32(dst=sgpr(stmp), src=writer.ldsTotalSize, comment="Store Total Lds Size for one buffer")) + for i in range(len(atile.sharedVgprLROffset)): + vgprId = atile.sharedVgprLROffset[i] + vgprSwapId = atile.sharedVgprLROffsetSwap[i] + module.add(VAddU32(dst=vgpr(vgprSwapId), src0=vgpr(vgprId), src1=sgpr(stmp), comment="")) + module.add(VXorB32(dst=vgpr(vgprSwapId), src0=vgpr(vgprId), src1=vgpr(vgprSwapId), comment="")) + + for i in range(len(btile.sharedVgprLROffset)): + vgprId = btile.sharedVgprLROffset[i] + vgprSwapId = btile.sharedVgprLROffsetSwap[i] + module.add(VAddU32(dst=vgpr(vgprSwapId), src0=vgpr(vgprId), src1=sgpr(stmp), comment="")) + module.add(VXorB32(dst=vgpr(vgprSwapId), src0=vgpr(vgprId), src1=vgpr(vgprSwapId), comment="")) + + writer.sgprPool.checkIn(stmp) + return module + + + +################################################## +# Subroutine to generate DTL M0 LDS buffer swap +# +def globalReadLDSBufferSwap(tc, writer, kernel): + module = Module() + module.addComment0("Emit code to swap %s GR m0 offsets"%tc) + module.add(SXorB32(dst=sgpr("LocalWriteBaseAddr%s"%tc), 
src0=sgpr("LocalWriteBaseAddr%s"%tc), src1=sgpr("Swap%s"%tc), comment="")) + return module + +################################################## +# Subroutine to generate DTL M0 LDS buffer swap +# +def localReadLDSBufferSwap(tc, writer, kernel): + module = Module() + + if tc in ['A', 'B']: + tile = writer.states.a.tileInfo if tc == 'A' else writer.states.b.tileInfo + else: + tile = writer.states.mxsa.tileInfo if tc == 'MXSA' else writer.states.mxsb.tileInfo + + module.addComment0("Emit code to swap %s LR vgpr offsets"%tc) + + for i in range(len(tile.sharedVgprLROffset)): + vgprId = tile.sharedVgprLROffset[i] + vgprSwapId = tile.sharedVgprLROffsetSwap[i] + module.add(VXorB32(dst=vgpr(vgprId), src0=vgpr(vgprId), src1=vgpr(vgprSwapId), comment="")) + + return module + +################################################## +# Subroutine to update ptrs +# +def globalReadPtrUpdates(tc, writer, kernel): + module = Module() + tileInfo = writer.states.a.tileInfo if tc == 'A' else writer.states.b.tileInfo + inc = int(tileInfo.localSubtileGrid[1] * tileInfo.mmaTileShape[1] * tileInfo.subtileShape[1] * tileInfo.bpe) + module.add(SAddU32(dst=sgpr("Srd%s"%tc), src0=sgpr("Srd%s"%tc), src1=inc)) + module.add(SAddCU32(dst=sgpr("Srd%s+1"%tc), src0=sgpr("Srd%s+1"%tc), src1=0)) + + return module + +################################################## +# Subroutine to generate MMA Instruction +# Given RegisterTileInfo inputs for A,B,C,D operands +# emit corresponding mfma instruction +# +def emitMfmaInstruction(writer, kernel, vgprTileA, vgprTileB, vgprTileC, vgprTileD, scaleAVgpr=-1, scaleBVgpr=-1, scaleAsel=-1, scaleBsel=-1, comment = ""): + module = Module() + + vgprAStart = vgprTileA.regList.regValues[0] + vgprBStart = vgprTileB.regList.regValues[0] + vgprCStart = vgprTileC.regList.regValues[0] + vgprDStart = vgprTileD.regList.regValues[0] + + opASize = len(vgprTileA.regList.regValues) + opBSize = len(vgprTileB.regList.regValues) + opCSize = len(vgprTileC.regList.regValues) + opDSize = 
len(vgprTileD.regList.regValues) + + # For subtile kernels with agpr overflow, D/C tiles that spilled to the vgpr + # pool must use vgpr() in the MFMA operands, not accvgpr(). + dIsVgpr = (vgprTileD.regList.regPool == writer.vgprPool) + cIsVgpr = (vgprTileC.regList.regPool == writer.vgprPool) + dAccAlias = vgpr if (dIsVgpr or kernel["MIArchVgpr"]) else accvgpr + cAccAlias = vgpr if (cIsVgpr or kernel["MIArchVgpr"]) else accvgpr + + aOperand = vgpr(vgprBStart,opBSize) if kernel["SourceSwap"] else vgpr(vgprAStart,opASize) + bOperand = vgpr(vgprAStart,opASize) if kernel["SourceSwap"] else vgpr(vgprBStart,opBSize) + + miK = kernel["MatrixInstK"] + + if miK == 128: + # MX FP4: 16x16x128 + if scaleAVgpr >= 0 and scaleBVgpr >= 0: + # Use actual loaded scale VGPRs + module.add(MXMFMAInstruction(instType=InstType.INST_F4, accType=InstType.INST_F32, variant=[16,16,miK,1], \ + acc=dAccAlias(vgprDStart,opDSize), \ + a=aOperand, \ + b=bOperand, \ + acc2=cAccAlias(vgprCStart,opCSize), \ + mxsa=vgpr(scaleAVgpr), mxsb=vgpr(scaleBVgpr), \ + vop3=VOP3PModifiers(op_sel=[scaleAsel%2, scaleBsel%2], op_sel_hi=[(scaleAsel>>1)%2, (scaleBsel>>1)%2]), \ + comment=comment)) + else: + # Fallback: hardcoded scale 0x7f (scale=1.0 for all elements) + tmpVgprScale = writer.vgprPool.checkOut(1) + module.add(VMovB32(dst=vgpr(tmpVgprScale), src=hex(0x7f7f7f7f), comment="hardcoded scale 0x7f (E8M0)")) + module.add(MXMFMAInstruction(instType=InstType.INST_F4, accType=InstType.INST_F32, variant=[16,16,miK,1], \ + acc=dAccAlias(vgprDStart,opDSize), \ + a=aOperand, \ + b=bOperand, \ + acc2=cAccAlias(vgprCStart,opCSize), \ + mxsa=vgpr(tmpVgprScale), mxsb=vgpr(tmpVgprScale), \ + comment=comment)) + writer.vgprPool.checkIn(tmpVgprScale) + else: + # BF16: 16x16x32 + module.add(MFMAInstruction(instType=InstType.INST_BF16, accType=InstType.INST_F32, variant=[16,16,miK,1], mfma1k=False, \ + acc=dAccAlias(vgprDStart,opDSize), \ + a=aOperand, \ + b=bOperand, \ + acc2=cAccAlias(vgprCStart,opCSize), \ + 
comment=comment)) + + return module + + +################################################## +# Subroutine to generate MMA code +# Initial idea: maybe store asm in modules in a separate obj? +# +def emitMfmaCode(writer, kernel): + module = Module() + + atileInfo = writer.states.a.tileInfo + btileInfo = writer.states.b.tileInfo + dtileInfo = writer.states.d.tileInfo + + mxsatileInfo = writer.states.mxsa.tileInfo if kernel["ProblemType"].get("MXBlockA", 0) > 0 else None + mxsbtileInfo = writer.states.mxsb.tileInfo if kernel["ProblemType"].get("MXBlockB", 0) > 0 else None + + # Use loaded scale VGPRs when MX block scaling is active. + # Note: scaleVgprTiles is only populated by the scheduler path; + # in the non-scheduler path we use vgprTiles (populated by localReadDoScaleSubtile). + hasScaleA = mxsatileInfo is not None and mxsatileInfo.mxBlock > 0 + hasScaleB = mxsbtileInfo is not None and mxsbtileInfo.mxBlock > 0 + + for mmak in range(atileInfo.localMMATileGrid[1]): + for mma1 in range(btileInfo.localMMATileGrid[0]): + for mma0 in range(atileInfo.localMMATileGrid[0]): + + aSId0, aSId1 = atileInfo.getLocalSubtileIdFromMMATile(mma0, mmak) + bSId0, bSId1 = btileInfo.getLocalSubtileIdFromMMATile(mma1, mmak) + _mma0 = mma0 % atileInfo.subtileShape[0] + _mma1 = mma1 % btileInfo.subtileShape[0] + _mmak = mmak % atileInfo.subtileShape[1] + + numMmaTilePerSubtileA = atileInfo.subtileShape[0] * atileInfo.subtileShape[1] + numMmaTilePerSubtileB = btileInfo.subtileShape[0] * btileInfo.subtileShape[1] + + # TODO: Fix mma index calc for larger subtile shapes + atileId = atileInfo.getLocalSubtileLinearId(aSId0, aSId1) * numMmaTilePerSubtileA + (_mmak) + btileId = btileInfo.getLocalSubtileLinearId(bSId0, bSId1) * numMmaTilePerSubtileB + (_mmak) + + atiles = atileInfo.vgprTiles[atileId] + btiles = btileInfo.vgprTiles[btileId] + dtiles = dtileInfo.vgprTiles[mma0 + mma1 * dtileInfo.localMMATileGrid[0]] + + + if hasScaleA: + mxsatileInfo = writer.states.mxsa.tileInfo + mxsbtileInfo = 
writer.states.mxsb.tileInfo + # Scale group index: one VGPR per 2 M-adjacent subtiles (ds_read_b32 loads 4 bytes) + subtileKShape = atileInfo.subtileShape[1] + subtileKGrid = atileInfo.localSubtileGrid[1] + scaleGroupA = (mma0 // 2) * subtileKGrid + mmak // subtileKShape + scaleGroupB = (mma1 // 2) * subtileKGrid + mmak // subtileKShape + + scaleAVgpr = mxsatileInfo.vgprTiles[4 * scaleGroupA].regList.regValues[0] if mxsatileInfo.mxBlock else -1 + scaleBVgpr = mxsbtileInfo.vgprTiles[4 * scaleGroupB].regList.regValues[0] if mxsbtileInfo.mxBlock else -1 + + sAsel = (mma0 % 2) + 2 * (mmak % 2) + sBsel = (mma1 % 2) + 2 * (mmak % 2) + else: + scaleAVgpr = -1 + scaleBVgpr = -1 + sAsel = sBsel = -1 + + module.add(emitMfmaInstruction(writer, kernel, atiles, btiles, dtiles, dtiles, + scaleAVgpr=scaleAVgpr, scaleBVgpr=scaleBVgpr, scaleAsel=sAsel, scaleBsel=sBsel, + comment="Emit MMFA code for MMA tiles C[%u, %u] += A[%u, %u] * B[%u, %u] sA = %u, sB = %u"%(mma0, mma1, mma0, mmak, mmak, mma1, sAsel, sBsel))) + + return module + + +################################################## +# Subroutine entry point for main loop impl +# +# This should be shared logic for both main loop and nnl loops +# It would be nice to have this support generic loop unroll +# and possibly SIMD spec paths +# +# Scheduling logic would be introduced here +# +def mainLoopImplPGR0(writer, kernel, isNLL = False): + module = Module() + module.addComment0("REMOVE WHEN IMPLEMNTED: Placeholder for subtile based main loop impl") + + hasMXScale = kernel["ProblemType"].get("MXBlockA", 0) and kernel["ProblemType"].get("MXBlockB", 0) + + label = Label("start", comment="") + module.add(label) + + if not isNLL: + module.add(globalReadDoSubtile('A', writer, kernel)) + module.add(globalReadDoSubtile('B', writer, kernel)) + if hasMXScale: + # Scale GR: load scale data from global to LDS (non-DTL) + module.add(globalReadDoScaleSubtile('MXSA', writer, kernel)) + module.add(globalReadDoScaleSubtile('MXSB', writer, kernel)) 
##################################################
# Subroutine entry point for preloop
#
# We will need to support different PGR values
# We will need to support different PLR values
#
def preLoop(writer, kernel):
    """Emit the (placeholder) pre-loop prefetch code.

    Emits ``PrefetchGlobalRead`` sets of global reads followed by
    ``PrefetchLocalRead`` sets of local reads for A and B. Scale reads are
    emitted only when both ``MXBlockA`` and ``MXBlockB`` are set in the
    problem type, and use the 'MXSA'/'MXSB' tensor names, matching how
    ``mainLoopImplPGR0`` calls the same scale helpers.

    Args:
        writer: kernel writer.
        kernel: kernel configuration dict.

    Returns:
        Module containing the emitted pre-loop code.
    """
    module = Module()
    module.addComment("")
    module.addComment("")
    pgr = kernel["PrefetchGlobalRead"]
    plr = kernel["PrefetchLocalRead"]
    module.addComment0("REMOVE WHEN IMPLEMNTED: Placeholder for subtile based Preloop code with PGR=%u"%pgr)

    # Same predicate mainLoopImplPGR0 uses to decide whether scale data exists.
    hasMXScale = kernel["ProblemType"].get("MXBlockA", 0) and kernel["ProblemType"].get("MXBlockB", 0)

    # Just sample impl, we can also interleave A/B loads
    for i in range(pgr):
        module.addComment0("Emitting %u-th set of GRs"%i)
        module.add(globalReadDoSubtile('A', writer, kernel))
        module.add(globalReadDoSubtile('B', writer, kernel))
        if hasMXScale:
            # Scale GR in preloop
            module.add(globalReadDoScaleSubtile('MXSA', writer, kernel))
            module.add(globalReadDoScaleSubtile('MXSB', writer, kernel))
        module.addComment("Add appropriate GR offset swap logic")
        module.addComment("")

    for i in range(plr):
        module.addComment("Add correct waits..")
        module.addComment0("Emitting LR to read data loaded by %u-th set of GRs"%(i))
        module.add(localReadDoSubtile('A', writer, kernel))
        module.add(localReadDoSubtile('B', writer, kernel))
        if hasMXScale:
            # Scale LR in preloop
            module.add(localReadDoScaleSubtile('MXSA', writer, kernel))
            module.add(localReadDoScaleSubtile('MXSB', writer, kernel))
        module.addComment("Add appropriate LR offset swap logic")

    module.addComment("")
    return module

##################################################
# Subroutine entry point for main loop
#
#
def mainLoop(writer, kernel):
    """Emit the main loop for the subtile-based kernel.

    With ``PrefetchGlobalRead == 2`` the loop is produced by a
    ``LogicalScheduler``; partition candidates are tried in the order
    returned by ``get_partition_candidates`` and the first one whose VGPR
    estimate fits the remaining budget is used. If none fits, the last
    candidate built is used anyway. With ``PrefetchGlobalRead == 0`` the
    non-pipelined ``mainLoopImplPGR0`` is emitted.

    Raises:
        AssertionError: if PGR is neither 0 nor 2, or if no partition
            candidate was produced for the PGR=2 path.
    """
    module = Module()
    pgr = kernel["PrefetchGlobalRead"]
    assert pgr in (0, 2), "SubtileBasedKernel only supports PGR=0 and PGR=2, got PGR=%d" % pgr

    # PGR=2 pipelining with LogicalScheduler
    if pgr == 2:
        from Tensile.Components.SubtileBasedLogicalScheduler import (
            LogicalScheduler, SchedulerConfig as MFMASchedulerConfig,
            ReadGranularity)
        tiA = writer.states.a.tileInfo
        tiB = writer.states.b.tileInfo
        scaleTiA = writer.states.mxsa.tileInfo if kernel["ProblemType"].get("MXBlockA", 0) else None
        scaleTiB = writer.states.mxsb.tileInfo if kernel["ProblemType"].get("MXBlockB", 0) else None

        lrAGran = ReadGranularity(mn=1, k=1)
        lrBGran = ReadGranularity(mn=1, k=1)
        # Based on current subtile shape. loadRatioGR == 2.0 has 2x2 granularity.
        grAGran = ReadGranularity(mn=1, k=2) if tiA.loadRatioGR <= 1.0 else ReadGranularity(mn=2, k=2)
        grBGran = ReadGranularity(mn=1, k=2) if tiB.loadRatioGR <= 1.0 else ReadGranularity(mn=2, k=2)
        lrSAGran = ReadGranularity(mn=2, k=2) if scaleTiA else None
        lrSBGran = ReadGranularity(mn=2, k=2) if scaleTiB else None
        grSAGran = ReadGranularity(mn=scaleTiA.localMMATileGrid[0], k=scaleTiA.localMMATileGrid[1]) if scaleTiA else None
        grSBGran = ReadGranularity(mn=scaleTiB.localMMATileGrid[0], k=scaleTiB.localMMATileGrid[1]) if scaleTiB else None

        vgprBudget = writer.states.regCaps["MaxVgpr"]
        vgprUsed = writer.vgprPool.size() - writer.vgprPool.available()

        # Stays None only when the candidate list is empty.
        scheduler = None
        for numPartM, numPartN in MFMASchedulerConfig.get_partition_candidates(tiA, tiB):
            cfg = MFMASchedulerConfig(
                numMFMATilesM=tiA.localMMATileGrid[0],
                numMFMATilesN=tiB.localMMATileGrid[0],
                numSubIterK=tiA.localMMATileGrid[1],
                lrA=lrAGran,
                lrB=lrBGran,
                grA=grAGran,
                grB=grBGran,
                lrSA=lrSAGran,
                lrSB=lrSBGran,
                grSA=grSAGran,
                grSB=grSBGran,
                numPartitionsM=numPartM,
                numPartitionsN=numPartN,
            )
            scheduler = LogicalScheduler(cfg)
            scheduler.build()

            numVgpr = scheduler.getNumVgpr(tiA, tiB, scaleTiA, scaleTiB)
            if vgprUsed + numVgpr <= vgprBudget:
                break

        assert scheduler is not None, "SubtileBasedKernel: no partition candidates for the PGR=2 scheduler"

        # Allocation and instruction emit
        scheduler.allocVgprTiles(writer, tiA, tiB,
                                 scaleTileInfoA=scaleTiA, scaleTileInfoB=scaleTiB)
        dtileInfo = writer.states.d.tileInfo
        scheduler.populate_instructions(
            writer, kernel,
            tileInfoA=tiA, tileInfoB=tiB, dtileInfo=dtileInfo,
            scaleTileInfoA=scaleTiA, scaleTileInfoB=scaleTiB)

        module.add(scheduler.emitAllLoops(writer, kernel))
        scheduler.deallocVgprTiles(writer)

    else:
        # PGR=0: non-pipelined
        module.addComment0("MAINLOOP")
        module.add(mainLoopImplPGR0(writer, kernel))
        module.addComment("")

    return module
# File: projects/hipblaslt/tensilelite/Tensile/Components/SubtileBasedLogicalScheduler.py
"""MFMATile-based logical scheduler.

Builds a logical schedule using MFMA tile indices as the core primitive,
with explicit per-operation load granularity for GR/LR on A, B, SA, SB.

The schedule is built in these passes:
    place_LRs                       — place LRs based on their granularities
    assign_vgpr_tiles               — assign physical vgprTileIds with per-tensor free-lists
    place_GRs                       — place GRs
    annotate_deps                   — annotate raw per-op dependencies
    remove_unnecessary_gr_deps      — remove redundant LR→GR deps
    remove_unnecessary_lr_deps      — remove redundant GR→LR deps covered by MFMA syncs
    remove_cross_deps               — replace cross-subIterK deps with wait preOps
    insert_gr_lr_inc                — insert lr_inc/gr_inc preOps at MT transitions
    group_lr_gr                     — serialize and group (produce paths for instructionSchedule)
    remove_unnecessary_wait_lr_sync — remove redundant wait_lr_sync after grouping
    emit                            — produce List[EmittedModule] with before-link chains

    TODO: add a pass to remove redundant wait_gr_sync on multi-partition configs
"""

from __future__ import annotations
from dataclasses import dataclass, field
from enum import IntEnum
from typing import Dict, List, Optional, Tuple, Union
import copy
import io
import math


class Pass(IntEnum):
    """Scheduler passes in dependency order.

    The numeric value defines topological order. The main pipeline is linear
    (each pass depends on the previous), except VGPR_TILES which forks off
    LR independently of GR.
    """
    LR = 0
    VGPR_TILES = 1
    GR = 2
    DEPS = 3
    REMOVE_GR_DEPS = 4
    REMOVE_LR_DEPS = 5
    REMOVE_DEPS = 6
    GR_INC = 7
    GROUP_LR_GR = 8
    REMOVE_WAIT_LR_SYNC = 9
    EMIT = 10
    BUILD = 11
    POPULATE = 12


# Pass -> (name of the scheduler method implementing it, prerequisite passes).
_PASS_PIPELINE = {
    Pass.LR:                  ('place_LRs', []),
    Pass.VGPR_TILES:          ('assign_vgpr_tiles', [Pass.LR]),
    Pass.GR:                  ('place_GRs', [Pass.LR]),
    Pass.DEPS:                ('annotate_deps', [Pass.GR]),
    Pass.REMOVE_GR_DEPS:      ('remove_unnecessary_gr_deps', [Pass.DEPS]),
    Pass.REMOVE_LR_DEPS:      ('remove_unnecessary_lr_deps', [Pass.REMOVE_GR_DEPS]),
    Pass.REMOVE_DEPS:         ('remove_cross_deps', [Pass.REMOVE_LR_DEPS]),
    Pass.GR_INC:              ('insert_gr_lr_inc', [Pass.REMOVE_DEPS]),
    Pass.GROUP_LR_GR:         ('group_lr_gr', [Pass.GR_INC]),
    Pass.REMOVE_WAIT_LR_SYNC: ('remove_unnecessary_wait_lr_sync', [Pass.GROUP_LR_GR]),
    Pass.EMIT:                ('emit', [Pass.REMOVE_WAIT_LR_SYNC]),
    Pass.BUILD:               ('build', [Pass.EMIT]),
    Pass.POPULATE:            ('populate_instructions', []),
}


# Maps each tensor name to the side ('A' or 'B') it belongs to.
TENSOR_SIDE = {'A': 'A', 'B': 'B', 'SA': 'A', 'SB': 'B'}


def fmt_mt(mt: int) -> str:
    """Format MT iteration integer as display string: 0 → 'n', 1 → 'n+1', 2 → 'n+2'."""
    if mt == 0:
        return "n"
    return f"n+{mt}"


# ── Core primitives ─────────────────────────────────────────

@dataclass
class MFMATileRange:
    """A rectangular range of MFMA tile coordinates for one read."""
    subIterK_start: int
    subIterK_end: int    # exclusive
    tileId_start: int
    tileId_end: int      # exclusive

    @property
    def subIterK_list(self) -> List[int]:
        """All subIterK indices in ``[subIterK_start, subIterK_end)``."""
        return [k for k in range(self.subIterK_start, self.subIterK_end)]

    @property
    def tileId_list(self) -> List[int]:
        """All tile ids in ``[tileId_start, tileId_end)``."""
        return [t for t in range(self.tileId_start, self.tileId_end)]

    def fmt_k(self) -> str:
        """Render the K range as ``[k]`` for one step, else ``[first,last]``."""
        ks = self.subIterK_list
        first, last = ks[0], ks[-1]
        return f"[{first}]" if len(ks) == 1 else f"[{first},{last}]"

    def fmt_tiles(self) -> str:
        """Render the tile range as ``[start-lastInclusive]``."""
        lastTile = self.tileId_end - 1
        return f"[{self.tileId_start}-{lastTile}]"


# ── Config ──────────────────────────────────────────────────

@dataclass
class ReadGranularity:
    """Load granularity for one operation on one tensor, measured in MFMA tiles.

    mn: how many MFMA tiles in the M (for A/SA) or N (for B/SB) dimension
    k: how many subIterK steps one read covers
    """
    mn: int
    k: int
@dataclass
class SchedulerConfig:
    """Configuration for the MFMATile-based scheduler."""
    numMFMATilesM: int          # MFMA tiles in M dimension (for A)
    numMFMATilesN: int          # MFMA tiles in N dimension (for B)
    numSubIterK: int            # subIterK steps within the macrotile
    lrA: ReadGranularity
    lrB: ReadGranularity
    grA: ReadGranularity
    grB: ReadGranularity
    lrSA: Optional[ReadGranularity] = None
    lrSB: Optional[ReadGranularity] = None
    grSA: Optional[ReadGranularity] = None
    grSB: Optional[ReadGranularity] = None
    numPartitionsM: int = 1     # partition grid in M dimension
    numPartitionsN: int = 1     # partition grid in N dimension

    @property
    def hasScale(self) -> bool:
        """True only when both scale local-read granularities are set."""
        if self.lrSA is None:
            return False
        return self.lrSB is not None

    @property
    def numPartitions(self) -> int:
        """Total number of partitions (M count times N count)."""
        return self.numPartitionsM * self.numPartitionsN

    @property
    def partitionSizeM(self) -> int:
        """MFMA tiles per partition along M; asserts an even split."""
        assert self.numMFMATilesM % self.numPartitionsM == 0
        return self.numMFMATilesM // self.numPartitionsM

    @property
    def partitionSizeN(self) -> int:
        """MFMA tiles per partition along N; asserts an even split."""
        assert self.numMFMATilesN % self.numPartitionsN == 0
        return self.numMFMATilesN // self.numPartitionsN

    @staticmethod
    def get_partition_candidates(tileInfoA, tileInfoB) -> list:
        """Return partition candidates as [(numPartitionsM, numPartitionsN), ...].

        Enumerates all divisors of MAX(M, N) in ascending order and
        partitions the larger dimension (N when the two are equal).
        Starts with (1, 1). Only 1xN or Nx1 partitions are produced, to
        allow VGPR pressure reduction.
        """
        M = tileInfoA.localMMATileGrid[0]
        N = tileInfoB.localMMATileGrid[0]
        largest = max(M, N)
        splitN = N >= M

        # range() already yields the divisors in ascending order.
        return [(1, d) if splitN else (d, 1)
                for d in range(1, largest + 1)
                if largest % d == 0]
# ── Schedule operation types ────────────────────────────────

@dataclass
class Emittable:
    """Base for anything placed in an EmittedModule."""
    # Not an __init__ argument; each subclass sets it in __post_init__.
    kind: str = field(init=False, default="")


@dataclass
class MFMAPlacement(Emittable):
    """MFMA operation consuming data for one subIterK."""
    subIterK: int
    tileA: MFMATileRange     # A tiles consumed
    tileB: MFMATileRange     # B tiles consumed
    deps: List['Dep'] = field(default_factory=list)        # populated by annotate_deps()
    preOps: List['BaseOp'] = field(default_factory=list)   # populated by remove_cross_deps()
    vgpr_tile_maps: Dict[str, List[dict]] = field(default_factory=dict)  # {tensor: [{groupIdx: vgprTileId}]} per unroll iter

    def __post_init__(self):
        self.kind = 'mfma'

    def __str__(self):
        tilesA = self.tileA.fmt_tiles()
        tilesB = self.tileB.fmt_tiles()
        return f"MFMAs (MT n, subIterK {self.subIterK} ) A : {tilesA} , B : {tilesB}"


@dataclass
class LRPlacement(Emittable):
    """Local Read placement for one tensor in one subIterK slot."""
    tensor: str              # 'A', 'B', 'SA', 'SB'
    mtIteration: int         # 0 = current MT, 1 = next MT
    tiles: MFMATileRange
    subIterK_slot: int       # which subIterK this LR is placed in
    partition: int = 0       # which partition this LR belongs to
    deps: List['Dep'] = field(default_factory=list)        # populated by annotate_deps()
    preOps: List['BaseOp'] = field(default_factory=list)   # populated by remove_cross_deps()
    vgpr_tile_map: List[dict] = field(default_factory=list)  # [{tileId: vgprTileId}] per unroll iter

    def __post_init__(self):
        self.kind = 'lr'

    def __str__(self):
        name = self.tensor.ljust(2)
        mt = fmt_mt(self.mtIteration)
        return f"LR {name} (MT {mt}, subIterK {self.tiles.fmt_k()}) {self.tiles.fmt_tiles()}"


@dataclass
class GRPlacement(Emittable):
    """Global Read placement for one tensor in one subIterK slot."""
    tensor: str              # 'A', 'B', 'SA', 'SB'
    mtIteration: int         # 1 = next MT, 2 = two MTs ahead
    tiles: MFMATileRange
    subIterK_slot: int       # which subIterK this GR is placed in
    partition: int = 0       # which partition this GR belongs to
    deps: List['Dep'] = field(default_factory=list)        # populated by annotate_deps()
    preOps: List['BaseOp'] = field(default_factory=list)   # populated by remove_cross_deps()

    def __post_init__(self):
        self.kind = 'gr'

    def __str__(self):
        mt = fmt_mt(self.mtIteration)
        return f"GR {self.tensor} (MT {mt}, subIterK {self.tiles.fmt_k()}) ids {self.tiles.fmt_tiles()}"


# ── Per-subIterK container ──────────────────────────────────

@dataclass
class SubIterKSlot:
    """All operations placed in one subIterK step."""
    subIterK: int
    mfma: Optional[MFMAPlacement] = None
    lrs: List[LRPlacement] = field(default_factory=list)
    grs: List[GRPlacement] = field(default_factory=list)


# ── Dependency types ────────────────────────────────────────

@dataclass
class WaitGRCounts:
    """Per-tensor inflight load counts for wait_gr preOp."""
    A: int = 0
    B: int = 0
    SA: int = 0
    SB: int = 0

    def __str__(self):
        # Only non-zero counts are shown; "0" when every count is zero.
        pairs = ((name, getattr(self, name)) for name in ('A', 'B', 'SA', 'SB'))
        shown = [f"{name}={count}" for name, count in pairs if count]
        return ",".join(shown) if shown else "0"


@dataclass
class BaseOp(Emittable):
    """Base class for typed dependency operations in a before-chain."""

    def __str__(self):
        return self.kind


@dataclass
class WaitGROp(BaseOp):
    """Wait for global reads to complete. Optionally includes a sync barrier."""
    wait_gr_counts: Optional[WaitGRCounts] = None
    has_sync: bool = False

    def __post_init__(self):
        self.kind = 'wait_gr'

    def __str__(self):
        if not self.wait_gr_counts:
            return self.kind
        return f"{self.kind}({self.wait_gr_counts})"


@dataclass
class WaitLROp(BaseOp):
    """Wait for local reads to complete. Optionally includes a sync barrier."""
    has_sync: bool = False

    def __post_init__(self):
        self.kind = 'wait_lr'


@dataclass
class SyncOp(BaseOp):
    """Standalone sync barrier."""

    def __post_init__(self):
        self.kind = 'sync'


@dataclass
class LRIncOp(BaseOp):
    """LDS buffer swap for local reads on a specific tensor."""
    tensor: str = ""

    def __post_init__(self):
        self.kind = 'lr_inc'

    def __str__(self):
        return "lr_inc(" + f"{self.tensor}" + ")"


@dataclass
class GRIncOp(BaseOp):
    """Pointer update + LDS swap for global reads on a specific tensor."""
    tensor: str = ""

    def __post_init__(self):
        self.kind = 'gr_inc'

    def __str__(self):
        return "gr_inc(" + f"{self.tensor}" + ")"


@dataclass
class SkipOp(BaseOp):
    """Skip guard: compare LoopCounter and branch."""
    compare: str = ""
    value: int = 0
    target: str = ""

    def __post_init__(self):
        self.kind = 'skip'

    @property
    def tensor(self) -> str:
        """``compare:value:target`` joined into a single label."""
        return ":".join((f"{self.compare}", f"{self.value}", f"{self.target}"))

    def __str__(self):
        return "skip(" + self.tensor + ")"


@dataclass
class Dep:
    """Dependency on another placement (annotate_deps output)."""
    ref: Union[LRPlacement, GRPlacement]
    mt_offset: int = 0   # 0 = same MT, -1 = prev MT, -2 = two MTs back, ...


# ── Emitted output ─────────────────────────────────────────

@dataclass
class EmittedModule:
    """One emitted module with before-link for instruction scheduling.

    Compatible with SubtileBasedInstructionScheduler.instructionSchedule().
    Instructions are left empty at the logical level — filled during emission.
    """
    moduleId: int = -1
    instructions: list = field(default_factory=list)
    before: Optional[int] = None   # moduleId that must complete before this module
    source: Optional[Emittable] = None

    @property
    def opType(self) -> str:
        """``kind`` of the source op, or "" when there is no source."""
        if not self.source:
            return ""
        return self.source.kind
# ── Main scheduler class ───────────────────────────────────

class LogicalScheduler:
    """Subtile-based logical scheduler.

    Builds the schedule as a sequence of passes (enumerated by ``Pass``),
    each producing testable intermediate output. Passes that have run are
    recorded in ``self._completed``; ``_ensure_pass`` runs any listed
    prerequisite that has not run yet.
    """

    def __init__(self, config: SchedulerConfig):
        """Store the configuration and reset all per-pass state."""
        self.config = config
        tensors = ['A', 'B']
        if config.hasScale:
            tensors = tensors + ['SA', 'SB']
        self.tensors: List[str] = tensors
        self._completed: set = set()  # tracks which passes have run (Pass enum members)
        self._partitions: Optional[List[List[SubIterKSlot]]] = None  # shared mutable state across passes
        self._emitted: Optional[List[List[EmittedModule]]] = None
        self._preloop_emitted: Optional[List[List[List[EmittedModule]]]] = None
        self._ngll_emitted: Optional[List[List[List[EmittedModule]]]] = None
        self._nll_emitted: Optional[List[List[List[EmittedModule]]]] = None

    def _ensure_pass(self, *prerequisites: Pass) -> None:
        """Run each prerequisite pass that is not yet in ``_completed``."""
        for required in prerequisites:
            if required in self._completed:
                continue
            methodName = _PASS_PIPELINE[required][0]
            getattr(self, methodName)()

    # ── Place LRs ─────────────────────────────────────────

    def _partition_tile_range(self, pi: int) -> dict:
        """Return {'A': (start, end), 'B': (start, end)} for partition pi.

        Uses COLUMN_MAJOR ordering: M (A) varies fastest, N (B) varies slowest.
        """
        cfg = self.config
        # COLUMN_MAJOR: M is inner (pi % M), N is outer (pi // M)
        piM = pi % cfg.numPartitionsM
        piN = pi // cfg.numPartitionsM
        startA = piM * cfg.partitionSizeM
        startB = piN * cfg.partitionSizeN
        rangeA = (startA, startA + cfg.partitionSizeM)
        rangeB = (startB, startB + cfg.partitionSizeN)
        return {'A': rangeA, 'B': rangeB}

    def place_LRs(self) -> List[List[SubIterKSlot]]:
        """Place MFMAs and LRs based on read granularities.

        Returns a list of partitions, each containing a list of SubIterKSlots.

        Each LR prefetches data for the next subIterK group. Within-partition
        prefetches use current partition tiles; cross-partition prefetches
        (wrapping) use next partition tiles.

        Two tracking mechanisms:
          - loaded_ranges: tracks tile ranges in VGPR per side. Wrapping LRs
            are only placed when the next partition's tiles aren't already loaded.
          - placed: tracks (tensor, k-range, tile-range) of non-wrapping LRs
            placed so far across partitions. Skips redundant K-prefetch when
            the same data was already loaded by an earlier partition.
        """
        cfg = self.config
        numP = cfg.numPartitions
        part_ranges = [self._partition_tile_range(pi) for pi in range(numP)]
        sides = ('A', 'B')

        # Track which tile ranges are currently loaded in VGPR (for wrapping decisions).
        loaded_ranges = {side: {part_ranges[0][side]} for side in sides}

        # Track placed K-prefetch LRs across partitions (for dedup).
        placed = set()

        partitions = []
        for pi in range(numP):
            cur = part_ranges[pi]
            nxt = part_ranges[(pi + 1) % numP]
            is_last = (pi == numP - 1)

            # The last partition always loads; others load only when the next
            # partition's range is not already resident for that side.
            load = {side: is_last or nxt[side] not in loaded_ranges[side]
                    for side in sides}

            slots = self._place_LRs_for_partition(cur, nxt, is_last, load, placed)
            for slot in slots:
                for lr in slot.lrs:
                    lr.partition = pi
            partitions.append(slots)

            for side in sides:
                if load[side]:
                    loaded_ranges[side] = {cur[side], nxt[side]}

        self._partitions = partitions
        self._completed.add(Pass.LR)
        return partitions
+ placed = set() + + partitions = [] + for pi in range(numP): + cur, nxt = part_ranges[pi], part_ranges[(pi + 1) % numP] + is_last = (pi == numP - 1) + + load = {} + for side in ('A', 'B'): + load[side] = is_last or nxt[side] not in loaded_ranges[side] + + slots = self._place_LRs_for_partition(cur, nxt, is_last, load, placed) + for slot in slots: + for lr in slot.lrs: + lr.partition = pi + partitions.append(slots) + + for side in ('A', 'B'): + if load[side]: + loaded_ranges[side] = {cur[side], nxt[side]} + + self._partitions = partitions + self._completed.add(Pass.LR) + return partitions + + def _place_LRs_for_partition(self, cur: tuple, nxt: tuple, + is_last: bool, + load: dict, + placed: set) -> List[SubIterKSlot]: + """Place MFMAs and LRs for one partition.""" + cfg = self.config + numK = cfg.numSubIterK + multi_part = cfg.numPartitions > 1 + + slots = [SubIterKSlot(subIterK=k) for k in range(numK)] + slot_mt = {} # slot_k → lr_mt string, for MT-homogeneity enforcement + + # MFMAs + for k in range(numK): + slots[k].mfma = MFMAPlacement( + subIterK=k, + tileA=MFMATileRange(k, k + 1, cur['A'][0], cur['A'][1]), + tileB=MFMATileRange(k, k + 1, cur['B'][0], cur['B'][1]), + ) + + # All tensors that can participate. + all_tensors = [('A', cfg.lrA), ('B', cfg.lrB)] + if cfg.hasScale: + all_tensors.append(('SA', cfg.lrSA)) + all_tensors.append(('SB', cfg.lrSB)) + + # Place LRs grouped by k_gran. + # - Non-wrapping (K-prefetch): all tensors, deduped by placed set. + # - Wrapping (cross-partition): only tensors whose side needs loading. 
+ for k_gran in sorted(set(g.k for _, g in all_tensors)): + group_all = [(t, g) for t, g in all_tensors if g.k == k_gran] + num_chunks = numK // k_gran + for chunk_idx in range(num_chunks): + next_chunk = (chunk_idx + 1) % num_chunks + is_wrap = (next_chunk == 0) + lr_mt = 1 if is_last and is_wrap else 0 + lr_k_start = next_chunk * k_gran + lr_k_end = lr_k_start + k_gran + base_slot = chunk_idx * k_gran + + # For wrapping chunks, only include tensors whose side is + # loading so that slot assignment reflects active tensors. + # A and B always participate (their wrapping is gated inside + # the loop) to keep slot indices stable for their k_gran group. + if is_wrap and multi_part: + group = [(t, g) for t, g in group_all + if t in ('A', 'B') or load['A' if t in ('A', 'SA') else 'B']] + else: + group = group_all + + # Group by side (A/SA together, B/SB together) for slot assignment + sides = [[(t, g) for t, g in group if t in ('A', 'SA')], + [(t, g) for t, g in group if t in ('B', 'SB')]] + sides = [s for s in sides if s] + + for side_idx, side in enumerate(sides): + slot_k = base_slot + (side_idx % k_gran) + # Redirect LRs away from slots committed to a different MT, + # keeping each slot MT-homogeneous. + # This reduces the number of wait_gr_sync needed as all LRs + # in the same subIterK wait for the same MT iteration. + committed = slot_mt.get(slot_k) + if committed is not None and committed != lr_mt: + slot_k = numK - 1 + + for tensor, gran in side: + tile_range = nxt if (is_wrap or not multi_part) else cur + side_key = 'A' if tensor in ('A', 'SA') else 'B' + ts, te = tile_range[side_key] + + # Wrapping: use load dict. Non-wrapping: use placed set.
+ if is_wrap and multi_part: + if not load[side_key]: + continue + else: + lr_key = (tensor, lr_k_start, lr_k_end, ts, te) + if lr_key in placed: + continue + placed.add(lr_key) + + lr = LRPlacement( + tensor=tensor, + mtIteration=lr_mt, + tiles=MFMATileRange(lr_k_start, lr_k_end, ts, te), + subIterK_slot=slot_k, + ) + slots[slot_k].lrs.append(lr) + slot_mt[slot_k] = lr_mt + + return slots + + # ── Assign VGPR tile IDs (free-list allocation) ────── + + def assign_vgpr_tiles(self): + """Assign physical vgprTileIds to all placements (A, B, SA, SB). + + Free-list allocator with per-tensor FIFO queues, iterated until + convergence (or at most MAX_UNROLL = 8 unroll iterations). + + Three phases: + 1. Scan all MFMAs to find last read position for each + (tensor, tileId, k_data_group) key. + 2. Walk execution order in a loop: each iteration feeds the + previous next_iter as the starting active state. Appends + one tile-map dict per iteration to each placement's list. + Stops when next_iter matches the next_iter of an earlier iteration (convergence). + 3. Record unroll_factor, needs_unrolling, and max tile_peaks. + + Keys use a unified formula parameterized by ReadGranularity: + key = (tensor, (tileId // lr_gran.mn) * lr_gran.mn, (k // lr_gran.k) * lr_gran.k) + + Sets self.tile_peaks (per-tensor max across unrolls), + self.needs_unrolling, self.unroll_factor.
+ """ + self._ensure_pass(Pass.LR) + + cfg = self.config + numK = cfg.numSubIterK + MAX_UNROLL = 8 + + lr_grans = {'A': cfg.lrA, 'B': cfg.lrB} + if cfg.hasScale: + lr_grans['SA'] = cfg.lrSA + lr_grans['SB'] = cfg.lrSB + + # ── Phase 1: find last MFMA read for each key ── + last_read = {} # key -> flat position + for pi, slots in enumerate(self._partitions): + for slot in slots: + if not slot.mfma: + continue + pos = pi * numK + slot.subIterK + k = slot.subIterK + for tensor in self.tensors: + side = TENSOR_SIDE[tensor] + tileRange = slot.mfma.tileA if side == 'A' else slot.mfma.tileB + gran = lr_grans[tensor] + for t in tileRange.tileId_list: + group = (t // gran.mn) * gran.mn + k_chunk = (k // gran.k) * gran.k + last_read[(tensor, group, k_chunk)] = pos + + # ── Phase 2: iterate until convergence ── + from collections import deque + + class _FreeList: + __slots__ = ('free', 'next_id', 'active_count', 'peak') + def __init__(self): + self.free = deque() + self.next_id = 0 + self.active_count = 0 + self.peak = 0 + def alloc(self): + if self.free: + vid = self.free.popleft() # FIFO for convergence + else: + vid = self.next_id + self.next_id += 1 + self.active_count += 1 + self.peak = max(self.peak, self.active_count) + return vid + def release(self, vid): + self.free.append(vid) + self.active_count -= 1 + + max_peaks = {t: 0 for t in self.tensors} + carry_active = {} + all_next_iters = [] # next_iter from each iteration, for cycle detection + + pools = {t: _FreeList() for t in self.tensors} + + for unroll_iter in range(MAX_UNROLL): + if unroll_iter == 0: + active = {} + else: + active = dict(carry_active) + # Reset active_count to match carry_active (tiles that survived + # as live from the previous iteration's wrapping LRs). 
+ for t in self.tensors: + pools[t].active_count = sum( + 1 for key in active if key[0] == t) + + next_iter = {} + + for pi, slots in enumerate(self._partitions): + for slot in slots: + pos = pi * numK + slot.subIterK + k = slot.subIterK + + # ── MFMA reads: look up or seed ── + if slot.mfma: + for tensor in self.tensors: + side = TENSOR_SIDE[tensor] + tileRange = slot.mfma.tileA if side == 'A' else slot.mfma.tileB + gran = lr_grans[tensor] + tile_map = {} + for t in tileRange.tileId_list: + group = (t // gran.mn) * gran.mn + k_chunk = (k // gran.k) * gran.k + key = (tensor, group, k_chunk) + if key not in active: + active[key] = pools[tensor].alloc() + tile_map[group] = active[key] + slot.mfma.vgpr_tile_maps.setdefault(tensor, []).append(tile_map) + + # ── LR writes: allocate new tiles ── + for lr in slot.lrs: + tensor = lr.tensor + is_wrapping = lr.mtIteration != 0 + target = next_iter if is_wrapping else active + + gran = lr_grans[tensor] + tile_map = {} + seen_keys = set() + for t in lr.tiles.tileId_list: + group = (t // gran.mn) * gran.mn + for lk in lr.tiles.subIterK_list: + k_chunk = (lk // gran.k) * gran.k + key = (tensor, group, k_chunk) + if key in seen_keys: + continue + seen_keys.add(key) + if key in target: + pools[tensor].release(target[key]) + vid = pools[tensor].alloc() + target[key] = vid + tile_map[group] = vid + lr.vgpr_tile_map.append(tile_map) + + # ── Release tiles whose last read was at this position ── + to_release = [key for key, lr_pos in last_read.items() + if lr_pos == pos and key in active] + for key in to_release: + pools[key[0]].release(active[key]) + del active[key] + + # Track max peaks across iterations + for t in self.tensors: + max_peaks[t] = max(max_peaks[t], pools[t].peak) + + # Check convergence: if this iteration's next_iter matches + # any previous iteration's next_iter, we found a cycle. + # The cycle period is (current_iter - matching_iter). 
+ # All iterations from matching_iter to current_iter-1 form + # the repeating pattern; iterations before that are prologue. + converged = False + for prev_idx, prev_ni in enumerate(all_next_iters): + if next_iter == prev_ni: + # Strip tile maps from the redundant convergence iteration. + for pi2, slots2 in enumerate(self._partitions): + for slot2 in slots2: + if slot2.mfma: + for tensor in self.tensors: + if tensor in slot2.mfma.vgpr_tile_maps: + slot2.mfma.vgpr_tile_maps[tensor].pop() + for lr2 in slot2.lrs: + lr2.vgpr_tile_map.pop() + converged = True + break + if converged: + break + + # Carry next_iter forward as active for next iteration + all_next_iters.append(next_iter) + carry_active = next_iter + else: + assert False, (f"assign_vgpr_tiles did not converge after " + f"{MAX_UNROLL} unroll iterations") + + # ── Phase 3: record results ── + # unroll_factor = number of unique iterations (convergence iteration excluded) + self.unroll_factor = unroll_iter + self.needs_unrolling = self.unroll_factor > 1 + self.tile_peaks = max_peaks + + self._completed.add(Pass.VGPR_TILES) + + # ── Place GRs ───────────────────────────────────────── + + def _build_gr_list(self, part_ranges, offsetMT, offsetPartition, + debug=False): + """Phase 1: Build ordered GR list from placed MFMAs. + + For each partition × subIterK, derive target partition/MT from + the MFMA and offsets. Add GRs (A, B, SA, SB) with tile and K + ranges snapped to GR granularity. Dedup within same MT level, + then remove n+1 entries that also appear at n+2 (cross-MT dedup). + + For each subIterK, we apply offsetMT on MT and offsetPartition on partition. + + Returns list of (tensor, mt_str, tile_start, tile_end, + k_start, k_end, gr_gran). 
+ """ + cfg = self.config + numP = cfg.numPartitions + + seen = set() + gr_list = [] + + for pi in range(numP): + partition_slots = self._partitions[pi] + + target_pi = (pi + offsetPartition) % numP + wraps = (pi + offsetPartition) >= numP + mt_val = offsetMT + (1 if wraps else 0) + + target_range = part_ranges[target_pi] + + for slot in partition_slots: + k = slot.mfma.subIterK + + items = [('A', target_range['A'], cfg.grA), + ('B', target_range['B'], cfg.grB)] + if cfg.hasScale: + items.append(('SA', target_range['A'], cfg.grSA)) + items.append(('SB', target_range['B'], cfg.grSB)) + + for tensor, (t_start, t_end), gr_gran in items: + mn = gr_gran.mn + k_gran = gr_gran.k + + gr_tile_start = (t_start // mn) * mn + gr_tile_end = ((t_end + mn - 1) // mn) * mn + + gr_k_start = (k // k_gran) * k_gran + gr_k_end = gr_k_start + k_gran + + key = (tensor, mt_val, gr_tile_start, gr_tile_end, + gr_k_start, gr_k_end) + if key in seen: + continue + seen.add(key) + gr_list.append((tensor, mt_val, gr_tile_start, + gr_tile_end, gr_k_start, gr_k_end, + gr_gran)) + + # Cross-MT dedup: if a tile/k range appears at both n+1 and n+2, + # the n+1 load is redundant — the previous iteration's n+2 already + # wrote the same data into LDS. Remove the n+1 duplicate. + base_mt = offsetMT + n2_keys = {(t, ts, te, ks, ke) + for t, mt, ts, te, ks, ke, _ in gr_list + if mt != base_mt} + gr_list = [entry for entry in gr_list + if entry[1] != base_mt or + (entry[0], entry[2], entry[3], entry[4], entry[5]) + not in n2_keys] + + if debug: + print(f"Phase 1: {len(gr_list)} GR entries") + for i, (t, mt, ts, te, ks, ke, g) in enumerate(gr_list): + loads = ((te - ts) // g.mn) * ((ke - ks) // g.k) + print(f" [{i}] {t:2s} {fmt_mt(mt)} tiles[{ts},{te - 1}] k[{ks},{ke - 1}] " + f"gr_gran(mn={g.mn},k={g.k}) loads={loads}") + + return gr_list + + def _build_lr_conflict_map(self): + """Build per-partition LR(MT n) info for LDS conflict checking. 
+ + Returns dict: (partition_idx, tensor) -> list of + (subIterK_slot, k_start, k_end). + """ + lr_mt_n_info = {} + for pi, partition_slots in enumerate(self._partitions): + for slot in partition_slots: + for lr in slot.lrs: + if lr.mtIteration == 0: + lr_mt_n_info.setdefault((pi, lr.tensor), []).append( + (slot.subIterK, + lr.tiles.subIterK_start, + lr.tiles.subIterK_end)) + return lr_mt_n_info + + @staticmethod + def _has_lr_conflict(lr_mt_n_info, tensor, mt_val, pi, subIterK, + gr_k_start, gr_k_end): + """Return True if placing GR(mt_val) at (pi, subIterK) conflicts. + + GR(MT n+2) writes the same LDS buffer as MT n, so it conflicts + only if a later LR(MT n) in the same partition accesses an + overlapping subIterK range. + """ + if mt_val != 2: + return False + for lr_slot, lr_ks, lr_ke in lr_mt_n_info.get((pi, tensor), []): + if lr_slot > subIterK and gr_k_start < lr_ke and lr_ks < gr_k_end: + return True + return False + + def _distribute_grs(self, gr_list, lr_mt_n_info, debug=False): + """Phase 2: Distribute GR atoms across partition × subIterK slots. + + Explodes GR entries into atomic loads, distributes them into flat + buckets respecting LDS conflict constraints and load balance, + then remerges consecutive atoms and places them into partitions. + """ + cfg = self.config + numK = cfg.numSubIterK + numP = cfg.numPartitions + numSlots = numP * numK + + # 2a. Explode GR entries into atomic loads (1 load each) + atoms = [] + for tensor, mt_val, t_start, t_end, k_start, k_end, gr_gran in gr_list: + mn = gr_gran.mn + for pos in range(t_start, t_end, mn): + atoms.append((tensor, mt_val, pos, pos + mn, k_start, k_end)) + + loads_per_slot = len(atoms) // numSlots + + # 2b. 
Distribute atoms into flat buckets [0..numSlots), + # each bucket maps to (partition=flat//numK, subIterK=flat%numK) + buckets = [[] for _ in range(numSlots)] + for atom in atoms: + tensor, mt_val, _, _, ks, ke = atom + cur = 0 + while cur < numSlots - 1: + pi = cur // numK + subK = cur % numK + if (not self._has_lr_conflict(lr_mt_n_info, tensor, mt_val, + pi, subK, ks, ke) and + len(buckets[cur]) < loads_per_slot): + break + cur += 1 + buckets[cur].append(atom) + + if debug: + print(f"Phase 2b: {len(atoms)} atoms, {numSlots} slots, " + f"{loads_per_slot} per slot") + for flat, bucket in enumerate(buckets): + pi = flat // numK + si = flat % numK + if bucket: + items = ", ".join( + f"{t} {fmt_mt(mt)} tile[{ts},{te-1}] k[{ks},{ke-1}]" + for t, mt, ts, te, ks, ke in bucket) + print(f" P{pi} s{si}: {len(bucket)} atoms — {items}") + else: + print(f" P{pi} s{si}: empty") + + # 2c. Remerge consecutive atoms and place into partitions + for flat, bucket in enumerate(buckets): + pi = flat // numK + si = flat % numK + target_slot = self._partitions[pi][si] + for atom in bucket: + tensor, mt_val, ts, te, ks, ke = atom + if target_slot.grs: + prev = target_slot.grs[-1] + if (prev.tensor == tensor and + prev.mtIteration == mt_val and + prev.tiles.subIterK_start == ks and + prev.tiles.subIterK_end == ke and + prev.tiles.tileId_end == ts): + prev.tiles = MFMATileRange(ks, ke, prev.tiles.tileId_start, te) + continue + target_slot.grs.append(GRPlacement( + tensor=tensor, mtIteration=mt_val, + tiles=MFMATileRange(ks, ke, ts, te), + subIterK_slot=si, + partition=pi)) + + def place_GRs(self) -> List[SubIterKSlot]: + """Place Global Reads by iterating MFMAs across partitions. + + Phase 1: Build ordered GR list from partition traversal respecting gr granularities. + Phase 2: Distribute evenly GR atoms across all (partition, subIterK) slots. GR atoms being the smallest load granularity for a specific tensor. 
+ + This should give a scheduling respecting the following rules: + - GRs are in the order we expect them from the LR point of view + - we respect the GR granularities (can change the above rule a bit) + - Overall loads are spread across all subIterKs of all partitions. + + """ + self._ensure_pass(Pass.LR) + + part_ranges = [self._partition_tile_range(pi) + for pi in range(self.config.numPartitions)] + + # TODO: cover PGR3 (offsetMT and offsetPartition may differ) + offsetMT = 1 + offsetPartition = 1 + # Build ordered list of GRs to place for the entire MT based on the partitioning ordering and the GR granularities. + gr_list = self._build_gr_list(part_ranges, offsetMT, offsetPartition) + # Map to keep track of LR(MT n) for each partition and tensor, used for LDS double buffer conflict checking when placing GRs. + lr_mt_n_info = self._build_lr_conflict_map() + # Distribute GRs across partitions. + self._distribute_grs(gr_list, lr_mt_n_info) + + self._completed.add(Pass.GR) + return self._partitions[0] + + # ── Annotate dependencies ───────────────────────────── + + def annotate_deps(self): + """Annotate each placement with its raw before-dependencies. + + Populates the `deps` field on MFMAPlacement, LRPlacement, and + GRPlacement objects in self._partitions. Each lr_ref/gr_ref BaseOp + is resolved to point at the specific placement it depends on. + + Iterates all partitions.
Two-pass per partition: + - Pass 1: build lookups from existing placements + - Pass 2: populate .before on each placement + + Rules: + - MFMA(subIterK=k) depends on all LRs that loaded subIterK=k data + (cross-partition: LRs for a tensor may be in any partition) + - LR depends on GR for same tensor (data must be in LDS) + - GR depends on collision LR for same tensor (LDS double-buffer) + """ + self._ensure_pass(Pass.GR) + cfg = self.config + numK = cfg.numSubIterK + + # Build global lr_by_data across all partitions (MFMA deps are cross-partition) + # lr_by_data[data_k][tensor] → list of LRPlacements loading subIterK=data_k + lr_by_data = [{} for _ in range(numK)] + # gr_by_tensor[tensor] → list of all GRPlacements (LR→GR deps are cross-partition) + gr_by_tensor = {} + # lr_by_tensor[tensor] → list of all LRPlacements (GR→LR collision is cross-partition) + lr_by_tensor = {} + for slots in self._partitions: + for slot in slots: + for lr in slot.lrs: + for data_k in lr.tiles.subIterK_list: + lr_by_data[data_k].setdefault(lr.tensor, []).append(lr) + lr_by_tensor.setdefault(lr.tensor, []).append(lr) + for gr in slot.grs: + gr_by_tensor.setdefault(gr.tensor, []).append(gr) + + for pi, slots in enumerate(self._partitions): + self._annotate_deps_partition(pi, slots, cfg, lr_by_data, + gr_by_tensor, lr_by_tensor) + + self._completed.add(Pass.DEPS) + + def _annotate_deps_partition(self, pi: int, slots: List[SubIterKSlot], + cfg: SchedulerConfig, lr_by_data: list, + gr_by_tensor: dict, lr_by_tensor: dict): + """Annotate deps for a single partition (in-place on placements).""" + numK = len(slots) + + # Clear any previous annotations (idempotent re-runs) + for slot in slots: + if slot.mfma: + slot.mfma.deps.clear() + for lr in slot.lrs: + lr.deps.clear() + for gr in slot.grs: + gr.deps.clear() + + # ── Pass 1: build per-partition lookups ── + # lr_by_slot[k][tensor] → LRPlacement at subIterK=k + # gr_by_slot[k][tensor] → GRPlacement at subIterK=k + # (lr_by_data, gr_by_tensor, 
lr_by_tensor are built globally in annotate_deps) + lr_by_slot = [{} for _ in range(numK)] + gr_by_slot = [{} for _ in range(numK)] + + for k, slot in enumerate(slots): + for lr in slot.lrs: + lr_by_slot[k][lr.tensor] = lr + + for gr in slot.grs: + gr_by_slot[k][gr.tensor] = gr + + # ── Pass 2: populate deps on each placement ── + # mt_offset: 0 = same MT, -1 = prev MT, -2 = two MTs back, etc. + # Within one iteration, execution order per slot is MFMA → LR → GR, + # and slots run in order 0, 1, 2, ... + _order = {'MFMA': 0, 'LR': 1, 'GR': 2} + + def _slot_offset(consumer_partition, consumer_slot, consumer_type, producer): + """Offset from partition+slot ordering: 0 if producer ran first, -1 otherwise.""" + prod_partition = producer.partition + if prod_partition < consumer_partition: + return 0 + if prod_partition > consumer_partition: + return -1 + prod_slot = producer.subIterK_slot + if prod_slot < consumer_slot: + return 0 + if prod_slot > consumer_slot: + return -1 + prod_type = 'LR' if isinstance(producer, LRPlacement) else 'GR' + return -1 if _order[prod_type] >= _order[consumer_type] else 0 + + def _mt_offset(consumer_partition, consumer_slot, consumer_type, producer, consumer=None): + # MFMA→LR: MFMA always consumes mt=0 (current). + if consumer_type == 'MFMA' and isinstance(producer, LRPlacement): + if producer.mtIteration > 0: + return -producer.mtIteration + # LR→GR: mt difference determines how many iterations back. + if consumer_type == 'LR' and isinstance(producer, GRPlacement) and consumer: + diff = producer.mtIteration - consumer.mtIteration + if diff != 0: + return -diff + # Same effective mt: partition+slot ordering decides. 
+ return _slot_offset(consumer_partition, consumer_slot, consumer_type, producer) + + def _tiles_overlap(mfma, lr_tensor, lr_tiles): + """Check if LR tile range overlaps with MFMA's tile range for that tensor.""" + # SA/SB follow A/B tile ranges respectively + if lr_tensor in ('A', 'SA'): + mfma_range = mfma.tileA + else: + mfma_range = mfma.tileB + return (lr_tiles.tileId_start < mfma_range.tileId_end and + lr_tiles.tileId_end > mfma_range.tileId_start) + + def _range_overlaps(a: MFMATileRange, b: MFMATileRange) -> bool: + """Check if two tile ranges overlap on both tile ids and subIterK.""" + return (a.tileId_start < b.tileId_end and + a.tileId_end > b.tileId_start and + a.subIterK_start < b.subIterK_end and + a.subIterK_end > b.subIterK_start) + + def _dedup_deps(deps): + if len(deps) <= 1: + return deps + def _exec_order(dep): + return (dep.mt_offset, dep.ref.partition, dep.ref.subIterK_slot) + return [max(deps, key=_exec_order)] + + for k, slot in enumerate(slots): + # MFMA: depends on the most recent LR per tensor (tile-overlapping). + # Uses lr_by_tensor (all LRs across partitions) so that a more recent + # LR loading a different subIterK still subsumes older data deps. + if slot.mfma: + for t in self.tensors: + deps_for_t = [] + for lr in lr_by_tensor.get(t, []): + if _tiles_overlap(slot.mfma, t, lr.tiles): + deps_for_t.append(Dep( + ref=lr, mt_offset=_mt_offset(pi, k, 'MFMA', lr))) + slot.mfma.deps.extend(_dedup_deps(deps_for_t)) + + # LR: depends on GR (data must be in LDS before reading) + # Cross-partition: the GR that loaded the matching tiles may be + # in a different partition. Filter by tile overlap. + for lr in slot.lrs: + for gr in gr_by_tensor.get(lr.tensor, []): + if _range_overlaps(lr.tiles, gr.tiles): + lr.deps.append(Dep( + ref=gr, mt_offset=_mt_offset(pi, k, 'LR', gr, consumer=lr))) + + # GR: depends on collision LR (LDS double-buffer) + # GR(n+x) collides with LR(n+x-2) — same buffer, period 2. + # target_data = gr.mtIteration - 2. 
For each LR of same tensor, + # mt_offset = target_data - lr.mtIteration. Dedup keeps latest. + # GR(2)→LR(0): mt_offset = 0 (same iteration) + # GR(2)→LR(1): mt_offset = -1 (prev iter LR(1) handled n) + # GR(1)→LR(0): mt_offset = -1 (prev iter LR(0) handled n-1) + for gr in slot.grs: + target_data = gr.mtIteration - 2 + for lr in lr_by_tensor.get(gr.tensor, []): + if _range_overlaps(lr.tiles, gr.tiles): + mt_off = target_data - lr.mtIteration + gr.deps.append(Dep(ref=lr, mt_offset=mt_off)) + if not gr.deps: + raise ValueError( + f"GR {gr.tensor} mt={fmt_mt(gr.mtIteration)} at slot {k} " + f"has no overlapping LR(n) dependency") + + for slot in slots: + for lr in slot.lrs: + lr.deps = _dedup_deps(lr.deps) + for gr in slot.grs: + gr.deps = _dedup_deps(gr.deps) + + # ── Remove unnecessary GR deps ──────────────────────── + + def remove_unnecessary_gr_deps(self): + """Remove GR deps on LRs that are already guaranteed by an earlier LR's wait. + + Per tensor, walks LR placements in execution order. If an earlier LR + already waits for a GR with equal or higher exec_order, the later LR's + dep is redundant and removed. + + Wraps around: the first LR's dep is compared against the last from the + previous MT iteration (max dep exec_order shifted by mt_offset -1). 
+ """ + self._ensure_pass(Pass.DEPS) + + def _dep_exec_order(dep): + return (dep.mt_offset, dep.ref.partition, dep.ref.subIterK_slot) + + for tensor in self.tensors: + lr_with_gr_deps = [] + for pi, slots in enumerate(self._partitions): + for slot in slots: + for lr in slot.lrs: + if lr.tensor == tensor and lr.deps: + dep = lr.deps[0] + if isinstance(dep.ref, GRPlacement): + lr_with_gr_deps.append((lr, dep)) + + if len(lr_with_gr_deps) <= 1: + continue + + max_eo = max(_dep_exec_order(dep) for _, dep in lr_with_gr_deps) + max_guaranteed = (max_eo[0] - 1, max_eo[1], max_eo[2]) + + for lr, dep in lr_with_gr_deps: + eo = _dep_exec_order(dep) + if eo <= max_guaranteed: + lr.deps.clear() + else: + max_guaranteed = eo + + self._completed.add(Pass.REMOVE_GR_DEPS) + + # ── Remove unnecessary LR deps ──────────────────────── + + def remove_unnecessary_lr_deps(self): + """Remove GR→LR collision deps already covered by an earlier sync. + + A GR with an LR dep creates a sync point. + We get the latest LR guaranteed at this sync point (prevLRDep), which is the max of: + - the GR's own LR dep + - the MFMA's same-tensor LR dep + + If the current GR's LR dep (currLRDep) has exec order <= prevLRDep, it is already guaranteed + and can be removed. + + Exec order is (mt_offset, partition, subIterK_slot). + On wrap-around the exec order is shifted by MT-1.
+ """ + self._ensure_pass(Pass.REMOVE_GR_DEPS) + + def _dep_exec_order(dep): + return (dep.mt_offset, dep.ref.partition, dep.ref.subIterK_slot) + + gr_with_lr_deps = [] + for pi, slots in enumerate(self._partitions): + for slot in slots: + for gr in slot.grs: + if gr.deps: + dep = gr.deps[0] + if isinstance(dep.ref, LRPlacement): + gr_with_lr_deps.append((pi, slot.subIterK, gr, dep)) + + if len(gr_with_lr_deps) <= 1: + self._completed.add(Pass.REMOVE_LR_DEPS) + return + + mfma_by_pos = {} + for pi, slots in enumerate(self._partitions): + for slot in slots: + if slot.mfma: + mfma_by_pos[(pi, slot.subIterK)] = slot.mfma + + gr_with_lr_deps.sort(key=lambda x: (x[0], x[1])) + + last_sync = (gr_with_lr_deps[-1][0], gr_with_lr_deps[-1][1]) + # Per-tensor max LR exec order guaranteed at last_sync. + last_sync_eo = {} + + def _update_sync_eo(pos, shift): + """Collect per-tensor max LR exec order at pos (MFMA deps).""" + eo_map = {} + mfma = mfma_by_pos.get(pos) + if mfma and mfma.deps: + for d in mfma.deps: + if isinstance(d.ref, LRPlacement): + t = d.ref.tensor + d_eo = _dep_exec_order(d) + if shift: + d_eo = (d_eo[0] - 1, d_eo[1], d_eo[2]) + if t not in eo_map or d_eo > eo_map[t]: + eo_map[t] = d_eo + return eo_map + + # Seed from last position (previous MT → shift by -1). + last_sync_eo = _update_sync_eo(last_sync, shift=True) + + for pi, subIterK, gr, dep in gr_with_lr_deps: + curr_eo = _dep_exec_order(dep) + tensor = dep.ref.tensor + + prev_lr_eo = last_sync_eo.get(tensor) + if prev_lr_eo is not None and curr_eo <= prev_lr_eo: + gr.deps.clear() + continue + + last_sync = (pi, subIterK) + last_sync_eo = _update_sync_eo(last_sync, shift=False) + # The GR's own dep is also a sync point at this slot. 
+ if tensor not in last_sync_eo or curr_eo > last_sync_eo[tensor]: + last_sync_eo[tensor] = curr_eo + + self._completed.add(Pass.REMOVE_LR_DEPS) + + # ── Remove cross-subIterK deps ───────────────────────── + + def _gr_granularity(self, tensor: str) -> ReadGranularity: + """Return GR granularity for a tensor.""" + return {'A': self.config.grA, 'B': self.config.grB, + 'SA': self.config.grSA, 'SB': self.config.grSB}[tensor] + + def _compute_inflight_loads(self, consumer_pi: int, consumer_slot: int, + tensor: str, dep_ref: Dep) -> WaitGRCounts: + """Count inflight GR atomic loads between a dep GR and the consumer. + + Walks backward through the flattened schedule (all partitions x subIterK) + from the consumer position, counting atomic GR loads for all tensors. + Stops when reaching the dependency GR (dep_ref.ref) after accounting + for mt_offset wraps. + + Returns per-tensor inflight load counts. + """ + numP = len(self._partitions) + numK = len(self._partitions[0]) + flat_len = numP * numK + + consumer_flat = consumer_pi * numK + consumer_slot + + wraps_needed = abs(dep_ref.mt_offset) + + counts = WaitGRCounts() + wraps_completed = 0 + pos = consumer_flat + + max_steps = (wraps_needed + 1) * flat_len + for _ in range(max_steps): + pos = (pos - 1) % flat_len + if pos == flat_len - 1 and _ > 0: + wraps_completed += 1 + + pi = pos // numK + slot_k = pos % numK + slot = self._partitions[pi][slot_k] + + for gr in slot.grs: + if gr.tensor == tensor and gr is dep_ref.ref and wraps_completed >= wraps_needed: + return counts + gr_gran = self._gr_granularity(gr.tensor) + tiles = gr.tiles + n_tile = (tiles.tileId_end - tiles.tileId_start) // gr_gran.mn + n_k = (tiles.subIterK_end - tiles.subIterK_start) // gr_gran.k + cur = getattr(counts, gr.tensor) + setattr(counts, gr.tensor, cur + n_tile * n_k) + + return counts + + def remove_cross_deps(self): + """Replace cross-subIterK deps with wait preOps. 
+ + For each placement, separates deps into same-subIterK (kept) and + cross-subIterK (converted to preOps): + - MFMA depending on LRs → single wait_lr + - GR depending on LRs → single wait_lr_sync + - LR depending on GRs → single wait_gr_sync with per-tensor inflight counts + """ + self._ensure_pass(Pass.REMOVE_LR_DEPS) + + for pi, slots in enumerate(self._partitions): + for slot in slots: + # ── MFMA ── + if slot.mfma: + same, cross = self._split_deps(slot.mfma.deps, pi, slot.subIterK) + slot.mfma.deps = same + slot.mfma.preOps = [] + if cross: + slot.mfma.preOps.append(WaitLROp()) + + # ── LRs ── + for lr in slot.lrs: + same, cross = self._split_deps(lr.deps, pi, lr.subIterK_slot) + lr.deps = same + lr.preOps = [] + if cross: + dep = cross[0] + counts = self._compute_inflight_loads( + pi, lr.subIterK_slot, dep.ref.tensor, dep) + lr.preOps.append(WaitGROp(wait_gr_counts=counts, + has_sync=True)) + + # ── GRs ── + for gr in slot.grs: + same, cross = self._split_deps(gr.deps, pi, gr.subIterK_slot) + gr.deps = same + has_lr_dep = any( + isinstance(d.ref, LRPlacement) + for d in same + cross) + gr.preOps = [WaitLROp(has_sync=True)] if has_lr_dep else [] + + self._completed.add(Pass.REMOVE_DEPS) + + def insert_gr_lr_inc(self): + """Insert gr_inc/lr_inc preOps at MacroTile iteration transitions. + + Walks all LR and GR placements in global execution order + (partition 0 slots → partition 1 slots → ..., within each slot: LR then GR). + Tracks per-tensor the last-seen mtIteration. 
When a tensor's mtIteration + changes, inserts a BaseOp into that placement's preOps: + - lr_inc for LR placements + - gr_inc for GR placements + """ + self._ensure_pass(Pass.REMOVE_DEPS) + + last_lr_mt = {} # tensor -> mtIteration for LR only + last_gr_mt = {} # tensor -> mtIteration for GR only + first_lr = {} # tensor -> first LR placement seen + lr_inc_tensors = set() # tensors that already received lr_inc + + for pi, slots in enumerate(self._partitions): + for slot in slots: + for lr in slot.lrs: + tensor = lr.tensor + mt = lr.mtIteration + if tensor not in first_lr: + first_lr[tensor] = lr + if tensor in last_lr_mt and last_lr_mt[tensor] != mt: + lr.preOps.append(LRIncOp(tensor=tensor)) + lr_inc_tensors.add(tensor) + last_lr_mt[tensor] = mt + for gr in slot.grs: + tensor = gr.tensor + mt = gr.mtIteration + prev_mt = last_gr_mt.get(tensor, last_lr_mt.get(tensor)) + if prev_mt is not None and prev_mt != mt: + if gr.tiles.tileId_start == 0: + gr.preOps.append(GRIncOp(tensor=tensor)) + last_gr_mt[tensor] = mt + + # Handle wrap-around: tensors with a single LR per iteration (e.g. SA, SB) + # still need lr_inc because the GR at end-of-iteration writes to the other + # LDS buffer, and the next iteration's LR must swap to read from it. + # Safe: preOps are consumed by emit(), not during this walk. + for tensor, lr in first_lr.items(): + if tensor not in lr_inc_tensors: + last = last_gr_mt.get(tensor, last_lr_mt.get(tensor)) + if last is not None and last != lr.mtIteration: + lr.preOps.append(LRIncOp(tensor=tensor)) + + self._completed.add(Pass.GR_INC) + + # ── Group LR/GR chains ───────────────────────────────────── + + _LR_GR_ORDER = ['A', 'B', 'SA', 'SB'] + + @staticmethod + def _merge_preops(all_preops: List[List['BaseOp']]) -> List['BaseOp']: + """Merge preOps from multiple placements. + + Combines wait_gr/wait_gr_sync counts into a single BaseOp, deduplicates barrier ops + (wait_lr_sync, wait_lr), and collects the rest. 
def group_lr_gr(self):
    """Group LR and GR placements into chains within each subIterK.

    Phase 1 — LR chain:
      Sort LRs by tensor order (A, B, SA, SB). Build a dep chain so each
      LR depends on the previous one. Merge all preOps onto the first LR
      (wait_gr counts are combined, other preOps are collected).

    Phase 2 — GR chain:
      Sort GRs by tensor order (A, B, SA, SB). Build a dep chain. If any
      GR originally had same-subIterK deps, replace the first GR's deps with
      a single dep on the last LR of the phase-1 chain. Each GR keeps its
      own preOps; only redundant wait_lr_sync ops are removed (keep the
      first occurrence only).

    The sort works on copies, so the order of slot.lrs / slot.grs is not
    changed; only the deps and preOps of the placements are rewritten.
    Requires Pass.GR_INC and marks Pass.GROUP_LR_GR as completed.
    """
    self._ensure_pass(Pass.GR_INC)

    order = self._LR_GR_ORDER

    for pi, slots in enumerate(self._partitions):
        for slot in slots:
            # ── Phase 1: LR chain ──
            ordered_lrs = sorted(
                slot.lrs,
                key=lambda lr: order.index(lr.tensor))

            if len(ordered_lrs) > 1:
                # Merge preOps onto first LR
                merged = self._merge_preops(
                    [lr.preOps for lr in ordered_lrs])
                ordered_lrs[0].preOps = merged
                for lr in ordered_lrs[1:]:
                    lr.preOps = []

                # Build chain: each LR depends on the previous
                # (previous deps of LRs 1..n-1 are discarded; the first LR keeps its own)
                for i in range(1, len(ordered_lrs)):
                    ordered_lrs[i].deps = [
                        Dep(ref=ordered_lrs[i - 1], mt_offset=0)]

            # Tail of the LR chain; None when the slot has no LRs.
            last_lr = ordered_lrs[-1] if ordered_lrs else None

            # ── Phase 2: GR chain ──
            ordered_grs = sorted(
                slot.grs,
                key=lambda gr: order.index(gr.tensor))

            if len(ordered_grs) > 1:
                # Check if any GR has same-subIterK deps
                # (must be sampled before the deps are overwritten below)
                any_deps = any(gr.deps for gr in ordered_grs)

                # Remove redundant wait_lr_sync (keep only the first)
                seen_wait_lr_sync = False
                for gr in ordered_grs:
                    if seen_wait_lr_sync:
                        gr.preOps = [
                            op for op in gr.preOps
                            if not (isinstance(op, WaitLROp) and op.has_sync)]
                    elif any(isinstance(op, WaitLROp) and op.has_sync
                             for op in gr.preOps):
                        seen_wait_lr_sync = True

                # First GR: if any GR had deps, point to last LR
                if any_deps and last_lr is not None:
                    ordered_grs[0].deps = [
                        Dep(ref=last_lr, mt_offset=0)]
                else:
                    ordered_grs[0].deps = []

                # Build chain: each GR depends on the previous
                for i in range(1, len(ordered_grs)):
                    ordered_grs[i].deps = [
                        Dep(ref=ordered_grs[i - 1], mt_offset=0)]
            elif len(ordered_grs) == 1:
                # Single GR: still consolidate dep to last LR if it had deps
                # (unlike the multi-GR branch, deps are left untouched otherwise)
                if ordered_grs[0].deps and last_lr is not None:
                    ordered_grs[0].deps = [
                        Dep(ref=last_lr, mt_offset=0)]

    self._completed.add(Pass.GROUP_LR_GR)
+ Given that we always use wait_lr cnt=0, grouping can guarantee future wait_lr_sync. + + A GR's wait_lr_sync is unnecessary when: + 1. The GR has no same-subIterK deps (deps is empty after grouping) + 2. The previous subIterK's GRs already have a wait_lr_sync + 3. That previous wait_lr_sync is ordered after all LRs in the + previous subIterK (the GR has deps on the LR chain) + + In that case, all prior LR reads were already synced by the previous + subIterK's barrier, and the current GR doesn't conflict with any LRs + in its own subIterK, so the second wait_lr_sync is redundant. + + Finally, any remaining wait_lr_sync on a GR with no deps is downgraded + to just sync — the wait_lr is already guaranteed by the MFMA op in the + same subIterK. + """ + self._ensure_pass(Pass.GROUP_LR_GR) + + for pi, slots in enumerate(self._partitions): + for si, slot in enumerate(slots): + if not slot.grs: + continue + first_gr = slot.grs[0] + has_wait_lr_sync = any( + isinstance(op, WaitLROp) and op.has_sync for op in first_gr.preOps) + if not has_wait_lr_sync: + continue + has_deps = bool(first_gr.deps) + if has_deps: + continue + # Check previous subIterK in the same partition + if si == 0: + continue + prev_slot = slots[si - 1] + if not prev_slot.grs: + continue + prev_first_gr = prev_slot.grs[0] + prev_has_wait_lr_sync = any( + isinstance(op, WaitLROp) and op.has_sync for op in prev_first_gr.preOps) + prev_deps_on_lrs = bool(prev_first_gr.deps) + if prev_has_wait_lr_sync and prev_deps_on_lrs: + first_gr.preOps = [ + op for op in first_gr.preOps + if not (isinstance(op, WaitLROp) and op.has_sync)] + + # Downgrade remaining wait_lr_sync → sync on GRs with no LR deps. + # The MFMA in the same subIterK already ensures wait_lr. 
+ for pi, slots in enumerate(self._partitions): + for slot in slots: + for gr in slot.grs: + if not any(isinstance(op, WaitLROp) and op.has_sync for op in gr.preOps): + continue + has_lr_dep = False + node = gr + while node and node.deps: + ref = node.deps[0].ref + if isinstance(ref, LRPlacement): + has_lr_dep = True + break + node = ref + if has_lr_dep: + continue + gr.preOps = [ + SyncOp() if (isinstance(op, WaitLROp) and op.has_sync) else op + for op in gr.preOps] + + self._completed.add(Pass.REMOVE_WAIT_LR_SYNC) + + def _split_deps(self, deps: List[Dep], consumer_pi: int, + consumer_slot: int) -> Tuple[List[Dep], List[Dep]]: + """Split deps into same-subIterK and cross-subIterK lists. + + A dep is "same subIterK" if mt_offset == 0 AND the producer is in the + same partition and same subIterK slot as the consumer. + """ + same, cross = [], [] + for dep in deps: + if (dep.mt_offset == 0 and + dep.ref.partition == consumer_pi and + dep.ref.subIterK_slot == consumer_slot): + same.append(dep) + else: + cross.append(dep) + return same, cross + + def emit(self) -> List[List[List[EmittedModule]]]: + """Convert placements into EmittedModule chains per partition per subIterK. + + Returns [partition][subIterK][EmittedModule]. 
+ + Each subIterK list contains: + - Primary modules (MFMA, LRs, GRs) + - Dependency modules (wait_gr, wait_lr, sync, lr_inc, gr_inc) + emitted from preOps, chained via before-links + + The before-link topology: + - wait_gr is standalone (no incoming before-link), but later deps chain from it + - WaitGROp with has_sync expands to two modules: wait_gr then sync + - WaitLROp with has_sync expands to two modules: wait_lr then sync + - Same-subIterK Dep deps become ordering constraints (no new module) + """ + self._ensure_pass(Pass.REMOVE_WAIT_LR_SYNC) + + all_partitions = [] + for pi, slots in enumerate(self._partitions): + partition_emitted = [] + for slot in slots: + emitted: List[EmittedModule] = [] + placement_to_id = {} + + def add(source: Emittable) -> int: + mid = len(emitted) + emitted.append(EmittedModule(moduleId=mid, source=source)) + return mid + + def setBefore(moduleId: int, beforeId: int) -> None: + if beforeId is None or beforeId == moduleId: + return + cur = emitted[moduleId].before + if cur is None: + emitted[moduleId].before = beforeId + return + assert cur == beforeId, \ + f"EmittedModule {moduleId} has multiple before deps: {cur} and {beforeId}" + + # Step 1: emit primary modules + placements = [] + if slot.mfma: + placements.append(slot.mfma) + for lr in slot.lrs: + placements.append(lr) + for gr in slot.grs: + placements.append(gr) + + for placement in placements: + mid = add(placement) + placement_to_id[id(placement)] = mid + + # Step 2: wire before-chains from preOps + deps + for placement in placements: + curId = placement_to_id[id(placement)] + prevId = None + lastDepId = None + firstPreOpId = None + + # preOps + for preOp in placement.preOps: + if isinstance(preOp, WaitGROp): + depId = add(preOp) + prevId = depId + if firstPreOpId is None: + firstPreOpId = depId + if preOp.has_sync: + depId = add(SyncOp()) + setBefore(depId, prevId) + prevId = depId + lastDepId = depId + continue + elif isinstance(preOp, WaitLROp) and preOp.has_sync: + 
depId = add(WaitLROp()) + setBefore(depId, prevId) + prevId = depId + lastDepId = depId + if firstPreOpId is None: + firstPreOpId = depId + depId = add(SyncOp()) + setBefore(depId, prevId) + prevId = depId + lastDepId = depId + continue + else: + depId = add(preOp) + setBefore(depId, prevId) + prevId = depId + lastDepId = depId + if firstPreOpId is None: + firstPreOpId = depId + + # deps (same-subIterK Deps — ordering constraints) + # Wire dep refs as roots of the preOp chain so the + # dependency is not lost when preOps are present. + for dep in placement.deps: + ref_id = placement_to_id.get(id(dep.ref)) + if ref_id is not None: + if firstPreOpId is not None: + setBefore(firstPreOpId, ref_id) + else: + prevId = ref_id + + # Final link: primary module points to last dep + if lastDepId is not None: + setBefore(curId, lastDepId) + elif prevId is not None: + setBefore(curId, prevId) + + partition_emitted.append(emitted) + all_partitions.append(partition_emitted) + + self._emitted = all_partitions + self._completed.add(Pass.EMIT) + return all_partitions + + def build(self): + """Build mainloop """ + self.emit() + self._completed.add(Pass.BUILD) + + # ── Loop variant derivation ──────────────────────────── + + @staticmethod + def _rewire_before(emitted: List[EmittedModule], + removed_ids: set) -> List[EmittedModule]: + """Rewire before-links that point to removed modules. + + If em.before points to a removed module, follow that module's own + before link until we find a non-removed module (or None). + """ + id_to_em = {em.moduleId: em for em in emitted} + for em in emitted: + if em.moduleId in removed_ids: + continue + b = em.before + while b is not None and b in removed_ids: + b = id_to_em[b].before + em.before = b + return [em for em in emitted if em.moduleId not in removed_ids] + + def build_ngll(self) -> List[List[List[EmittedModule]]]: + """NGLL (No Global Load Loop): mainloop without GR(n+2), GR_INC. 
def build_nll(self) -> List[List[List[EmittedModule]]]:
    """NLL (No Load Loop): mainloop without GR, LR(n+1), GR_INC, LR_INC,
    WaitGR(n+1)+Sync. Keeps LR(n), MFMAs, WaitGR(n) with zeroed counts.

    Works on a deep copy of each subIterK list in self._emitted, so the
    mainloop modules are not modified. The result has the same
    [partition][subIterK][EmittedModule] shape, is stored in
    self._nll_emitted and returned. Requires Pass.EMIT.

    NOTE(review): the first pass below removes only 'gr', 'lr' with
    mtIteration == 1, 'gr_inc' and 'lr_inc'. No 'wait_gr' module is ever
    added to ``removed``, so the "WaitGR(n+1)+Sync" removal named above only
    triggers for a sync whose before-link is one of those removed kinds —
    confirm whether wait_gr modules were meant to be removed as well.
    """
    self._ensure_pass(Pass.EMIT)

    nll = []
    for partition_emitted in self._emitted:
        part_nll = []
        for emitted in partition_emitted:
            new_emitted = copy.deepcopy(emitted)
            removed = set()  # moduleIds to drop from this subIterK

            for em in new_emitted:
                src = em.source
                if em.opType == 'gr':
                    removed.add(em.moduleId)
                elif em.opType == 'lr' and src.mtIteration == 1:
                    removed.add(em.moduleId)
                elif em.opType in ('gr_inc', 'lr_inc'):
                    removed.add(em.moduleId)

            # Zero inflight counts on remaining WaitGR.
            # (the `not in removed` guard is always true here: nothing above removes wait_gr)
            for em in new_emitted:
                if em.opType == 'wait_gr' and em.moduleId not in removed:
                    em.source.wait_gr_counts = WaitGRCounts()

            # Find Sync modules paired with removed wait_gr
            # (as written: any sync whose direct before-link is already in `removed`)
            for em in new_emitted:
                if em.opType == 'sync' and em.before is not None \
                        and em.before in removed:
                    removed.add(em.moduleId)

            # Remove WaitLR if no LR remains in this subIterK
            has_lr = any(em.opType == 'lr' and em.moduleId not in removed
                         for em in new_emitted)
            if not has_lr:
                for em in new_emitted:
                    if em.opType == 'wait_lr':
                        removed.add(em.moduleId)

            # Drop the removed modules and splice the before-links around them.
            part_nll.append(self._rewire_before(new_emitted, removed))
        nll.append(part_nll)

    self._nll_emitted = nll
    return nll
def build_preloop(self) -> List[List[List[EmittedModule]]]:
    """Build preloop: pipeline initialization sequence before mainloop.

    Modules are emitted in exactly this order (all waits/syncs are listed
    explicitly here):
      GR(MT 0)           — every tensor, full subIterK range, all M/N tiles
      GR_INC             — one per tensor
      wait_gr            — with a default-constructed WaitGRCounts()
      sync
      LR                 — every tensor, partition-0 tile range, k range from cfg.lr*.k
      wait_lr
      skip(LE 1, 'NLL')
      GR(MT 1)           — every tensor, partition-0 tile range
      skip(LE 2, 'NGLL')

    NOTE(review): no LR_INC is emitted and the GR_INC after GR(MT 1) is
    commented out below — confirm this is intentional.

    Returns [1 partition][1 subIterK][EmittedModules] to match emit() shape;
    the same structure is stored in self._preloop_emitted.
    """
    cfg = self.config
    numK = cfg.numSubIterK
    # M/N tile bounds of partition 0, keyed by 'A' / 'B'; unpacked into MFMATileRange below.
    part0 = self._partition_tile_range(0)
    all_tiles = {
        'A': MFMATileRange(0, numK, 0, cfg.numMFMATilesM),
        'B': MFMATileRange(0, numK, 0, cfg.numMFMATilesN),
    }
    part0_tiles = {
        'A': MFMATileRange(0, numK, *part0['A']),
        'B': MFMATileRange(0, numK, *part0['B']),
    }
    # LR ranges cover only one LR granularity in k (cfg.lrX.k), not all subIterKs.
    lr_tiles = {
        'A': MFMATileRange(0, cfg.lrA.k, *part0['A']),
        'B': MFMATileRange(0, cfg.lrB.k, *part0['B']),
    }
    if cfg.hasScale:
        # Scale tensors reuse the M/N range of the matrix they scale.
        lr_tiles['SA'] = MFMATileRange(0, cfg.lrSA.k, *part0['A'])
        lr_tiles['SB'] = MFMATileRange(0, cfg.lrSB.k, *part0['B'])

    emitted = self._to_emitted([
        *self._preloop_make_gr(0, all_tiles),
        *self._make_tensor_depops(GRIncOp),
        WaitGROp(wait_gr_counts=WaitGRCounts()),
        SyncOp(),
        *self._preloop_make_lr(lr_tiles),
        WaitLROp(),
        SkipOp(compare='LE', value=1, target='NLL'),
        *self._preloop_make_gr(1, part0_tiles),
        # *self._make_tensor_depops(GRIncOp),
        SkipOp(compare='LE', value=2, target='NGLL'),
    ])

    # Wrap as one partition containing one subIterK.
    self._preloop_emitted = [[emitted]]
    return self._preloop_emitted
+ """ + from Tensile.Components.SubtileBasedInstructionScheduler import instructionSchedule + from rocisa.code import Module + + module = Module(label) + module.addComment0(f"{label} start") + for pi, partition_emitted in enumerate(emitted_3d): + for k, em_list in enumerate(partition_emitted): + module.addComment0(f"partition={pi} subIterK={k}") + hasMFMA = any(em.opType == 'mfma' for em in em_list) + if hasMFMA: + scheduled = instructionSchedule(em_list) + module.add(scheduled) + else: + for em in em_list: + for inst in em.instructions: + module.add(inst) + module.addComment0(f"{label} end") + return module + + def emitAllLoops(self, writer, kernel): + """Emit complete loop structure: preloop + mainloop + NGLL + NLL. + + Owns all control flow (labels, branches, counter management). + For unroll_factor > 1, emits per-unroll copies with correct vgpr tiles. + Each mainloop exit jumps to its corresponding NGLL→NLL pair. + """ + from rocisa.code import Module, Label + from rocisa.instruction import (SSubU32, SCmpEQU32, SCBranchSCC0, + SCBranchSCC1, SBranch) + from rocisa.container import sgpr + + assert Pass.POPULATE in self._completed, \ + "populate_instructions() must be called before emitAllLoops()" + + module = Module("AllLoops") + uf = self.unroll_factor + + # ── Preloop ── + module.add(self._emitLoop(writer, kernel, "PRELOOP", + self._preloop_emitted)) + + # ── Mainloop ── + module.addComment0("MAINLOOP") + loopBegin = Label("LoopBeginL", "") + + if uf == 1: + module.add(loopBegin) + module.add(self._emitLoop(writer, kernel, "MAINLOOP", + self._emitted_per_unroll[0])) + module.add(SSubU32(dst=sgpr("LoopCounterL"), + src0=sgpr("LoopCounterL"), src1=1, + comment="dec counterL")) + module.add(SCmpEQU32(src0=sgpr("LoopCounterL"), src1=2, + comment="counterL == 2?")) + module.add(SCBranchSCC0(labelName=loopBegin.getLabelName(), + comment="restart mainloop")) + else: + exitLabels = [Label(f"ExitC{ui}", "") for ui in range(uf - 1)] + module.add(loopBegin) + for ui in 
range(uf): + module.add(self._emitLoop(writer, kernel, f"MAINLOOP_C{ui}", + self._emitted_per_unroll[ui])) + module.add(SSubU32(dst=sgpr("LoopCounterL"), + src0=sgpr("LoopCounterL"), src1=1, + comment=f"dec counterL (copy {ui})")) + module.add(SCmpEQU32(src0=sgpr("LoopCounterL"), src1=2, + comment=f"counterL == 2? (copy {ui} exit)")) + if ui < uf - 1: + module.add(SCBranchSCC1( + labelName=exitLabels[ui].getLabelName(), + comment=f"copy {ui} exit → NGLL_C{ui}")) + else: + module.add(SCBranchSCC0( + labelName=loopBegin.getLabelName(), + comment="restart mainloop")) + + # ── NGLL + NLL exit paths ── + endLabel = Label("SkipToEnd", "") + module.add(Label("SkipMainloop", "")) + module.add(Label("SkipToNGLL", "")) + + if uf == 1: + module.addComment0("NGLL") + module.add(self._emitLoop(writer, kernel, "NGLL", + self._ngll_per_unroll[0])) + module.addComment0("NLL") + module.add(Label("SkipToNLL", "")) + module.add(self._emitLoop(writer, kernel, "NLL", + self._nll_per_unroll[0])) + else: + # Fall-through from last mainloop copy + last = uf - 1 + module.addComment0(f"NGLL_C{last}") + module.add(self._emitLoop(writer, kernel, f"NGLL_C{last}", + self._ngll_per_unroll[last])) + module.addComment0(f"NLL_C{last}") + module.add(self._emitLoop(writer, kernel, f"NLL_C{last}", + self._nll_per_unroll[last])) + module.add(SBranch(labelName=endLabel.getLabelName(), + comment="skip other exit paths")) + + for ui in range(uf - 1): + module.add(exitLabels[ui]) + module.addComment0(f"NGLL_C{ui}") + module.add(self._emitLoop(writer, kernel, f"NGLL_C{ui}", + self._ngll_per_unroll[ui])) + module.addComment0(f"NLL_C{ui}") + module.add(self._emitLoop(writer, kernel, f"NLL_C{ui}", + self._nll_per_unroll[ui])) + if ui < uf - 2: + module.add(SBranch(labelName=endLabel.getLabelName(), + comment="skip other exit paths")) + + # NLLEarly: reached from preloop when LoopCounterL <= 1 + module.add(SBranch(labelName=endLabel.getLabelName(), + comment="skip NLLEarly")) + module.addComment0("NLLEarly") + 
def getNumVgpr(self, tileInfoA, tileInfoB,
               scaleTileInfoA=None, scaleTileInfoB=None) -> int:
    """Compute how many VGPRs the A, B (and optionally SA, SB) tiles need.

    For each tensor the peak tile count from self.tile_peaks (0 if absent)
    is multiplied by the per-tile register count
    ceil(mmaTileRegCount * lrGranularity.k * lrGranularity.mn).
    Scale tensors are included only when the config has scales and both
    scale tile infos are provided. Nothing is allocated.

    Must be called after scheduling is complete (requires Pass.VGPR_TILES).
    """
    self._ensure_pass(Pass.VGPR_TILES)

    def regs_for(tensor, tileInfo, lrGran):
        per_tile = int(math.ceil(tileInfo.mmaTileRegCount * lrGran.k * lrGran.mn))
        return self.tile_peaks.get(tensor, 0) * per_tile

    cfg = self.config
    needed = regs_for('A', tileInfoA, cfg.lrA) + regs_for('B', tileInfoB, cfg.lrB)

    if not (cfg.hasScale and scaleTileInfoA and scaleTileInfoB):
        return needed

    scale_needed = regs_for('SA', scaleTileInfoA, cfg.lrSA) \
        + regs_for('SB', scaleTileInfoB, cfg.lrSB)
    return needed + scale_needed
def deallocVgprTiles(self, writer):
    """Deallocate VGPR tiles allocated by allocVgprTiles.

    allocVgprTiles checks registers out in blocks of up to 4 and appends
    each block's registers consecutively, so the register at every 4th
    position of a tile is the start of a checked-out block. Those block
    starts are handed back to the pool the tile was created with
    (tile.regList.regPool). Afterwards all four per-tensor tile lists are
    reset to empty lists.

    ``writer`` is accepted for symmetry with allocVgprTiles and is not used.
    """
    def _release(tiles):
        for tile in tiles:
            pool = tile.regList.regPool
            # Use the position from enumerate rather than tile.index(reg):
            # index() rescans the tile for every register and looks the
            # position up by value instead of by actual position.
            for pos, reg in enumerate(tile):
                if pos % 4 == 0:
                    pool.checkIn(reg)

    _release(self.vgprTilesA)
    _release(self.vgprTilesB)
    _release(self.vgprTilesSA)
    _release(self.vgprTilesSB)
    self.vgprTilesA = []
    self.vgprTilesB = []
    self.vgprTilesSA = []
    self.vgprTilesSB = []
from placements and preOps. + + Uses per-tensor VGPR tile lists (vgprTilesA/B/SA/SB) indexed by + vgprTileId from placement tile maps. + """ + if self._preloop_emitted is None or self._ngll_emitted is None \ + or self._nll_emitted is None: + self.build() + + from Tensile.Components.SubtileBasedInstructionEmitter import InstructionEmitter + + emitter = InstructionEmitter( + writer, kernel, self.config, + tileInfoA, tileInfoB, dtileInfo, + self.vgprTilesA, self.vgprTilesB, + scaleTileInfoA, scaleTileInfoB, + self.vgprTilesSA, self.vgprTilesSB, + ) + + # Rebuild all loop variants from current _emitted (which now has + # vgpr_tile_maps populated by assign_vgpr_tiles, unlike the stale + # copies from build()). + self.build_preloop() + self.build_ngll() + self.build_nll() + + emitter.populate(self._preloop_emitted, unroll_iter=0) + + self._emitted_per_unroll = [] + self._ngll_per_unroll = [] + self._nll_per_unroll = [] + for ui in range(self.unroll_factor): + em_copy = copy.deepcopy(self._emitted) + emitter.populate(em_copy, unroll_iter=ui) + self._emitted_per_unroll.append(em_copy) + + ngll_copy = copy.deepcopy(self._ngll_emitted) + ngll_ui = (ui + 1) % self.unroll_factor + emitter.populate(ngll_copy, unroll_iter=ngll_ui) + self._ngll_per_unroll.append(ngll_copy) + + nll_copy = copy.deepcopy(self._nll_emitted) + nll_ui = (ui + 2) % self.unroll_factor + emitter.populate(nll_copy, unroll_iter=nll_ui) + self._nll_per_unroll.append(nll_copy) + + self._completed.add(Pass.POPULATE) + + # ── Print helpers ─────────────────────────────────────── + + @staticmethod + def _fmt_tensor(tensor: str) -> str: + """Pad tensor name to 2 chars for alignment: 'A' -> 'A ', 'SA' -> 'SA'.""" + return tensor.ljust(2) + + + def print_lr(self, partitions: List[List[SubIterKSlot]] = None) -> str: + """Print place_LRs output in design doc format.""" + if partitions is None: + partitions = self._partitions + buf = io.StringIO() + buf.write("MAINLOOP:\n") + for pi, slots in enumerate(partitions): + 
buf.write(f" Partition {pi}:\n") + self._print_lr_partition(buf, slots) + return buf.getvalue() + + def _print_lr_partition(self, buf, slots): + for slot in slots: + buf.write(f" subIterK={slot.subIterK}:\n") + if slot.mfma: + m = slot.mfma + buf.write(f" MFMAs (MT n, subIterK {m.subIterK} ) " + f"A : {m.tileA.fmt_tiles()} , B : {m.tileB.fmt_tiles()}\n") + for lr in slot.lrs: + t = self._fmt_tensor(lr.tensor) + buf.write(f" LR {t} (MT {fmt_mt(lr.mtIteration)}, " + f"subIterK {lr.tiles.fmt_k()}) " + f"{lr.tiles.fmt_tiles()}\n") + return buf.getvalue() + + def print_vgpr(self) -> str: + """Print assign_vgpr_tiles output: LRs + MFMAs with vgprTileId annotations.""" + partitions = self._partitions + buf = io.StringIO() + buf.write(f"needsUnrolling: {self.needs_unrolling}, " + f"unrollFactor: {self.unroll_factor}\n") + peaks_str = ", ".join(f"{t}: {cnt}" for t, cnt in sorted(self.tile_peaks.items())) + buf.write(f"vgprTiles: {peaks_str}\n") + for ui in range(self.unroll_factor): + if self.unroll_factor > 1: + buf.write(f"MAINLOOP (unroll {ui}):\n") + else: + buf.write("MAINLOOP:\n") + for pi, slots in enumerate(partitions): + buf.write(f" Partition {pi}:\n") + for slot in slots: + buf.write(f" subIterK={slot.subIterK}:\n") + if slot.mfma: + m = slot.mfma + tiles_str = "" + parts = [] + for tensor in self.tensors: + maps = m.vgpr_tile_maps.get(tensor) + if maps: + parts.append(f"{tensor}:" + str(maps[ui])) + if parts: + tiles_str = " " + ", ".join(parts) + buf.write(f" MFMAs (MT n, subIterK {m.subIterK} ) " + f"A : {m.tileA.fmt_tiles()} , " + f"B : {m.tileB.fmt_tiles()}{tiles_str}\n") + for lr in slot.lrs: + tile_str = "" + if lr.vgpr_tile_map: + tile_str = f" tiles:{lr.vgpr_tile_map[ui]}" + t = self._fmt_tensor(lr.tensor) + buf.write(f" LR {t} (MT {fmt_mt(lr.mtIteration)}, " + f"subIterK {lr.tiles.fmt_k()}) " + f"{lr.tiles.fmt_tiles()}{tile_str}\n") + return buf.getvalue() + + def print_gr(self) -> str: + """Print place_GRs output: LRs + MFMAs + GR placements, all 
partitions.""" + partitions = self._partitions + buf = io.StringIO() + buf.write("MAINLOOP:\n") + for pi, slots in enumerate(partitions): + buf.write(f" Partition {pi}:\n") + for slot in slots: + buf.write(f" subIterK={slot.subIterK}:\n") + if slot.mfma: + m = slot.mfma + buf.write(f" MFMAs (MT n, subIterK {m.subIterK} ) " + f"A : {m.tileA.fmt_tiles()} , " + f"B : {m.tileB.fmt_tiles()}\n") + for lr in slot.lrs: + t = self._fmt_tensor(lr.tensor) + buf.write(f" LR {t} (MT {fmt_mt(lr.mtIteration)}, " + f"subIterK {lr.tiles.fmt_k()}) " + f"{lr.tiles.fmt_tiles()}\n") + for gr in slot.grs: + buf.write(f" GR {gr.tensor} (MT {fmt_mt(gr.mtIteration)}, " + f"subIterK {gr.tiles.fmt_k()}) " + f"ids {gr.tiles.fmt_tiles()}\n") + return buf.getvalue() + + def print_deps(self) -> str: + """Print annotate_deps output: placements with their before-dependencies.""" + buf = io.StringIO() + buf.write("MAINLOOP:\n") + for pi, slots in enumerate(self._partitions): + buf.write(f" Partition {pi}:\n") + for slot in slots: + buf.write(f" subIterK={slot.subIterK}:\n") + if slot.mfma: + self._print_placement_with_deps(buf, slot.mfma, slot) + for lr in slot.lrs: + self._print_placement_with_deps(buf, lr, slot) + for gr in slot.grs: + self._print_placement_with_deps(buf, gr, slot) + return buf.getvalue() + + def _print_placement_with_deps(self, buf, placement, slot: SubIterKSlot): + """Print a placement label followed by its deps.""" + buf.write(f" {placement}\n") + if placement.deps: + buf.write(" deps:\n") + for dep in placement.deps: + dep_str = self._format_dep_ref(dep) + buf.write(f" - {dep_str}\n") + + def print_remove_deps(self) -> str: + """Print remove_cross_deps output: placements with preOps and remaining deps.""" + buf = io.StringIO() + buf.write("MAINLOOP:\n") + for pi, slots in enumerate(self._partitions): + buf.write(f" Partition {pi}:\n") + for slot in slots: + buf.write(f" subIterK={slot.subIterK}:\n") + if slot.mfma: + self._print_placement_with_preops(buf, slot.mfma, slot) + 
for lr in slot.lrs: + self._print_placement_with_preops(buf, lr, slot) + for gr in slot.grs: + self._print_placement_with_preops(buf, gr, slot) + return buf.getvalue() + + def print_group_lr_gr(self) -> str: + """Print group_lr_gr output: placements with chained deps and merged preOps.""" + buf = io.StringIO() + buf.write("MAINLOOP:\n") + for pi, slots in enumerate(self._partitions): + buf.write(f" Partition {pi}:\n") + for slot in slots: + buf.write(f" subIterK={slot.subIterK}:\n") + if slot.mfma: + self._print_placement_with_preops(buf, slot.mfma, slot) + for lr in slot.lrs: + self._print_placement_with_preops(buf, lr, slot) + for gr in slot.grs: + self._print_placement_with_preops(buf, gr, slot) + return buf.getvalue() + + def _print_placement_with_preops(self, buf, placement, slot: SubIterKSlot): + """Print a placement label followed by its preOps and remaining deps.""" + buf.write(f" {placement}\n") + if placement.preOps: + buf.write(" preOps:\n") + for op in placement.preOps: + buf.write(f" - {op}\n") + if placement.deps: + buf.write(" deps:\n") + for dep in placement.deps: + dep_str = self._format_dep_ref(dep) + buf.write(f" - {dep_str}\n") + + + def _format_dep_ref(self, dep: Dep) -> str: + """Format a Dep for display.""" + p = dep.ref + slot = p.subIterK_slot if hasattr(p, 'subIterK_slot') else '?' 
+ part = p.partition if hasattr(p, 'partition') else 0 + kind = 'LR' if isinstance(p, LRPlacement) else 'GR' + mt = f" (MT{dep.mt_offset})" if dep.mt_offset != 0 else "" + return f"{kind} {p.tensor} @P{part}:subIterK={slot}{mt}" + + + def print_emit(self, all_partitions: List[List[List[EmittedModule]]] = None) -> str: + """Print emit output: EmittedModule list with before-links.""" + if all_partitions is None: + all_partitions = self._emitted + buf = io.StringIO() + buf.write("MAINLOOP:\n") + for pi, partition_emitted in enumerate(all_partitions): + buf.write(f" Partition {pi}:\n") + for k, emitted in enumerate(partition_emitted): + buf.write(f" subIterK={k}:\n") + for em in emitted: + before_str = f" <- [{em.before}]" if em.before is not None else "" + buf.write(f" [{em.moduleId:2d}] {em.opType:10s} {em.source}{before_str}\n") + return buf.getvalue() + + def print_emit_dep_order(self, all_partitions: List[List[List[EmittedModule]]] = None) -> str: + """Print emit output as dependency paths (same decomposition as _extractPathsFromBeforeDeps).""" + from Tensile.Components.SubtileBasedInstructionScheduler import extractPathsFromBeforeDeps + if all_partitions is None: + all_partitions = self._emitted + buf = io.StringIO() + buf.write("MAINLOOP (dependency paths):\n") + for pi, partition_emitted in enumerate(all_partitions): + buf.write(f" Partition {pi}:\n") + for k, emitted in enumerate(partition_emitted): + buf.write(f" subIterK={k}:\n") + mfmaIdx, paths, preMfmaPaths = extractPathsFromBeforeDeps(emitted) + em = emitted[mfmaIdx] + buf.write(f" MFMA: [{em.moduleId:2d}] {em.source}") + if em.before is not None: + buf.write(f" <- [{em.before}]") + buf.write("\n") + for i, path in enumerate(preMfmaPaths): + buf.write(f" preMFMA path {i}:\n") + for idx in path: + buf.write(f" [{emitted[idx].moduleId:2d}] {emitted[idx].opType:10s} {emitted[idx].source}\n") + for i, path in enumerate(paths): + buf.write(f" path {i}:\n") + for idx in path: + buf.write(f" 
[{emitted[idx].moduleId:2d}] {emitted[idx].opType:10s} {emitted[idx].source}\n") + return buf.getvalue() diff --git a/projects/hipblaslt/tensilelite/Tensile/Components/WorkGroupMappingAlgos.py b/projects/hipblaslt/tensilelite/Tensile/Components/WorkGroupMappingAlgos.py index c0f28b856c5..4d4056f80fc 100644 --- a/projects/hipblaslt/tensilelite/Tensile/Components/WorkGroupMappingAlgos.py +++ b/projects/hipblaslt/tensilelite/Tensile/Components/WorkGroupMappingAlgos.py @@ -109,7 +109,7 @@ def wgmXCC(writer, kernel, tmpSgprNumWorkGroups): Use chiplet_transform_chunk, skip classic wgmxcc remapping """ SgprIndex = "WorkGroup0" - SgprChunkSize = writer.sgprPool.checkOut(1) + SgprChunkSize = writer.sgprPool.checkOut(1, preventOverflow=False) module.add(SLShiftRightB32(dst=sgpr(SgprChunkSize), shiftHex=hex(22), src=sgpr(sgprWGM), comment="Get WGMCHUNK")) module.add(SAndB32(dst=sgpr(SgprChunkSize), src0=sgpr(SgprChunkSize), src1=hex(1023), comment="Get WGMCHUNK")) module.addComment0("remap WGs if WGMCHUNK > 1") diff --git a/projects/hipblaslt/tensilelite/Tensile/KernelWriter.py b/projects/hipblaslt/tensilelite/Tensile/KernelWriter.py index 11831e52fd5..84fe748190e 100644 --- a/projects/hipblaslt/tensilelite/Tensile/KernelWriter.py +++ b/projects/hipblaslt/tensilelite/Tensile/KernelWriter.py @@ -24,7 +24,7 @@ from rocisa import rocIsa, countInstruction, countGlobalRead, \ countLocalRead, countLocalWrite, countWeightedLocalRead, countWeightedLocalWrite, getMFMAs -from rocisa.code import Module, TextBlock, StructuredModule, KernelBody +from rocisa.code import Module, TextBlock, StructuredModule, KernelBody, RegSet from rocisa.container import RegisterContainer, replaceHolder, HWRegContainer, VCC, MemTokenData from rocisa.label import LabelManager from rocisa.asmpass import rocIsaPass, rocIsaPassOption @@ -35,9 +35,9 @@ DSLoadU8, DSStore2B32, DSStore2B64, DSStoreB128, DSStoreB16, DSStoreB96, DSStoreB256, \ DSStoreB32, DSStoreB64, DSStoreB8, DSStoreInstruction, FlatLoadB128, 
FlatLoadB192, FlatLoadB32, \ FlatLoadB64, FlatStoreB128, FlatStoreB32, FlatStoreB64, Instruction, MacroInstruction, \ - MXMFMAInstruction, MFMAInstruction, SBarrier, SBranch, SCBranchSCC0, SCBranchSCC1, SCBranchVCCNZ, SCmpEQU32, SCmpLeU32, \ - SMFMAInstruction, SNop, SSetPrior, SSetRegIMM32B32, SSubU32, SWaitCnt, SWaitAlu, \ - SLongBranchPositive, VFmaMixF32, VMadMixF32, VMovB32, VAndB32, VCmpEQU32, VCndMaskB32, VMovB64, VNop + MFMAInstruction, MXMFMAInstruction, SBarrier, SBranch, SCBranchSCC0, SCBranchSCC1, SCBranchVCCNZ, SCmpEQU32, SCmpLeU32, \ + SMFMAInstruction, SNop, SEndpgm, SSetPrior, SSetRegIMM32B32, SSubU32, SWaitCnt, SWaitAlu, \ + SLongBranchPositive, VFmaMixF32, VMadMixF32, VMovB32, VAndB32, VCmpEQU32, VCndMaskB32, VMovB64, VNop, Instruction from rocisa.register import RegisterPool from rocisa.enum import RegisterType, DataTypeEnum @@ -45,6 +45,7 @@ from .Component import Component, LraTileProperties from .Components.Signature import UserArgumentsInfo from .Components.CustomSchedule import customMainLoopSchedule +from .Components.SubtileBasedKernel import * from .SolutionStructs import Solution, isPackedIndex from .SolutionStructs.Utilities import getMiInputType from .AsmMemoryInstruction import MemoryInstruction @@ -65,6 +66,10 @@ from dataclasses import dataclass, field from typing import Dict, List, NamedTuple, Optional,Tuple, Type from math import ceil, prod +import itertools + +# TODO: DEBUG ONLY, remove later +from pprint import pprint # Make const values immutable @dataclass(frozen=True) @@ -86,6 +91,9 @@ class MatrixInfo: startVgprValuPackTemp: int = -1 numSgprStrides: int = -1 + tileInfo: TileInfo = field(init=False) + + @dataclass class ABMatrixInfo(MatrixInfo): @@ -238,6 +246,8 @@ class StateValues: useAtomicAdd: bool = False serializedStore: bool = False + scheduleInfo: ScheduleInfo = field(init=False) + a: ABMatrixInfo = field(default_factory=ABMatrixInfo) b: ABMatrixInfo = field(default_factory=ABMatrixInfo) mxsa: ABMatrixInfo = 
field(default_factory=ABMatrixInfo) @@ -262,7 +272,7 @@ class StateValues: startVgprSerial: int = -1 startVgprSKConsts: int = -1 numVgprSKConsts: int = 0 - startVgprIdentityMatrix: int = -1 + startVgprIdentityMatrix: int = -1 numSgprSizesSum: int = 0 numSgprSizesFree: int = 0 @@ -313,6 +323,12 @@ class StateValues: savedLocalReadDoCntMXSB: int = 0 savedLocalReadDoCntMetadata: int = 0 + ldsStartOffsetA: int = -1 + ldsStartOffsetB: int = -1 + ldsStartOffsetMXSA: int = -1 + ldsStartOffsetMXSB: int = -1 + ldsTotalSize: int = 0 + dtvKIntervalA: int = 1 dtvKIntervalB: int = 1 ## MFMA @@ -2489,7 +2505,7 @@ def setupNewTile(self, kernel, tensorParametersA, tensorParametersB, isOptNLL=Fa module.addComment2("Begin setupNewTile") # work-group assignments - module.addComment1("global read addresses: work-group") + module.addComment1("global read addresses: work-group") # is this comment needed? if not forceNoTileCode: module.add(self.graWorkGroup(kernel, tensorParametersA, tensorParametersB)) @@ -2504,369 +2520,371 @@ def setupNewTile(self, kernel, tensorParametersA, tensorParametersB, isOptNLL=Fa tPMRef = tensorParametersB if kernel["StreamK"] != 0: - module.add(self.localReadAddresses(kernel, tensorParametersA, tensorParametersB, tPM)) - module.add(self.localWriteAddresses(kernel, tensorParametersA, tensorParametersB, tPM)) + if not kernel["UseSubtileImpl"]: + module.add(self.localReadAddresses(kernel, tensorParametersA, tensorParametersB, tPM)) + module.add(self.localWriteAddresses(kernel, tensorParametersA, tensorParametersB, tPM)) tdmA: bool = kernel["enableTDMA"] tdmB: bool = kernel["enableTDMB"] tdmInited: bool = False - #TODO: TDM wave separated - if tdmA and tdmB and prod(kernel["MIWaveGroup"]) > 1: - module.add(self.initTDMDescriptorWaveSeparated(kernel, tensorParametersA, tensorParametersB)) - if kernel["ProblemType"]["MXBlockA"] and kernel["ProblemType"]["MXBlockB"]: - module.add(self.initTDMDescriptorWaveSeparated(kernel, tensorParametersA["MX"], 
tensorParametersB["MX"])) - module.add(self.tdmGlobalOffsetWaveSeparated(kernel, tensorParametersA, tensorParametersB)) - if kernel["ProblemType"]["MXBlockA"] and kernel["ProblemType"]["MXBlockB"]: - module.add(self.tdmGlobalOffsetWaveSeparated(kernel, tensorParametersA["MX"], tensorParametersB["MX"])) - tdmInited = True - - # Tile offset assignment A(MXSA) - #TODO: TDM handles MXSA and MXSB - if tdmA: - if not tdmInited: - module.add(self.tdmGlobalOffset(kernel, tensorParametersA)) - module.add(self.initTDMDescriptor(kernel, tensorParametersA)) - else: + # TODO: This can probably be moved later, after setupnewtile + if not tdmA: module.add(self.removeGRSrdVariableSgprsFromPool(kernel)) - module.addComment1("global read addresses: tile offset assignment a") - module.add(self.graTileAssignment(kernel, tensorParametersA)) - if kernel["ProblemType"]["MXBlockA"]: - if not tdmA: - module.addComment1("global read addresses: tile offset assignment mxsa") - module.add(self.graTileAssignment(kernel, tensorParametersA["MX"])) - # Tile offset assignment Metadata - if kernel["ProblemType"]["Sparse"]: - module.addComment1("global read addresses: tile offset assignment metadata") - if kernel["DirectToVgprSparseMetadata"]: - # calculate tile assignment and store into each vgprGlobalReadOffsetMetadata - module.add(self.graMetadataTileAssignment(kernel, tPMRef)) + + # tile assignments + if not kernel["UseSubtileImpl"]: + #TODO: TDM wave separated + if tdmA and tdmB and prod(kernel["MIWaveGroup"]) > 1: + module.add(self.initTDMDescriptorWaveSeparated(kernel, tensorParametersA, tensorParametersB)) + if kernel["ProblemType"]["MXBlockA"] and kernel["ProblemType"]["MXBlockB"]: + module.add(self.initTDMDescriptorWaveSeparated(kernel, tensorParametersA["MX"], tensorParametersB["MX"])) + module.add(self.tdmGlobalOffsetWaveSeparated(kernel, tensorParametersA, tensorParametersB)) + if kernel["ProblemType"]["MXBlockA"] and kernel["ProblemType"]["MXBlockB"]: + 
module.add(self.tdmGlobalOffsetWaveSeparated(kernel, tensorParametersA["MX"], tensorParametersB["MX"])) + tdmInited = True + + # Tile offset assignment A(MXSA) + #TODO: TDM handles MXSA and MXSB + if tdmA: + if not tdmInited: + module.add(self.tdmGlobalOffset(kernel, tensorParametersA)) + module.add(self.initTDMDescriptor(kernel, tensorParametersA)) else: - module.add(self.graTileAssignment(kernel, tPM)) - # Tile offset assignment B(MXSB) - if kernel["ProblemType"]["MXBlockB"]: - if not tdmB: - module.addComment1("global read addresses: tile offset assignment mxsb") - module.add(self.graTileAssignment(kernel, tensorParametersB["MX"])) - if tdmB: - if not tdmInited: - module.add(self.tdmGlobalOffset(kernel, tensorParametersB)) - module.add(self.initTDMDescriptor(kernel, tensorParametersB)) - else: - module.addComment1("global read addresses: tile offset assignment b") - module.add(self.graTileAssignment(kernel, tensorParametersB)) + module.addComment1("global read addresses: tile offset assignment a") + module.add(self.graTileAssignment(kernel, tensorParametersA)) + if kernel["ProblemType"]["MXBlockA"]: + if not tdmA: + module.addComment1("global read addresses: tile offset assignment mxsa") + module.add(self.graTileAssignment(kernel, tensorParametersA["MX"])) + # Tile offset assignment Metadata + if kernel["ProblemType"]["Sparse"]: + module.addComment1("global read addresses: tile offset assignment metadata") + if kernel["DirectToVgprSparseMetadata"]: + # calculate tile assignment and store into each vgprGlobalReadOffsetMetadata + module.add(self.graMetadataTileAssignment(kernel, tPMRef)) + else: + module.add(self.graTileAssignment(kernel, tPM)) + # Tile offset assignment B(MXSB) + if kernel["ProblemType"]["MXBlockB"]: + if not tdmB: + module.addComment1("global read addresses: tile offset assignment mxsb") + module.add(self.graTileAssignment(kernel, tensorParametersB["MX"])) + if tdmB: + if not tdmInited: + module.add(self.tdmGlobalOffset(kernel, 
tensorParametersB)) + module.add(self.initTDMDescriptor(kernel, tensorParametersB)) + else: + module.addComment1("global read addresses: tile offset assignment b") + module.add(self.graTileAssignment(kernel, tensorParametersB)) - # Unroll assignment A(MXSA) - if not tdmA: - module.addComment1("global read addresses: unroll assignment a") - module.add(self.graUnrollAssignment(kernel, tensorParametersA)) - if kernel["ProblemType"]["MXBlockA"]: + # Unroll assignment A(MXSA) if not tdmA: - module.addComment1("global read addresses: unroll assignment mxsa") - module.add(self.graUnrollAssignment(kernel, tensorParametersA["MX"])) - # Unroll assignment Metadata - if kernel["ProblemType"]["Sparse"] and not kernel["DirectToVgprSparseMetadata"]: - module.addComment1("global read addresses: unroll assignment metadata") - module.add(self.graUnrollAssignment(kernel, tPM)) - # Unroll assignment B(MXSB) - if kernel["ProblemType"]["MXBlockB"]: + module.addComment1("global read addresses: unroll assignment a") + module.add(self.graUnrollAssignment(kernel, tensorParametersA)) + if kernel["ProblemType"]["MXBlockA"]: + if not tdmA: + module.addComment1("global read addresses: unroll assignment mxsa") + module.add(self.graUnrollAssignment(kernel, tensorParametersA["MX"])) + # Unroll assignment Metadata + if kernel["ProblemType"]["Sparse"] and not kernel["DirectToVgprSparseMetadata"]: + module.addComment1("global read addresses: unroll assignment metadata") + module.add(self.graUnrollAssignment(kernel, tPM)) + # Unroll assignment B(MXSB) + if kernel["ProblemType"]["MXBlockB"]: + if not tdmB: + module.addComment1("global read addresses: unroll assignment mxsb") + module.add(self.graUnrollAssignment(kernel, tensorParametersB["MX"])) if not tdmB: - module.addComment1("global read addresses: unroll assignment mxsb") - module.add(self.graUnrollAssignment(kernel, tensorParametersB["MX"])) - if not tdmB: - module.addComment1("global read addresses: unroll assignment b") - 
module.add(self.graUnrollAssignment(kernel, tensorParametersB)) - - # other free indices - if not (tdmA or tdmB): - if kernel["ProblemType"]["NumIndicesC"] > 2: - module.addComment1("global read addresses: other free assignments") - module.add(self.graOtherFreeAssignments()) - - # other summation indices - if self.states.otherSummations: - module.addComment1("global read addresses: other summation assignments") - module.add(self.graOtherSummationAssignments(kernel)) - - # Tile offsets A(MXSA) - if not tdmA: - module.addComment1("global read addresses: tile offsets a") - module.add(self.graTileOffsets(kernel, tensorParametersA)) - if kernel["ProblemType"]["MXBlockA"]: - module.addComment1("global read addresses: tile offsets mxsa") + module.addComment1("global read addresses: unroll assignment b") + module.add(self.graUnrollAssignment(kernel, tensorParametersB)) + + # other free indices + if not (tdmA or tdmB): + if kernel["ProblemType"]["NumIndicesC"] > 2: + module.addComment1("global read addresses: other free assignments") + module.add(self.graOtherFreeAssignments()) + + # other summation indices + if self.states.otherSummations: + module.addComment1("global read addresses: other summation assignments") + module.add(self.graOtherSummationAssignments(kernel)) + + # Tile offsets A(MXSA) if not tdmA: - module.add(self.graTileOffsets(kernel, tensorParametersA["MX"])) - # Tile offsets Metadata - if kernel["ProblemType"]["Sparse"] and not kernel["DirectToVgprSparseMetadata"]: - module.addComment1("global read addresses: tile offsets metadata") - # Using A or B's margin to instead Metadata's margin - module.add(self.graTileOffsets(kernel, tPM, tPMRef["glvw"] if tPMRef["rtv"] else 1)) - # Tile offsets B(MXSB) - if kernel["ProblemType"]["MXBlockB"]: - module.addComment1("global read addresses: tile offsets mxsb") + module.addComment1("global read addresses: tile offsets a") + module.add(self.graTileOffsets(kernel, tensorParametersA)) + if 
kernel["ProblemType"]["MXBlockA"]: + module.addComment1("global read addresses: tile offsets mxsa") + if not tdmA: + module.add(self.graTileOffsets(kernel, tensorParametersA["MX"])) + # Tile offsets Metadata + if kernel["ProblemType"]["Sparse"] and not kernel["DirectToVgprSparseMetadata"]: + module.addComment1("global read addresses: tile offsets metadata") + # Using A or B's margin to instead Metadata's margin + module.add(self.graTileOffsets(kernel, tPM, tPMRef["glvw"] if tPMRef["rtv"] else 1)) + # Tile offsets B(MXSB) + if kernel["ProblemType"]["MXBlockB"]: + module.addComment1("global read addresses: tile offsets mxsb") + if not tdmB: + module.add(self.graTileOffsets(kernel, tensorParametersB["MX"])) if not tdmB: - module.add(self.graTileOffsets(kernel, tensorParametersB["MX"])) - if not tdmB: - module.addComment1("global read addresses: tile offsets b") - module.add(self.graTileOffsets(kernel, tensorParametersB)) + module.addComment1("global read addresses: tile offsets b") + module.add(self.graTileOffsets(kernel, tensorParametersB)) - # Unroll offsets A(MXSA) - if not tdmA: - module.addComment1("global read addresses: unroll offsets a") - module.add(self.graUnrollOffsets(kernel, tensorParametersA)) - if kernel["ProblemType"]["MXBlockA"]: - module.addComment1("global read addresses: unroll offsets mxsa") + # Unroll offsets A(MXSA) if not tdmA: - module.add(self.graUnrollOffsets(kernel, tensorParametersA["MX"])) - # Unroll offsets Metadata - if kernel["ProblemType"]["Sparse"] and not kernel["DirectToVgprSparseMetadata"]: - module.addComment1("global read addresses: unroll offsets metadata") - module.add(self.graUnrollOffsets(kernel, tPM)) - # Unroll offsets B(MXSB) - if kernel["ProblemType"]["MXBlockB"]: - module.addComment1("global read addresses: unroll offsets mxsb") + module.addComment1("global read addresses: unroll offsets a") + module.add(self.graUnrollOffsets(kernel, tensorParametersA)) + if kernel["ProblemType"]["MXBlockA"]: + 
module.addComment1("global read addresses: unroll offsets mxsa") + if not tdmA: + module.add(self.graUnrollOffsets(kernel, tensorParametersA["MX"])) + # Unroll offsets Metadata + if kernel["ProblemType"]["Sparse"] and not kernel["DirectToVgprSparseMetadata"]: + module.addComment1("global read addresses: unroll offsets metadata") + module.add(self.graUnrollOffsets(kernel, tPM)) + # Unroll offsets B(MXSB) + if kernel["ProblemType"]["MXBlockB"]: + module.addComment1("global read addresses: unroll offsets mxsb") + if not tdmB: + module.add(self.graUnrollOffsets(kernel, tensorParametersB["MX"])) + if not tdmB: - module.add(self.graUnrollOffsets(kernel, tensorParametersB["MX"])) - - if not tdmB: - module.addComment1("global read addresses: unroll offsets b") - module.add(self.graUnrollOffsets(kernel, tensorParametersB)) - - # tile edges - if kernel["EdgeType"] == "ShiftPtr" and not tdmA and not tdmB: - if self.states.useBias == DataDirection.WRITE and (kernel["ProblemType"]["BiasSrc"] == "A" or kernel["ProblemType"]["BiasSrc"] == "B"): - # Not supported - assert not forceNoTileCode - # Shift here has two purposes: - # 1. Ensure the loads are in-bounds to prevent fault. - # BufferLoad uses the buffer limit hardware and does not require bounds checking for this case - # 2. Shift-left a wide vector load to ensure it is completely in-bounds. - # If this occurs we need to 'unshift' the C values (see shiftVectorComponents) - # BufferLoad does support this shifting, but if GuaranteeNoPartial=1 then - # it can be guaranteed that no shifting is required. 
- if not (kernel["BufferLoad"] and kernel["GuaranteeNoPartialA"]) and not forceNoTileCode and not kernel["UseGeneralizedNLCOneA"] \ - and not tensorParametersA["isSwizzled"]: - module.addComment1("global read addresses: shift a") - module.add(self.graShift(kernel, tensorParametersA)) - if tensorParametersA["is_sparse"] and kernel["DirectToVgprSparseMetadata"]: - module.addComment1("global read addresses: shift metadata") - module.add(self.graMetadataShift(kernel, tensorParametersA)) - if kernel["ProblemType"]["MXBlockA"]: - module.addComment1("global read addresses: shift mxsa") - module.add(self.graShiftMX(kernel, tensorParametersA["MX"], tensorParametersA)) - - if not (kernel["BufferLoad"] and kernel["GuaranteeNoPartialMetadata"]) and not forceNoTileCode \ - and kernel["ProblemType"]["Sparse"] and not kernel["DirectToVgprSparseMetadata"]: - module.addComment1("global read addresses: shift metadata") - # Using A's margin to instead Metadata's margin - module.add(self.graShift(kernel, tPM, tPMRef["glvw"] if tPMRef["rtv"] else 1)) - - if not (kernel["BufferLoad"] and kernel["GuaranteeNoPartialB"]) and not forceNoTileCode and not kernel["UseGeneralizedNLCOneB"] \ - and not tensorParametersB["isSwizzled"]: - module.addComment1("global read addresses: shift b") - module.add(self.graShift(kernel, tensorParametersB)) - if tensorParametersB["is_sparse"] and kernel["DirectToVgprSparseMetadata"]: + module.addComment1("global read addresses: unroll offsets b") + module.add(self.graUnrollOffsets(kernel, tensorParametersB)) + + # tile edges + if kernel["EdgeType"] == "ShiftPtr" and not tdmA and not tdmB: + if self.states.useBias == DataDirection.WRITE and (kernel["ProblemType"]["BiasSrc"] == "A" or kernel["ProblemType"]["BiasSrc"] == "B"): + # Not supported + assert not forceNoTileCode + # Shift here has two purposes: + # 1. Ensure the loads are in-bounds to prevent fault. + # BufferLoad uses the buffer limit hardware and does not require bounds checking for this case + # 2. 
Shift-left a wide vector load to ensure it is completely in-bounds. + # If this occurs we need to 'unshift' the C values (see shiftVectorComponents) + # BufferLoad does support this shifting, but if GuaranteeNoPartial=1 then + # it can be guaranteed that no shifting is required. + if not (kernel["BufferLoad"] and kernel["GuaranteeNoPartialA"]) and not forceNoTileCode and not kernel["UseGeneralizedNLCOneA"] \ + and not tensorParametersA["isSwizzled"]: + module.addComment1("global read addresses: shift a") + module.add(self.graShift(kernel, tensorParametersA)) + if tensorParametersA["is_sparse"] and kernel["DirectToVgprSparseMetadata"]: + module.addComment1("global read addresses: shift metadata") + module.add(self.graMetadataShift(kernel, tensorParametersA)) + if kernel["ProblemType"]["MXBlockA"]: + module.addComment1("global read addresses: shift mxsa") + module.add(self.graShiftMX(kernel, tensorParametersA["MX"], tensorParametersA)) + + if not (kernel["BufferLoad"] and kernel["GuaranteeNoPartialMetadata"]) and not forceNoTileCode \ + and kernel["ProblemType"]["Sparse"] and not kernel["DirectToVgprSparseMetadata"]: module.addComment1("global read addresses: shift metadata") - module.add(self.graMetadataShift(kernel, tensorParametersB)) - if kernel["ProblemType"]["MXBlockB"]: - module.addComment1("global read addresses: shift mxsb") - module.add(self.graShiftMX(kernel, tensorParametersB["MX"], tensorParametersB)) - - # addresses - def releaseTensorTmpGprs(tP): - self.vgprPool.checkIn(tP["gpr"]["lwoT"]) - tP["gpr"]["lwoT"] = None - self.vgprPool.checkIn(tP["gpr"]["uReg2"]) - tP["gpr"]["uReg2"] = None - - self.vgprPool.checkIn(tP["gpr"]["uReg"]) - tP["gpr"]["uReg"] = None - if "subIterReg" in tP["gpr"]: - if tP["gpr"]["subIterReg"] is not None: - self.vgprPool.checkIn(tP["gpr"]["subIterReg"]) - tP["gpr"]["subIterReg"] = None - - # addresses - if not forceNoTileCode: - # Addresses A(MXSA) + # Using A's margin to instead Metadata's margin + 
module.add(self.graShift(kernel, tPM, tPMRef["glvw"] if tPMRef["rtv"] else 1)) + + if not (kernel["BufferLoad"] and kernel["GuaranteeNoPartialB"]) and not forceNoTileCode and not kernel["UseGeneralizedNLCOneB"] \ + and not tensorParametersB["isSwizzled"]: + module.addComment1("global read addresses: shift b") + module.add(self.graShift(kernel, tensorParametersB)) + if tensorParametersB["is_sparse"] and kernel["DirectToVgprSparseMetadata"]: + module.addComment1("global read addresses: shift metadata") + module.add(self.graMetadataShift(kernel, tensorParametersB)) + if kernel["ProblemType"]["MXBlockB"]: + module.addComment1("global read addresses: shift mxsb") + module.add(self.graShiftMX(kernel, tensorParametersB["MX"], tensorParametersB)) + + # addresses + def releaseTensorTmpGprs(tP): + self.vgprPool.checkIn(tP["gpr"]["lwoT"]) + tP["gpr"]["lwoT"] = None + self.vgprPool.checkIn(tP["gpr"]["uReg2"]) + tP["gpr"]["uReg2"] = None + + self.vgprPool.checkIn(tP["gpr"]["uReg"]) + tP["gpr"]["uReg"] = None + if "subIterReg" in tP["gpr"]: + if tP["gpr"]["subIterReg"] is not None: + self.vgprPool.checkIn(tP["gpr"]["subIterReg"]) + tP["gpr"]["subIterReg"] = None + + # addresses + if not forceNoTileCode: + # Addresses A(MXSA) + if not tdmA: + module.addComment1("global read addresses: addresses a") + module.add(self.graAddresses(kernel, tensorParametersA)) + if not tdmA and kernel["ProblemType"]["MXBlockA"]: + module.addComment1("global read addresses: addresses mxsa") + module.add(self.graAddresses(kernel, tensorParametersA["MX"])) + # Addresses Metadata + if kernel["ProblemType"]["Sparse"] and not kernel["DirectToVgprSparseMetadata"]: + module.addComment1("global read addresses: addresses metadata") + module.add(self.graAddresses(kernel, tPM)) + # Addresses B(MXSB) + if not tdmB and kernel["ProblemType"]["MXBlockB"]: + module.addComment1("global read addresses: addresses mxsb") + module.add(self.graAddresses(kernel, tensorParametersB["MX"])) + if not tdmB: + 
module.addComment1("global read addresses: addresses b") + module.add(self.graAddresses(kernel, tensorParametersB)) + + # workgroup SGPRs no longer needed + if not tdmA: + module.add(self.removeGROffsetsVariableSgprsFromPool(kernel)) + + # Final offsets A(MXSA) if not tdmA: - module.addComment1("global read addresses: addresses a") - module.add(self.graAddresses(kernel, tensorParametersA)) + module.addComment1("global read addresses: final offsets a") + module.add(self.graFinalOffsets(kernel, tensorParametersA)) + # releaseTensorTmpGprs(tensorParametersA) if not tdmA and kernel["ProblemType"]["MXBlockA"]: - module.addComment1("global read addresses: addresses mxsa") - module.add(self.graAddresses(kernel, tensorParametersA["MX"])) - # Addresses Metadata - if kernel["ProblemType"]["Sparse"] and not kernel["DirectToVgprSparseMetadata"]: - module.addComment1("global read addresses: addresses metadata") - module.add(self.graAddresses(kernel, tPM)) - # Addresses B(MXSB) + module.addComment1("global read addresses: final offsets mxsa") + module.add(self.graFinalOffsets(kernel, tensorParametersA["MX"])) + if kernel["ProblemType"]["Sparse"]: + module.addComment1("global read addresses: final offsets metadata") + if kernel["DirectToVgprSparseMetadata"]: + module.add(self.graMetadataFinalOffsets(kernel, tPMRef)) + else: + module.add(self.graFinalOffsets(kernel, tPM)) + # Final offsets B(MXSB) if not tdmB and kernel["ProblemType"]["MXBlockB"]: - module.addComment1("global read addresses: addresses mxsb") - module.add(self.graAddresses(kernel, tensorParametersB["MX"])) + module.addComment1("global read addresses: final offsets mxsb") + module.add(self.graFinalOffsets(kernel, tensorParametersB["MX"])) if not tdmB: - module.addComment1("global read addresses: addresses b") - module.add(self.graAddresses(kernel, tensorParametersB)) - - # workgroup SGPRs no longer needed - if not tdmA: - module.add(self.removeGROffsetsVariableSgprsFromPool(kernel)) - - # Final offsets A(MXSA) - if 
not tdmA: - module.addComment1("global read addresses: final offsets a") - module.add(self.graFinalOffsets(kernel, tensorParametersA)) - # releaseTensorTmpGprs(tensorParametersA) - if not tdmA and kernel["ProblemType"]["MXBlockA"]: - module.addComment1("global read addresses: final offsets mxsa") - module.add(self.graFinalOffsets(kernel, tensorParametersA["MX"])) - if kernel["ProblemType"]["Sparse"]: - module.addComment1("global read addresses: final offsets metadata") - if kernel["DirectToVgprSparseMetadata"]: - module.add(self.graMetadataFinalOffsets(kernel, tPMRef)) - else: - module.add(self.graFinalOffsets(kernel, tPM)) - # Final offsets B(MXSB) - if not tdmB and kernel["ProblemType"]["MXBlockB"]: - module.addComment1("global read addresses: final offsets mxsb") - module.add(self.graFinalOffsets(kernel, tensorParametersB["MX"])) - if not tdmB: - module.addComment1("global read addresses: final offsets b") - module.add(self.graFinalOffsets(kernel, tensorParametersB)) - # releaseTensorTmpGprs(tensorParametersB) - - self.dontAppendCode = False - self.dontAppendCode = self.dontAppendCode or forceNoTileCode + module.addComment1("global read addresses: final offsets b") + module.add(self.graFinalOffsets(kernel, tensorParametersB)) + # releaseTensorTmpGprs(tensorParametersB) - # Add increment code - gsuComponent = Component.GSU.find(self) - module.add(gsuComponent.setupNewTile(self, kernel, tensorParametersA, tensorParametersB, tPM)) + self.dontAppendCode = False + self.dontAppendCode = self.dontAppendCode or forceNoTileCode - #TODO: TDM wave separated - if tdmA and tdmB and prod(kernel["MIWaveGroup"]) > 1: - module.add(self.tdmSetupIncrementWaveSeparated(kernel, tensorParametersA, tensorParametersB)) + # Add increment code + gsuComponent = Component.GSU.find(self) + module.add(gsuComponent.setupNewTile(self, kernel, tensorParametersA, tensorParametersB, tPM)) - if kernel["ProblemType"]["MXBlockA"] and kernel["ProblemType"]["MXBlockB"]: - 
module.add(self.tdmSetupIncrementWaveSeparated(kernel, tensorParametersA["MX"], tensorParametersB["MX"])) + #TODO: TDM wave separated + if tdmA and tdmB and prod(kernel["MIWaveGroup"]) > 1: + module.add(self.tdmSetupIncrementWaveSeparated(kernel, tensorParametersA, tensorParametersB)) - if kernel["StreamK"] > 0: - module.add(self.tdmApplyStreamKOffsetWaveSeparated(kernel, tensorParametersA, tensorParametersB)) if kernel["ProblemType"]["MXBlockA"] and kernel["ProblemType"]["MXBlockB"]: - module.add(self.tdmApplyStreamKOffsetWaveSeparated(kernel, tensorParametersA["MX"], tensorParametersB["MX"])) + module.add(self.tdmSetupIncrementWaveSeparated(kernel, tensorParametersA["MX"], tensorParametersB["MX"])) + if kernel["StreamK"] > 0: + module.add(self.tdmApplyStreamKOffsetWaveSeparated(kernel, tensorParametersA, tensorParametersB)) + if kernel["ProblemType"]["MXBlockA"] and kernel["ProblemType"]["MXBlockB"]: + module.add(self.tdmApplyStreamKOffsetWaveSeparated(kernel, tensorParametersA["MX"], tensorParametersB["MX"])) - self.dontAppendCode = self.dontAppendCode or forceNoTileCode + ########################################################################### + # summations loops: open + ########################################################################### - ########################################################################### - # summations loops: open - ########################################################################### + # declare loop num iter + if not forceNoTileCode: + module.addComment0("declare loop num iterations") - # declare loop num iter - if not forceNoTileCode: - module.addComment0("declare loop num iterations") - - # perform initC in the shadow of the prefetch - # Prefetch occurs at start of unroll loop - # If we have multiple summation indices (otherSummationLoops>0), - # we can't init in shadow of this prefetch - # since that would initC inside the other summation loops + # perform initC in the shadow of the prefetch + # Prefetch occurs 
at start of unroll loop + # If we have multiple summation indices (otherSummationLoops>0), + # we can't init in shadow of this prefetch + # since that would initC inside the other summation loops - if self.states.doShadowInit != 2: - module.add(self.initC(kernel)) - if kernel["ProblemType"]["Gradient"] and kernel["ProblemType"]["UseBias"] and (kernel["ProblemType"]["BiasSrc"] == "A" or kernel["ProblemType"]["BiasSrc"] == "B"): - module.add(self.initSumUnroll(kernel)) + if self.states.doShadowInit != 2: + module.add(self.initC(kernel)) + if kernel["ProblemType"]["Gradient"] and kernel["ProblemType"]["UseBias"] and (kernel["ProblemType"]["BiasSrc"] == "A" or kernel["ProblemType"]["BiasSrc"] == "B"): + module.add(self.initSumUnroll(kernel)) + + # open non-unrolled summation loops + if not forceNoTileCode: + for i in range(kernel["ProblemType"]["NumIndicesSummation"]-1): + module.addComment1("summation loop %u"%i) + module.add(self.calculateLoopNumIter(kernel, tensorParametersA, tensorParametersB, i)) + if self.states.actualSummationLoops>1: + module.add(self.openLoop(kernel, tensorParametersA, tensorParametersB, i)) + module.add(self.calculateLoopNumIter(kernel, tensorParametersA, tensorParametersB, self.states.unrollIdx)) + + if not forceNoTileCode and self.states.staggerUCode: + module.add(self.declareStaggerParms(kernel)) + # Calculate stagger A(MXSA) + if not tdmA: + module.add(self.calculateStagger(kernel, tensorParametersA)) + if kernel["ProblemType"]["MXBlockA"]: + if not tdmA: + module.add(self.calculateStagger(kernel, tensorParametersA["MX"])) + if kernel["ProblemType"]["MXBlockB"]: + if not tdmB: + module.add(self.calculateStagger(kernel, tensorParametersB["MX"])) + # Calculate stagger Metadata + if kernel["ProblemType"]["Sparse"] and not kernel["DirectToVgprSparseMetadata"]: + module.add(self.calculateStagger(kernel,tPM)) + # Calculate stagger B(MXSB) + if not tdmB: + module.add(self.calculateStagger(kernel, tensorParametersB)) - # open non-unrolled 
summation loops - if not forceNoTileCode: - for i in range(kernel["ProblemType"]["NumIndicesSummation"]-1): - module.addComment1("summation loop %u"%i) - module.add(self.calculateLoopNumIter(kernel, tensorParametersA, tensorParametersB, i)) - if self.states.actualSummationLoops>1: - module.add(self.openLoop(kernel, tensorParametersA, tensorParametersB, i)) - module.add(self.calculateLoopNumIter(kernel, tensorParametersA, tensorParametersB, self.states.unrollIdx)) - - if not forceNoTileCode and self.states.staggerUCode: - module.add(self.declareStaggerParms(kernel)) - # Calculate stagger A(MXSA) - if not tdmA: - module.add(self.calculateStagger(kernel, tensorParametersA)) + # LRO and LWA as assigned + # init lds read pointers before each unrolled loop + module.addComment0("local read addresses: init pointers a") + module.add(self.localReadInitPointers(kernel, tensorParametersA, tensorParametersA)) if kernel["ProblemType"]["MXBlockA"]: - if not tdmA: - module.add(self.calculateStagger(kernel, tensorParametersA["MX"])) + module.addComment0("local read addresses: init pointers mxsa") + module.add(self.localReadInitPointers(kernel, tensorParametersA, tensorParametersA["MX"])) if kernel["ProblemType"]["MXBlockB"]: - if not tdmB: - module.add(self.calculateStagger(kernel, tensorParametersB["MX"])) - # Calculate stagger Metadata + module.addComment0("local read addresses: init pointers mxsb") + module.add(self.localReadInitPointers(kernel, tensorParametersA, tensorParametersB["MX"])) if kernel["ProblemType"]["Sparse"] and not kernel["DirectToVgprSparseMetadata"]: - module.add(self.calculateStagger(kernel,tPM)) - # Calculate stagger B(MXSB) - if not tdmB: - module.add(self.calculateStagger(kernel, tensorParametersB)) + module.addComment0("local read addresses: init pointers metadata") + module.add(self.localReadInitPointers(kernel, tensorParametersA, tPM)) + module.addComment0("local read addresses: init pointers b") + module.add(self.localReadInitPointers(kernel, 
tensorParametersA, tensorParametersB)) + if self.states.IncLdsBufSwitch: + # IncLdsBufSwitch case, need to initialize local write inc register + module.addComment0("local write addresses: reset inc") + module.add(self.localWriteResetOffsets(kernel, False, tensorParametersA)) - # LRO and LWA as assigned - # init lds read pointers before each unrolled loop - module.addComment0("local read addresses: init pointers a") - module.add(self.localReadInitPointers(kernel, tensorParametersA, tensorParametersA)) - if kernel["ProblemType"]["MXBlockA"]: - module.addComment0("local read addresses: init pointers mxsa") - module.add(self.localReadInitPointers(kernel, tensorParametersA, tensorParametersA["MX"])) - if kernel["ProblemType"]["MXBlockB"]: - module.addComment0("local read addresses: init pointers mxsb") - module.add(self.localReadInitPointers(kernel, tensorParametersA, tensorParametersB["MX"])) - if kernel["ProblemType"]["Sparse"] and not kernel["DirectToVgprSparseMetadata"]: - module.addComment0("local read addresses: init pointers metadata") - module.add(self.localReadInitPointers(kernel, tensorParametersA, tPM)) - module.addComment0("local read addresses: init pointers b") - module.add(self.localReadInitPointers(kernel, tensorParametersA, tensorParametersB)) - if self.states.IncLdsBufSwitch: - # IncLdsBufSwitch case, need to initialize local write inc register - module.addComment0("local write addresses: reset inc") - module.add(self.localWriteResetOffsets(kernel, False, tensorParametersA)) - - if self.do["executeToInitEnd"]: - module.add(self.functionEnd(kernel, addLabel=False)) + if self.do["executeToInitEnd"]: + module.add(self.functionEnd(kernel, addLabel=False)) - #################################### - # prefetch: unrolled loop prefix - #################################### - if kernel["PrefetchGlobalRead"]: - # if DirectToVgpr is enabled and swapGlobalRead is true, swap the order of global read (B->A) - tensorParameters1st = tensorParametersA - 
tensorParameters2nd = tensorParametersB - tdm1st, tdm2nd = kernel["enableTDMA"], kernel["enableTDMB"] - if self.isSwapGlobalReadOrderForDtvOrDtl(kernel, prefetch1=True): - tensorParameters1st, tensorParameters2nd = tensorParameters2nd, tensorParameters1st - tdm1st, tdm2nd = tdm2nd, tdm1st - pfi = 1 if kernel["PrefetchGlobalRead"] < 3 else kernel["PrefetchGlobalRead"] - 1 - module.addComment1("prefetch: global -> local") - module.add(self.openSumAtLeastUnroll(kernel, prefetch=True, isOptNLL=isOptNLL)) - moduleTmp = self.directToLdsM0Update(kernel, 0, tensorParameters1st) - module.add(replaceHolder(moduleTmp, 0)) - module.add(self.globalReadDo(kernel, 0, tensorParameters1st)) - if "MX" in tensorParameters1st: - moduleTmp = self.directToLdsM0Update(kernel, 0, tensorParameters1st["MX"], skipWait=True) + #################################### + # prefetch: unrolled loop prefix + #################################### + if kernel["PrefetchGlobalRead"]: + # if DirectToVgpr is enabled and swapGlobalRead is true, swap the order of global read (B->A) + tensorParameters1st = tensorParametersA + tensorParameters2nd = tensorParametersB + tdm1st, tdm2nd = kernel["enableTDMA"], kernel["enableTDMB"] + if self.isSwapGlobalReadOrderForDtvOrDtl(kernel, prefetch1=True): + tensorParameters1st, tensorParameters2nd = tensorParameters2nd, tensorParameters1st + tdm1st, tdm2nd = tdm2nd, tdm1st + pfi = 1 if kernel["PrefetchGlobalRead"] < 3 else kernel["PrefetchGlobalRead"] - 1 + module.addComment1("prefetch: global -> local") + module.add(self.openSumAtLeastUnroll(kernel, prefetch=True, isOptNLL=isOptNLL)) + moduleTmp = self.directToLdsM0Update(kernel, 0, tensorParameters1st) module.add(replaceHolder(moduleTmp, 0)) - module.add(self.globalReadDo(kernel, 0, tensorParameters1st["MX"])) - if "MX" in tensorParameters2nd: - moduleTmp = self.directToLdsM0Update(kernel, 0, tensorParameters2nd["MX"], skipWait=True) + module.add(self.globalReadDo(kernel, 0, tensorParameters1st)) + if "MX" in 
tensorParameters1st: + moduleTmp = self.directToLdsM0Update(kernel, 0, tensorParameters1st["MX"], skipWait=True) + module.add(replaceHolder(moduleTmp, 0)) + module.add(self.globalReadDo(kernel, 0, tensorParameters1st["MX"])) + if "MX" in tensorParameters2nd: + moduleTmp = self.directToLdsM0Update(kernel, 0, tensorParameters2nd["MX"], skipWait=True) + module.add(replaceHolder(moduleTmp, 0)) + module.add(self.globalReadDo(kernel, 0, tensorParameters2nd["MX"])) + skip2ndWaitForDtl = kernel["DirectToLds%s"%tensorParameters1st["tensorChar"]] + moduleTmp = self.directToLdsM0Update(kernel, 0, tensorParameters2nd, skip2ndWaitForDtl) module.add(replaceHolder(moduleTmp, 0)) - module.add(self.globalReadDo(kernel, 0, tensorParameters2nd["MX"])) - skip2ndWaitForDtl = kernel["DirectToLds%s"%tensorParameters1st["tensorChar"]] - moduleTmp = self.directToLdsM0Update(kernel, 0, tensorParameters2nd, skip2ndWaitForDtl) - module.add(replaceHolder(moduleTmp, 0)) - module.add(self.globalReadDo(kernel, 0, tensorParameters2nd)) - tPA = tensorParametersA - tPB = tensorParametersB - if kernel["PrefetchGlobalRead"] == 2: - # PGR2 + DTV case, skip GR inc - if kernel["DirectToVgprA"]: - tPA = None - if kernel["DirectToVgprB"]: - tPB = None - module.add(self.globalReadIncrementAB(kernel, tPA, tPB, self.states.unrollIdx, pfi)) - # swap Tensor memToken - self.states.ldsTensorTokenIdx = \ - self.states.memTokenLdsBuffer1 if self.states.ldsTensorTokenIdx == self.states.memTokenLdsBuffer0 else self.states.memTokenLdsBuffer0 - + module.add(self.globalReadDo(kernel, 0, tensorParameters2nd)) + tPA = tensorParametersA + tPB = tensorParametersB + if kernel["PrefetchGlobalRead"] == 2: + # PGR2 + DTV case, skip GR inc + if kernel["DirectToVgprA"]: + tPA = None + if kernel["DirectToVgprB"]: + tPB = None + module.add(self.globalReadIncrementAB(kernel, tPA, tPB, self.states.unrollIdx, pfi)) + # swap Tensor memToken + self.states.ldsTensorTokenIdx = \ + self.states.memTokenLdsBuffer1 if 
self.states.ldsTensorTokenIdx == self.states.memTokenLdsBuffer0 else self.states.memTokenLdsBuffer0 module.addComment2("End setupNewTile") @@ -4176,6 +4194,335 @@ def createNegIdentityMatrix(self, kernel): self.vgprPool.checkIn(lane4) return module + + ############################################################################## + # Kernel Body - Subtiled version + ############################################################################## + def kernelBodySubtile(self, kernel, tensorParametersA, tensorParametersB): + #expand = kernel["ExpandPointerSwap"] + self.dontAppendCode = False + + tPM = tensorParametersA["tpsMetadata"] if tensorParametersA["is_sparse"] else tensorParametersB["tpsMetadata"] + + #################################### + # Begin String + moduleKernelBody = KernelBody("kernelBody") + + #################################### + # Function Signature + #################################### + fs = self.functionSignature() + moduleKernelBody.addSignature(fs) + + module = Module("body") + module.add(Label("ASM_Start", "Main body of the asm kernel")) + module.add(self.defineAndResources(kernel, tensorParametersA, tensorParametersB, tPM)) + + # Initialize stream-k loop + skComponent = Component.StreamK.find(self) + module.add(skComponent.preLoop(self, kernel)) + + # Should check for is swizzled instead of usesubtileimpl + # TODO: Move this calculation to host-side? 
+ if kernel["ProblemType"]["MXBlockA"] and kernel["ProblemType"]["MXBlockB"] and kernel["UseSubtileImpl"]: + module.addComment("Scale StridesMXSA by 32") + module.add(SLShiftLeftB32(sgpr("StridesMXSA"), 5, sgpr("StridesMXSA"))) + module.add(SLShiftLeftB32(sgpr("StridesMXSB"), 5, sgpr("StridesMXSB"))) + + # Open persistent loop + loopComponent = Component.PersistentLoop.find(self) + + module.add(loopComponent.openPersistentLoop(self, kernel)) + + module.addComment0("Number of subtiles for A: %u"%(len(self.states.a.tileInfo.localSubtiles))) + module.addComment0("Number of subtiles for B: %u"%(len(self.states.b.tileInfo.localSubtiles))) + + module.add(self.setupNewTile(kernel, tensorParametersA, tensorParametersB, isOptNLL=False)) + module.add(self.removeGROffsetsVariableSgprsFromPool(kernel)) + #self.removeSgprVarFromPool("SrdD") + #self.removeSgprVarFromPool("SrdC") + + atileInfo = self.states.a.tileInfo + btileInfo = self.states.b.tileInfo + # TODO: Need corresponding ctileInfo for GSU/StreamK + dtileInfo = self.states.d.tileInfo + mxsatileInfo = self.states.mxsa.tileInfo if kernel["ProblemType"].get("MXBlockA", 0) else None + mxsbtileInfo = self.states.mxsb.tileInfo if kernel["ProblemType"].get("MXBlockB", 0) else None + + ## + # TODOBS: need to add init c code, and also init sum unroll code. + # + + module.add(globalReadDTLInitCommonSgpr(self, kernel)) + + if mxsatileInfo != None and mxsbtileInfo != None: + module.add(globalReadScaleSwizzledDTLInitCommonSgpr(self, kernel)) + + # TODOBS: globalWriteWorkGroupInit can be emitted here or later on, check.. 
+ if self.states.doShadowInit: + #module.add(self.openShadowInit()) + # SrdD/SrdC are used starting now, remove from sgpr pool + self.removeSgprVarFromPool("SrdD") + self.removeSgprVarFromPool("SrdC") + self.removeSgprVarFromPool("SrdWS") + module.add(self.globalWriteWorkGroupInit(kernel)) + #if self.states.doShadowInit == 2: + # module.add(self.initC(kernel)) # initC while waiting for global reads + # if kernel["ProblemType"]["Gradient"] and kernel["ProblemType"]["UseBias"] and (kernel["ProblemType"]["BiasSrc"] == "A" or kernel["ProblemType"]["BiasSrc"] == "B"): + # module.add(self.initSumUnroll(kernel)) + #module.add(self.closeShadowInit(kernel)) + + + module.addComment1("global read addresses: addresses a") + module.add(self.graAddresses(kernel, tensorParametersA)) + if kernel["ProblemType"]["MXBlockA"]: + module.addComment1("global read addresses: addresses mxsa") + module.add(self.graAddresses(kernel, tensorParametersA["MX"])) + module.addComment1("global read addresses: addresses b") + module.add(self.graAddresses(kernel, tensorParametersB)) + if kernel["ProblemType"]["MXBlockB"]: + module.addComment1("global read addresses: addresses mxsb") + module.add(self.graAddresses(kernel, tensorParametersB["MX"])) + + + + # List of tiles that need to be read form + readtileInfoList = [atileInfo, btileInfo, mxsatileInfo, mxsbtileInfo] + + # Printout tile info + for tileInfo in [atileInfo, btileInfo, mxsatileInfo, mxsbtileInfo, dtileInfo]: + if tileInfo != None: + module.addComment0(str(tileInfo)) + + # Allocate registers for GR/LR + for tileInfo in [atileInfo, btileInfo, mxsatileInfo, mxsbtileInfo]: + if tileInfo != None: + tileInfo.allocOffsetRegisters(self, kernel) + module.addComment("Allocating v%s for %s GR"%(str(tileInfo.sharedVgprGROffset), tileInfo.tc)) + module.addComment("Allocating v%s for %s LR"%(str(tileInfo.sharedVgprLROffset), tileInfo.tc)) + module.addComment("Allocating v%s for %s LR Swap"%(str(tileInfo.sharedVgprLROffsetSwap), tileInfo.tc)) + + for 
tileInfo in [atileInfo, btileInfo, mxsatileInfo, mxsbtileInfo]: + if tileInfo != None: + for st in tileInfo.localSubtiles: + # Print out, only if register is allocated + if len(tileInfo.localSubtilesRegister): + linearId = tileInfo.localSubtiles.index(st) + sId0, sId1 = tileInfo.getLocalSubtileIdFromLinearId(linearId) + regstr = 's' if st.useSgpr else 'v' + module.addComment0("Using %s%s for %s GR, subtile: [%u, %u]"%(\ + tileInfo.tc, \ + regstr, str(tileInfo.localSubtilesRegister[st.regListId]), sId0, sId1)) + + + module.add(graTileAssignment(self, kernel)) + module.add(lraTileAssignment(self, kernel)) + + module.add(localReadDTLInitCommonSwapVgpr(self, kernel)) + + module.add(graTileAssignmentScaleSwizzled(self, kernel)) + module.add(lraTileAssignmentScaleSwizzled(self, kernel)) + + + module.add(self.calculateLoopNumIter(kernel, tensorParametersA, tensorParametersB, self.states.unrollIdx)) + + # Allocate registers for VGPR tiles + pgr = kernel["PrefetchGlobalRead"] + if pgr != 2: + # PGR=2: A/B vgprTiles are allocated by SubtileBasedLogicalScheduler in mainLoop + # TMP HACK to still use legacy path for PGR=0 + for tileInfo in [atileInfo, btileInfo]: + tileInfo.allocVgprTileRegisters(self, kernel) + + for tileInfo in [mxsatileInfo, mxsbtileInfo]: + if tileInfo: + tileInfo.allocVgprTileRegisters(self, kernel) + + for tileInfo in [dtileInfo]: + tileInfo.allocVgprTileRegisters(self, kernel) + + + + module.add(initVgprTilesToZero(self, kernel, dtileInfo)) + + if pgr != 2: + self.states.scheduleInfo = ScheduleInfo(atileInfo, btileInfo) + for tileInfo in [atileInfo, btileInfo, mxsatileInfo, mxsbtileInfo]: + if tileInfo: + for vtiles in tileInfo.vgprTiles: + regStr = "Vgpr" if vtiles.regList.regPool == self.vgprPool else "Agpr" # shouldn't this only be vgpr pool? 
+ module.addComment("%ss used for %s mma tile %u: %s"%(regStr, tileInfo.tc, tileInfo.vgprTiles.index(vtiles), str(vtiles))) + + for tileInfo in [dtileInfo]: + if tileInfo: + for vtiles in tileInfo.vgprTiles: + regStr = "Vgpr" if vtiles.regList.regPool == self.vgprPool else "Agpr" # shouldn't this only be vgpr pool? + module.addComment("%ss used for %s mma tile %u: %s"%(regStr, tileInfo.tc, tileInfo.vgprTiles.index(vtiles), str(vtiles))) + + + vtmp = self.vgprPool.checkOut(1) + module.addComment("Checking out %u"%vtmp) + self.vgprPool.checkIn(vtmp) + + if self.do["executeToPrefetchEnd"]: + module.add(self.functionEnd(kernel, addLabel=False)) + + #module.add(preLoop(self, kernel)) + module.add(mainLoop(self, kernel)) + + # Deallocate registers used for GR/LR offsets + for tileInfo in [atileInfo, btileInfo, mxsatileInfo, mxsbtileInfo]: + if tileInfo != None: + tileInfo.deallocOffsetRegisters(self, kernel) + + # For subtile kernels, free SGPRs that were only needed during the main loop + if kernel["UseSubtileImpl"]: + module.add(self.undefineSubtileMainLoopSgprs(kernel)) + # Remove SrdWS from the free pool before defineSgprIdx calls below, + # otherwise checkOutAligned can grab registers overlapping SrdWS. + if not self.states.doShadowInit and kernel["StreamK"] and kernel["StreamKAtomic"] == 0: + self.removeSgprVarFromPool("SrdWS") + # Immediately allocate permanent SGPRs for subtile M/N guards. + # Must be done here (before endSummation) so they get indices from + # the freshly-freed swap/LocalWriteBaseAddr range. 
+ self.states.subtileM32ValidBlocksSgpr = self.defineSgprIdx("SubtileMGuard", 1) + self.states.subtileN16ValidBlocksSgpr = self.defineSgprIdx("SubtileNGuard", 1) + module.add(RegSet("s", "sgprSubtileMGuard", self.states.subtileM32ValidBlocksSgpr)) + module.add(RegSet("s", "sgprSubtileNGuard", self.states.subtileN16ValidBlocksSgpr)) + self.states.nonPostLoopSgpr.append("SubtileMGuard") + self.states.nonPostLoopSgpr.append("SubtileNGuard") + + # Deallocate registers used for VGPR A/B/MXS tiles + if pgr != 2: + for tileInfo in [atileInfo, btileInfo,mxsatileInfo, mxsbtileInfo]: + if tileInfo: + tileInfo.deallocVgprTileRegisters(self, kernel) + + # Start of post-loop code + if 1: + module.addComment0(" =============================================================== ") + module.addComment0(" =================== Start of post-loop code =================== ") + module.addComment0(" =============================================================== ") + + self.states.c.startVgprValu = self.vgprPool.checkOutAligned(1, 4) + + module.addComment0("ValuC range: [%u-%u), %s"%(self.states.c.startVgprValu, self.states.c.startVgprValu+self.states.c.numVgprValu, \ + "serializedStore enabled" if self.states.serializedStore else "")) + module.add(RegSet("v", "vgprValuC", self.states.c.startVgprValu)) + self.states.serializedStore = True + + + # SrdWS must be removed from the free pool before endSummation so that + # defineSgpr() calls within endSummation don't see it as Available while + # defineMultiSgprIndex (checkOutMulti) may have grabbed those registers. 
+ if not self.states.doShadowInit: + self.removeSgprVarFromPool("SrdWS") + module.add(self.endSummation(kernel, tensorParametersA, tensorParametersB)) + if not self.states.doShadowInit: + self.removeSgprVarFromPool("SrdD") + self.removeSgprVarFromPool("SrdC") + module.add(self.globalWriteWorkGroupInit(kernel)) + + #################################### + # NOT LocalSplitU + #################################### + + + + # global write indices + module.addComment1("not-LocalSplitU: global write indices") + module.add(self.notLocalSplitUGlobalWriteIndices(kernel)) + + # global write + #module.addComment1("not-LocalSplitU: global write") + storeModule, deferredGSU0 = self.notLocalSplitUGlobalWrite(kernel, tensorParametersA, tensorParametersB) + module.add(storeModule) + + self.vgprPool.checkIn(self.states.c.startVgprValu) + + # Deallocate registers used for C/D tiles after store code instructions are emitted + dtileInfo.deallocVgprTileRegisters(self, kernel) + + hasDeferredGSU0 = deferredGSU0 and len(deferredGSU0.items()) > 0 + hasDeferredFixup = hasattr(self.states, 'deferredFixupModule') and self.states.deferredFixupModule is not None + hasDeferredEdge = hasattr(self.states, 'deferredEdgeModules') and self.states.deferredEdgeModules + hasDeferredPartials = hasattr(self.states, 'deferredPartialsModule') and self.states.deferredPartialsModule is not None + hasDeferredActivation = hasattr(self.states, 'deferredActivationModules') and self.states.deferredActivationModules is not None + hasAnyDeferred = hasDeferredGSU0 or hasDeferredFixup or hasDeferredEdge or hasDeferredPartials + + if hasAnyDeferred: + loopComponent = Component.PersistentLoop.find(self) + module.add(loopComponent.closePersistentLoop(self, kernel)) + # After persistent loop exits, skip over all deferred blocks + kernelEndLabel = Label("KernelEnd", "") + with self.allocTmpSgpr(3) as tmpSgprInfo: + module.add(SLongBranchPositive(kernelEndLabel, tmpSgprInfo, comment="persistent loop done, skip deferred 
blocks")) + module.addComment0("#" * 60) + module.addComment0("#" * 60) + module.addComment0("##") + module.addComment0("## DEFERRED BLOCKS START") + module.addComment0("## The following code blocks have been moved here from their") + module.addComment0("## original inline positions to keep the optimized NonEdge") + module.addComment0("## beta=0 store path close to the main loop.") + module.addComment0("## Each block is reached via unconditional branch from its") + module.addComment0("## original label stub and returns via branch-back.") + module.addComment0("##") + module.addComment0("#" * 60) + module.addComment0("#" * 60) + if hasDeferredFixup: + module.appendModule(self.states.deferredFixupModule) + self.states.deferredFixupModule = None + if hasDeferredEdge: + for edgeMod in self.states.deferredEdgeModules: + module.appendModule(edgeMod) + self.states.deferredEdgeModules = [] + if hasDeferredPartials: + module.appendModule(self.states.deferredPartialsModule) + self.states.deferredPartialsModule = None + if hasDeferredGSU0: + module.appendModule(deferredGSU0) + if hasDeferredActivation: + module.appendModule(self.states.deferredActivationModules) + self.states.deferredActivationModules = None + module.add(kernelEndLabel) + if kernel["ProblemType"]["OutputAmaxD"]: + module.add(self.insertAmaxD(kernel)) + module.add(SEndpgm(comment="Kernel End")) + else: + # If activation was deferred but no other deferred blocks exist, emit it before functionEnd + if hasDeferredActivation: + module.appendModule(self.states.deferredActivationModules) + self.states.deferredActivationModules = None + module.add(self.functionEnd(kernel, addLabel=True)) + + # Add a label at the end of the asm for indexing. 
+ module.add(Label("ASM_End", "The end of the kernel")) + + moduleKernelBody.addBody(module) + self.checkResources(kernel, moduleKernelBody) # check resource available or not + + + # TODO: Check what does this do and enable this if needed + # Tensile instruction pass, temporarily disable due to build time. + # Kernels with epilog especially with activation is too long (50000~ lines). + # Need to refactor global write elements. + #ripo = rocIsaPassOption() + #ripo.removeDupFunc = bool(kernel["ActivationFuncCall"]) + #ripo.numWaves = kernel["NumThreads"] // kernel["WavefrontSize"] + #if kernel["ProblemType"]["ActivationType"] == "all": + # ripo.removeDupAssign = False + #if self.states.archCaps["HasSchedMode"]: + # ripo.insertDelayAlu = True + #passResult = rocIsaPass(moduleKernelBody, ripo) + #kernel["MathClocksUnrolledLoop"] = passResult.cycles + + + error = self.states.overflowedResources + print2(f" found error code {error} with overflowed resources set to {self.states.overflowedResources}") + + return (error, str(moduleKernelBody)) + + ############################################################################## # StreamK Constants In VGPRs ############################################################################## @@ -5360,7 +5707,13 @@ def kernelBody( self, kernel, tensorParametersA, tensorParametersB ): # global write module.addComment1("not-LocalSplitU: global write") - module.add(self.notLocalSplitUGlobalWrite(kernel, tensorParametersA, tensorParametersB)) + storeModule, _ = self.notLocalSplitUGlobalWrite(kernel, tensorParametersA, tensorParametersB) + module.add(storeModule) + + # Emit any deferred activation modules (set during globalWriteElements) + if hasattr(self.states, 'deferredActivationModules') and self.states.deferredActivationModules is not None: + module.appendModule(self.states.deferredActivationModules) + self.states.deferredActivationModules = None module.add(self.functionEnd(kernel, addLabel=True)) @@ -5535,6 +5888,65 @@ def 
_initKernel(self, kernel, tensorParametersA, tensorParametersB): self.asmAssert = Assert(self.states.laneSGPRCount, kernel["WavefrontSize"], self.db["EnableAsserts"]) + + def initSubTileInfo(tc): + tileMap = { + 'A' : self.states.a, + 'B' : self.states.b, + 'D' : self.states.d, + 'MXSA' : self.states.mxsa, + 'MXSB' : self.states.mxsb, + } + matrixInfo = tileMap[tc] + matrixInfo.tileInfo = TileInfo(tc, kernel) + tileInfo = matrixInfo.tileInfo + #print(tileInfo) + + + if kernel["UseSubtileImpl"]: + initSubTileInfo('A') + initSubTileInfo('B') + initSubTileInfo('D') + + if kernel["ProblemType"].get("MXBlockA", 0) > 0: + initSubTileInfo('MXSA') + if kernel["ProblemType"].get("MXBlockB", 0) > 0: + initSubTileInfo('MXSB') + + self.ldsStartOffsetA = 0 + aTileInfo = self.states.a.tileInfo + bTileInfo = self.states.b.tileInfo + numASubtiles = aTileInfo.globalSubtileGrid[0] * aTileInfo.globalSubtileGrid[1] + numBSubtiles = bTileInfo.globalSubtileGrid[0] * bTileInfo.globalSubtileGrid[1] + readSize = 2*aTileInfo.subtileSize + # Align A and B sizes to readSize for DTL 2xsubtile reads + sizeA = ((numASubtiles * aTileInfo.subtileSize + readSize-1) // readSize) * readSize + sizeB = ((numBSubtiles * bTileInfo.subtileSize + readSize-1) // readSize) * readSize + self.ldsStartOffsetB = sizeA + sizeMXSA = 0 + sizeMXSB = 0 + if kernel["ProblemType"].get("MXBlockA", 0) > 0 and kernel["ProblemType"].get("MXBlockB", 0) > 0: + mxsaTileInfo = self.states.mxsa.tileInfo + mxsbTileInfo = self.states.mxsb.tileInfo + + # For Swizzled scale we use extra LDS space for now to allow wider DTL loads + numWaves = kernel["MIWaveGroup"][0] * kernel["MIWaveGroup"][1] + sizeMXSA = mxsaTileInfo.loadWidthGR * kernel["WavefrontSize"] * numWaves + sizeMXSB = mxsbTileInfo.loadWidthGR * kernel["WavefrontSize"] * numWaves + self.ldsStartOffsetMXSA = sizeA + sizeB + self.ldsStartOffsetMXSB = sizeA + sizeB + sizeMXSA + + self.ldsTotalSize = sizeA + sizeB + sizeMXSA + sizeMXSB + + kernel["LdsNumBytes"] = max(1, 
int(self.ldsTotalSize * kernel["NumLdsBlk"])) + if kernel["LdsNumBytes"] > self.states.archCaps["DeviceLDS"]: + self.states.overflowedResources = 8 + + + #print(self.states.a.tileInfo.getLocalSubtileId(1,0)) + + #exit(1) + self.states.tailloopInNll = kernel["TailloopInNll"] # remove staggerU code for the following cases # - tailloopInNll (cannot support staggerU) @@ -5543,7 +5955,7 @@ def _initKernel(self, kernel, tensorParametersA, tensorParametersB): if self.states.tailloopInNll or \ (kernel["StreamK"] and \ (kernel["ProblemType"]["MXBlockA"] or kernel["ProblemType"]["MXBlockB"]) and \ - isgfx950): + isgfx950) or kernel["UseSubtileImpl"]: self.states.staggerUCode = False self.states.tailloopInNllmaxUnit = 1 if self.states.tailloopInNll: @@ -5584,7 +5996,8 @@ def _initKernel(self, kernel, tensorParametersA, tensorParametersB): self.states.scheduleGROverBarrier = kernel["ScheduleGROverBarrier"] # doShadowInit performs initialization in the 'shadow' of the global mem prefetch - if not kernel["ForceDisableShadowInit"]: + # TODO re-enable.. 
+ if not kernel["ForceDisableShadowInit"] and not kernel["UseSubtileImpl"]: if kernel["PrefetchGlobalRead"]: if self.states.actualSummationLoops == 1: self.states.doShadowInit = 2 # 2 is both store setup and initC @@ -5986,14 +6399,14 @@ def readWriteVectors(mat, vw, kernel): # Gives pointer shift some room to move left, even into the previous macro-tile # This slightly reduces the range of the GRO since they have to include the offset # Pointer shift still cannot be used with very small matrices < GRVW - self.states.srdShiftLeft["A"] = kernel["GlobalReadVectorWidthA"] - self.states.srdShiftLeft["B"] = kernel["GlobalReadVectorWidthB"] + self.states.srdShiftLeft["A"] = kernel["GlobalReadVectorWidthA"] if not kernel["UseSubtileImpl"] else 0 + self.states.srdShiftLeft["B"] = kernel["GlobalReadVectorWidthB"] if not kernel["UseSubtileImpl"] else 0 if kernel["ProblemType"]["MXBlockA"]: # use MXS version for gfx950 only - self.states.srdShiftLeft["MXSA"] = kernel["GlobalReadVectorWidthMXSA"] if isgfx950 else kernel["GlobalReadVectorWidthA"] + self.states.srdShiftLeft["MXSA"] = kernel["GlobalReadVectorWidthMXSA"] if isgfx950 and not kernel["UseSubtileImpl"] else kernel["GlobalReadVectorWidthA"] if kernel["ProblemType"]["MXBlockB"]: # use MXS version for gfx950 only - self.states.srdShiftLeft["MXSB"] = kernel["GlobalReadVectorWidthMXSB"] if isgfx950 else kernel["GlobalReadVectorWidthB"] + self.states.srdShiftLeft["MXSB"] = kernel["GlobalReadVectorWidthMXSB"] if isgfx950 and not kernel["UseSubtileImpl"] else kernel["GlobalReadVectorWidthB"] if kernel["ProblemType"]["Sparse"] and not kernel["DirectToVgprSparseMetadata"]: self.states.srdShiftLeft["Metadata"] = kernel["GlobalReadVectorWidthMetadata"] @@ -6226,659 +6639,912 @@ def readWriteVectors(mat, vw, kernel): if tensorParametersM is not None: tensorParametersM["nrcvpi"] = int((tensorParametersM["globalReadInstruction"].totalWidth*self.states.bpr)/tensorParametersM["bpeDS"]) tensorParametersM["nwcvpi"] = 
int((tensorParametersM["localWriteInstruction"].totalWidth*self.states.bpr)/tensorParametersM["bpeDS"]) + #################################### # VGPR Allocation #################################### - #################################### - # num vgprs: valu - if kernel["EnableMatrixInstruction"]: - #jgolds bpeCinternal because we are allocating accumulation registers here - self.states.c.numVgprValu = (kernel["ThreadTile0"]*kernel["ThreadTile1"]*self.states.bpeCinternal)//self.states.bpr - - # pack or input conversion DTV case, need double buffer (LoopIters * 2) - numVgprBufferA = self.states.numVgprBuffer if not (self.states.packDTVA or self.states.convDTVA) else kernel["LoopIters"] * 2 - numVgprBufferB = self.states.numVgprBuffer if not (self.states.packDTVB or self.states.convDTVB) else kernel["LoopIters"] * 2 - valuBlocks = self.states.numVgprBuffer * kernel["InnerUnroll"] # for Sparse - valuBlocksA = numVgprBufferA * kernel["InnerUnroll"] - valuBlocksB = numVgprBufferB * kernel["InnerUnroll"] - - self.states.a.numVgprValuPerBlock = int(kernel["MIWaveTileA"] * kernel["MIInputPerThreadA"] * tensorParametersA["bpe"] // self.states.bpr) - self.states.b.numVgprValuPerBlock = int(kernel["MIWaveTileB"] * kernel["MIInputPerThreadB"] * tensorParametersB["bpe"] // self.states.bpr) - - #TODO: remove this if upcoming compiler changes applied - if kernel["ProblemType"]["MacDataTypeA"].numBytes() == 0.75 or kernel["ProblemType"]["MacDataTypeB"].numBytes() == 0.75: - if kernel["enableLDSTrA"]: - numVgprPerSubIter = int(kernel["MIInputPerThreadA"] * tensorParametersA["bpe"] // self.states.bpr) - numVgprPerLocalRead, vgprAlignment = 3, 4 - numLoads = numVgprPerSubIter // numVgprPerLocalRead - self.states.a.numVgprValuPerBlock = kernel["MIWaveTileA"] * numLoads * vgprAlignment - - if kernel["enableLDSTrB"]: - numVgprPerSubIter = int(kernel["MIInputPerThreadB"] * tensorParametersB["bpe"] // self.states.bpr) - numVgprPerLocalRead, vgprAlignment = 3, 4 - numLoads = 
numVgprPerSubIter // numVgprPerLocalRead - self.states.b.numVgprValuPerBlock = kernel["MIWaveTileB"] * numLoads * vgprAlignment - - # change numVgprValuAPerBlock to 0 if DirectToVgpr is enabled (except for DTV + (pack or input conversion)) - if kernel["DirectToVgprA"] and not (self.states.packDTVA or self.states.convDTVA): - self.states.a.numVgprValuPerBlock = 0 - if kernel["DirectToVgprB"] and not (self.states.packDTVB or self.states.convDTVB): - self.states.b.numVgprValuPerBlock = 0 - - self.states.a.numVgprValu = self.states.a.numVgprValuPerBlock * valuBlocksA - if self.states.lrvwTileA > 1 and tensorParametersA["bpe"] < 4 and not (kernel["UsePLRPack"] and self.states.numItersPLR): - self.states.a.numVgprValu = self.states.a.numVgprValuPerBlock * kernel["InnerUnroll"] - - self.states.b.numVgprValu = self.states.b.numVgprValuPerBlock * valuBlocksB - if self.states.lrvwTileB > 1 and tensorParametersB["bpe"] < 4 and not (kernel["UsePLRPack"] and self.states.numItersPLR): - self.states.b.numVgprValu = self.states.b.numVgprValuPerBlock * kernel["InnerUnroll"] - - if kernel["ProblemType"]["MXBlockA"]: - self.states.mxsa.numVgprValuPerBlock = kernel["MIWaveTileMXSA"] * kernel["MIInputPerThreadMXSA"] // self.states.bpr - # workaround for gfx950 - # need to allocate same amount of MIWaveTile - if isgfx950: - self.states.mxsa.numVgprValuPerBlock = kernel["MIWaveTileMXSA"] - if kernel["DirectToVgprMXSA"] and not (self.states.packDTVA or self.states.convDTVA): - self.states.mxsa.numVgprValuPerBlock = 0 - # MX scale registers are consumed by local-read and wmma paths; avoid - # emitting unresolved vgprValuMXSA symbols when integer division rounds to 0. 
- elif self.states.mxsa.numVgprValuPerBlock == 0: - self.states.mxsa.numVgprValuPerBlock = kernel["MIWaveTileMXSA"] - self.states.mxsa.numVgprValu = self.states.mxsa.numVgprValuPerBlock * valuBlocksA - if self.states.lrvwTileMXSA > 1: - self.states.mxsa.numVgprValu = self.states.mxsa.numVgprValuPerBlock * kernel["InnerUnroll"] - - if kernel["ProblemType"]["MXBlockB"]: - self.states.mxsb.numVgprValuPerBlock = kernel["MIWaveTileMXSB"] * kernel["MIInputPerThreadMXSB"] // self.states.bpr - # workaround for gfx950 - # need to allocate same amount of MIWaveTile - if isgfx950: - self.states.mxsb.numVgprValuPerBlock = kernel["MIWaveTileMXSB"] - if kernel["DirectToVgprMXSB"] and not (self.states.packDTVB or self.states.convDTVB): - self.states.mxsb.numVgprValuPerBlock = 0 - # MX scale registers are consumed by local-read and wmma paths; avoid - # emitting unresolved vgprValuMXSB symbols when integer division rounds to 0. - elif self.states.mxsb.numVgprValuPerBlock == 0: - self.states.mxsb.numVgprValuPerBlock = kernel["MIWaveTileMXSB"] - self.states.mxsb.numVgprValu = self.states.mxsb.numVgprValuPerBlock * valuBlocksB - if self.states.lrvwTileMXSB > 1: - self.states.mxsb.numVgprValu = self.states.mxsb.numVgprValuPerBlock * kernel["InnerUnroll"] - - else: # mac instruction - valuBlocksA = (1 + kernel["PrefetchLocalRead"]) * kernel["InnerUnroll"] - valuBlocksB = (1 + kernel["PrefetchLocalRead"]) * kernel["InnerUnroll"] + def vgprAllocationImplClassic(): + #################################### + # num vgprs: valu + if kernel["EnableMatrixInstruction"]: + #jgolds bpeCinternal because we are allocating accumulation registers here + self.states.c.numVgprValu = (kernel["ThreadTile0"]*kernel["ThreadTile1"]*self.states.bpeCinternal)//self.states.bpr + + # pack or input conversion DTV case, need double buffer (LoopIters * 2) + numVgprBufferA = self.states.numVgprBuffer if not (self.states.packDTVA or self.states.convDTVA) else kernel["LoopIters"] * 2 + numVgprBufferB = 
self.states.numVgprBuffer if not (self.states.packDTVB or self.states.convDTVB) else kernel["LoopIters"] * 2 + valuBlocks = self.states.numVgprBuffer * kernel["InnerUnroll"] # for Sparse + valuBlocksA = numVgprBufferA * kernel["InnerUnroll"] + valuBlocksB = numVgprBufferB * kernel["InnerUnroll"] + + self.states.a.numVgprValuPerBlock = int(kernel["MIWaveTileA"] * kernel["MIInputPerThreadA"] * tensorParametersA["bpe"] // self.states.bpr) + self.states.b.numVgprValuPerBlock = int(kernel["MIWaveTileB"] * kernel["MIInputPerThreadB"] * tensorParametersB["bpe"] // self.states.bpr) + + #TODO: remove this if upcoming compiler changes applied + if kernel["ProblemType"]["MacDataTypeA"].numBytes() == 0.75 or kernel["ProblemType"]["MacDataTypeB"].numBytes() == 0.75: + if kernel["enableLDSTrA"]: + numVgprPerSubIter = int(kernel["MIInputPerThreadA"] * tensorParametersA["bpe"] // self.states.bpr) + numVgprPerLocalRead, vgprAlignment = 3, 4 + numLoads = numVgprPerSubIter // numVgprPerLocalRead + self.states.a.numVgprValuPerBlock = kernel["MIWaveTileA"] * numLoads * vgprAlignment + + if kernel["enableLDSTrB"]: + numVgprPerSubIter = int(kernel["MIInputPerThreadB"] * tensorParametersB["bpe"] // self.states.bpr) + numVgprPerLocalRead, vgprAlignment = 3, 4 + numLoads = numVgprPerSubIter // numVgprPerLocalRead + self.states.b.numVgprValuPerBlock = kernel["MIWaveTileB"] * numLoads * vgprAlignment + + # change numVgprValuAPerBlock to 0 if DirectToVgpr is enabled (except for DTV + (pack or input conversion)) + if kernel["DirectToVgprA"] and not (self.states.packDTVA or self.states.convDTVA): + self.states.a.numVgprValuPerBlock = 0 + if kernel["DirectToVgprB"] and not (self.states.packDTVB or self.states.convDTVB): + self.states.b.numVgprValuPerBlock = 0 + + self.states.a.numVgprValu = self.states.a.numVgprValuPerBlock * valuBlocksA + if self.states.lrvwTileA > 1 and tensorParametersA["bpe"] < 4 and not (kernel["UsePLRPack"] and self.states.numItersPLR): + self.states.a.numVgprValu = 
self.states.a.numVgprValuPerBlock * kernel["InnerUnroll"] + + self.states.b.numVgprValu = self.states.b.numVgprValuPerBlock * valuBlocksB + if self.states.lrvwTileB > 1 and tensorParametersB["bpe"] < 4 and not (kernel["UsePLRPack"] and self.states.numItersPLR): + self.states.b.numVgprValu = self.states.b.numVgprValuPerBlock * kernel["InnerUnroll"] - if kernel["UseDotInstruction"]: - # dot2: at least read NumDotElements elements - self.states.a.numVgprValuPerBlock = int(kernel["ThreadTileA"] * tensorParametersA["bpe"] * kernel["NumDotElements"] // self.states.bpr) - self.states.b.numVgprValuPerBlock = int(kernel["ThreadTileB"] * tensorParametersB["bpe"] * kernel["NumDotElements"] // self.states.bpr) - else: - self.states.a.numVgprValuPerBlock = int(kernel["ThreadTileA"] * tensorParametersA["bpe"] // self.states.bpr) - self.states.b.numVgprValuPerBlock = int(kernel["ThreadTileB"] * tensorParametersB["bpe"] // self.states.bpr) + if kernel["ProblemType"]["MXBlockA"]: + self.states.mxsa.numVgprValuPerBlock = kernel["MIWaveTileMXSA"] * kernel["MIInputPerThreadMXSA"] // self.states.bpr + # workaround for gfx950 + # need to allocate same amount of MIWaveTile + if isgfx950: + self.states.mxsa.numVgprValuPerBlock = kernel["MIWaveTileMXSA"] + if kernel["DirectToVgprMXSA"] and not (self.states.packDTVA or self.states.convDTVA): + self.states.mxsa.numVgprValuPerBlock = 0 + # MX scale registers are consumed by local-read and wmma paths; avoid + # emitting unresolved vgprValuMXSA symbols when integer division rounds to 0. 
+ elif self.states.mxsa.numVgprValuPerBlock == 0: + self.states.mxsa.numVgprValuPerBlock = kernel["MIWaveTileMXSA"] + self.states.mxsa.numVgprValu = self.states.mxsa.numVgprValuPerBlock * valuBlocksA + if self.states.lrvwTileMXSA > 1: + self.states.mxsa.numVgprValu = self.states.mxsa.numVgprValuPerBlock * kernel["InnerUnroll"] - self.states.c.numVgprValu = kernel["ThreadTile0"] * kernel["ThreadTile1"] * kernel["ProblemType"]["ComputeDataType"].numRegisters() - self.states.a.numVgprValu = self.states.a.numVgprValuPerBlock * valuBlocksA - self.states.b.numVgprValu = self.states.b.numVgprValuPerBlock * valuBlocksB + if kernel["ProblemType"]["MXBlockB"]: + self.states.mxsb.numVgprValuPerBlock = kernel["MIWaveTileMXSB"] * kernel["MIInputPerThreadMXSB"] // self.states.bpr + # workaround for gfx950 + # need to allocate same amount of MIWaveTile + if isgfx950: + self.states.mxsb.numVgprValuPerBlock = kernel["MIWaveTileMXSB"] + if kernel["DirectToVgprMXSB"] and not (self.states.packDTVB or self.states.convDTVB): + self.states.mxsb.numVgprValuPerBlock = 0 + # MX scale registers are consumed by local-read and wmma paths; avoid + # emitting unresolved vgprValuMXSB symbols when integer division rounds to 0. 
+ elif self.states.mxsb.numVgprValuPerBlock == 0: + self.states.mxsb.numVgprValuPerBlock = kernel["MIWaveTileMXSB"] + self.states.mxsb.numVgprValu = self.states.mxsb.numVgprValuPerBlock * valuBlocksB + if self.states.lrvwTileMXSB > 1: + self.states.mxsb.numVgprValu = self.states.mxsb.numVgprValuPerBlock * kernel["InnerUnroll"] + + else: # mac instruction + valuBlocksA = (1 + kernel["PrefetchLocalRead"]) * kernel["InnerUnroll"] + valuBlocksB = (1 + kernel["PrefetchLocalRead"]) * kernel["InnerUnroll"] + + if kernel["UseDotInstruction"]: + # dot2: at least read NumDotElements elements + self.states.a.numVgprValuPerBlock = int(kernel["ThreadTileA"] * tensorParametersA["bpe"] * kernel["NumDotElements"] // self.states.bpr) + self.states.b.numVgprValuPerBlock = int(kernel["ThreadTileB"] * tensorParametersB["bpe"] * kernel["NumDotElements"] // self.states.bpr) + else: + self.states.a.numVgprValuPerBlock = int(kernel["ThreadTileA"] * tensorParametersA["bpe"] // self.states.bpr) + self.states.b.numVgprValuPerBlock = int(kernel["ThreadTileB"] * tensorParametersB["bpe"] // self.states.bpr) - if kernel["ProblemType"]["Sparse"]: - if kernel["DirectToVgprSparseMetadata"]: - miWaveTile = kernel["MIWaveTileB"] if kernel["ProblemType"]["Sparse"] == 2 else kernel["MIWaveTileA"] - self.states.m.numVgprValuPerBlock = miWaveTile * kernel["LoopIters"] #every 8bit need 1 register - valuBlocks = (kernel["PrefetchGlobalRead"] + 1) - self.states.m.numVgprValu = self.states.m.numVgprValuPerBlock * valuBlocks - else: - self.states.m.numVgprValuPerBlock = kernel["MIWaveTileMetadata"] * roundUp(kernel["MIInputPerThreadMetadata"] / self.states.bpr) - if kernel["enableLDSTrMetadata"]: - multiplyBy = 1 if kernel["MIInputPerThreadMetadata"] // self.states.bpr == 2 else 2 - self.states.m.numVgprValuPerBlock *= multiplyBy - self.states.m.numVgprValu = self.states.m.numVgprValuPerBlock * valuBlocks - if self.states.lrvwTileMetadata > 1 and tensorParametersM["bpe"] < 4: - self.states.m.numVgprValu = 
self.states.m.numVgprValuPerBlock * kernel["InnerUnroll"] + self.states.c.numVgprValu = kernel["ThreadTile0"] * kernel["ThreadTile1"] * kernel["ProblemType"]["ComputeDataType"].numRegisters() + self.states.a.numVgprValu = self.states.a.numVgprValuPerBlock * valuBlocksA + self.states.b.numVgprValu = self.states.b.numVgprValuPerBlock * valuBlocksB + if kernel["ProblemType"]["Sparse"]: + if kernel["DirectToVgprSparseMetadata"]: + miWaveTile = kernel["MIWaveTileB"] if kernel["ProblemType"]["Sparse"] == 2 else kernel["MIWaveTileA"] + self.states.m.numVgprValuPerBlock = miWaveTile * kernel["LoopIters"] #every 8bit need 1 register + valuBlocks = (kernel["PrefetchGlobalRead"] + 1) + self.states.m.numVgprValu = self.states.m.numVgprValuPerBlock * valuBlocks + else: + self.states.m.numVgprValuPerBlock = kernel["MIWaveTileMetadata"] * roundUp(kernel["MIInputPerThreadMetadata"] / self.states.bpr) + if kernel["enableLDSTrMetadata"]: + multiplyBy = 1 if kernel["MIInputPerThreadMetadata"] // self.states.bpr == 2 else 2 + self.states.m.numVgprValuPerBlock *= multiplyBy + self.states.m.numVgprValu = self.states.m.numVgprValuPerBlock * valuBlocks + if self.states.lrvwTileMetadata > 1 and tensorParametersM["bpe"] < 4: + self.states.m.numVgprValu = self.states.m.numVgprValuPerBlock * kernel["InnerUnroll"] - #################################### - # num vgprs: global -> local elements A - self.states.a.numVgprG2L = 0 - numVgprG2LAllocatedLocal = 0 - - bpeMax = tensorParametersA["bpeDS"] if kernel["ConvertAfterDS"] else max(tensorParametersA["bpeGR"], tensorParametersA["bpe"]) - statesANumVgprG2L = roundUp((kernel["NumLoadsCoalescedA"] * kernel["NumLoadsPerpendicularA"] * \ - kernel["GlobalReadVectorWidthA"] * bpeMax) / (float)(self.states.bpr)) - tpA = self.states.bpr if bpeMax * vwa < self.states.bpr else bpeMax * vwa - tpALocal = self.states.bpr if tensorParametersA["bpe"] * vwa < self.states.bpr else tensorParametersA["bpe"] * vwa - numVgprG2LAllocatedLocal = 
roundUp((kernel["NumLoadsCoalescedA"] * kernel["NumLoadsPerpendicularA"] * \ - tpALocal) / (float)(self.states.bpr)) - if (self.states.archCaps["HasEccHalf"] or not self.states.asmCaps["HasWMMA_V1"]): - if bpeMax * vwa < self.states.bpr: - # This check is to reserve porential usage of VGPRs for gfx12 8-bit code gen - # We should optimize the usage for better performance. - statesANumVgprG2LAllocated = statesANumVgprG2L * (int)(self.states.bpr/(bpeMax * vwa)) - else: - # This check is to reserve porential usage of VGPRs for gfx12 8-bit code gen - # We should optimize the usage for better performance. - statesANumVgprG2LAllocated = roundUp((kernel["NumLoadsCoalescedA"] * kernel["NumLoadsPerpendicularA"] * \ - tpA) / (float)(self.states.bpr)) - #TODO: remove this if upcoming compiler changes getting merged - if tensorParametersA["globalReadInstruction"].blockWidth == 3: - statesANumVgprG2LAllocated = roundUp(statesANumVgprG2LAllocated * 4 / 3) - else: - statesANumVgprG2LAllocated = statesANumVgprG2L - if (not kernel["DirectToLdsA"] or self.do["KeepDirectToLdsAlloc"]) and not kernel["enableTDMA"]: - self.states.a.numVgprG2L = statesANumVgprG2L - self.states.a.numVgprG2LAllocated = statesANumVgprG2LAllocated - self.states.a.numVgprG2LTailloopAllocated = statesANumVgprG2LAllocated if tensorParametersA["globalReadInstruction"].blockWidth != 6 else roundUp(statesANumVgprG2LAllocated * 4 / 3) - else: + #################################### + # num vgprs: global -> local elements A self.states.a.numVgprG2L = 0 - self.states.a.numVgprG2LAllocated = 0 - self.states.a.numVgprG2LTailloopAllocated = statesANumVgprG2LAllocated if tensorParametersA["globalReadInstruction"].blockWidth != 6 else roundUp(statesANumVgprG2LAllocated * 4 / 3) - # using _ds_store_b8: need one more vgpr space to do lshr - if tensorParametersA["localWriteInstruction"].blockWidth == 0.25: - self.states.a.numVgprG2L = self.states.a.numVgprG2L * 2 - self.states.a.numVgprG2LAllocated += numVgprG2LAllocatedLocal 
- self.states.a.numVgprG2LTailloopAllocated += numVgprG2LAllocatedLocal - # double numVgprG2L if DirectToVgpr is enabled - if kernel["DirectToVgprA"]: - self.states.a.numVgprG2L *= 2 - self.states.a.numVgprG2LAllocated *= 2 - bpeA = tensorParametersA["bpe"] - bpeGRA = tensorParametersA["bpeGR"] - if kernel["ConvertAfterDS"] and bpeA > bpeGRA: - # DTV + covertAfterDS case, we need to allocate vgpr based on after conversion - self.states.a.numVgprG2L *= int(bpeA // bpeGRA) - self.states.a.numVgprG2LAllocated *= int(bpeA // bpeGRA) - - # num vgprs: global -> local elements : MXSA - if kernel["ProblemType"]["MXBlockA"]: - self.states.mxsa.numVgprG2L = 0 - numVgprG2LMXSAllocatedLocal = 0 - - statesMXSANumVgprG2L = roundUp((kernel["NumLoadsCoalescedMXSA"] * kernel["NumLoadsPerpendicularMXSA"] * \ - kernel["GlobalReadVectorWidthMXSA"]) / (float)(self.states.bpr)) - tpMXSALocal = self.states.bpr if vwmxsa < self.states.bpr else vwmxsa - numVgprG2LMXSAllocatedLocal = roundUp((kernel["NumLoadsCoalescedMXSA"] * kernel["NumLoadsPerpendicularMXSA"] * \ - tpMXSALocal) / (float)(self.states.bpr)) - - if (self.states.archCaps["HasEccHalf"] or not self.states.asmCaps["HasWMMA_V1"]) and (vwmxsa < self.states.bpr): - # This check is to reserve porential usage of VGPRs for gfx12 8-bit code gen - # We should optimize the usage for better performance. 
- statesMXSANumVgprG2LAllocated = statesMXSANumVgprG2L * (int)(self.states.bpr/vwmxsa) + numVgprG2LAllocatedLocal = 0 + + bpeMax = tensorParametersA["bpeDS"] if kernel["ConvertAfterDS"] else max(tensorParametersA["bpeGR"], tensorParametersA["bpe"]) + statesANumVgprG2L = roundUp((kernel["NumLoadsCoalescedA"] * kernel["NumLoadsPerpendicularA"] * \ + kernel["GlobalReadVectorWidthA"] * bpeMax) / (float)(self.states.bpr)) + tpA = self.states.bpr if bpeMax * vwa < self.states.bpr else bpeMax * vwa + tpALocal = self.states.bpr if tensorParametersA["bpe"] * vwa < self.states.bpr else tensorParametersA["bpe"] * vwa + numVgprG2LAllocatedLocal = roundUp((kernel["NumLoadsCoalescedA"] * kernel["NumLoadsPerpendicularA"] * \ + tpALocal) / (float)(self.states.bpr)) + if (self.states.archCaps["HasEccHalf"] or not self.states.asmCaps["HasWMMA_V1"]): + if bpeMax * vwa < self.states.bpr: + # This check is to reserve porential usage of VGPRs for gfx12 8-bit code gen + # We should optimize the usage for better performance. + statesANumVgprG2LAllocated = statesANumVgprG2L * (int)(self.states.bpr/(bpeMax * vwa)) + else: + # This check is to reserve porential usage of VGPRs for gfx12 8-bit code gen + # We should optimize the usage for better performance. 
+ statesANumVgprG2LAllocated = roundUp((kernel["NumLoadsCoalescedA"] * kernel["NumLoadsPerpendicularA"] * \ + tpA) / (float)(self.states.bpr)) + #TODO: remove this if upcoming compiler changes getting merged + if tensorParametersA["globalReadInstruction"].blockWidth == 3: + statesANumVgprG2LAllocated = roundUp(statesANumVgprG2LAllocated * 4 / 3) else: - statesMXSANumVgprG2LAllocated = statesMXSANumVgprG2L + statesANumVgprG2LAllocated = statesANumVgprG2L - if (not kernel["DirectToLdsMXSA"] or self.do["KeepDirectToLdsAlloc"]) and not kernel["enableTDMA"]: - self.states.mxsa.numVgprG2L = statesMXSANumVgprG2L - self.states.mxsa.numVgprG2LAllocated = statesMXSANumVgprG2LAllocated - self.states.mxsa.numVgprG2LTailloopAllocated = self.states.mxsa.numVgprG2LAllocated + if (not kernel["DirectToLdsA"] or self.do["KeepDirectToLdsAlloc"]) and not kernel["enableTDMA"]: + self.states.a.numVgprG2L = statesANumVgprG2L + self.states.a.numVgprG2LAllocated = statesANumVgprG2LAllocated + self.states.a.numVgprG2LTailloopAllocated = statesANumVgprG2LAllocated if tensorParametersA["globalReadInstruction"].blockWidth != 6 else roundUp(statesANumVgprG2LAllocated * 4 / 3) else: - self.states.mxsa.numVgprG2L = 0 - self.states.mxsa.numVgprG2LAllocated = 0 - self.states.mxsa.numVgprG2LTailloopAllocated = statesMXSANumVgprG2LAllocated + self.states.a.numVgprG2L = 0 + self.states.a.numVgprG2LAllocated = 0 + self.states.a.numVgprG2LTailloopAllocated = statesANumVgprG2LAllocated if tensorParametersA["globalReadInstruction"].blockWidth != 6 else roundUp(statesANumVgprG2LAllocated * 4 / 3) # using _ds_store_b8: need one more vgpr space to do lshr - if tensorParametersMXSA["localWriteInstruction"].blockWidth == 0.25: - self.states.mxsa.numVgprG2L = self.states.mxsa.numVgprG2L * 2 - self.states.mxsa.numVgprG2LAllocated += numVgprG2LMXSAllocatedLocal - self.states.mxsa.numVgprG2LTailloopAllocated += numVgprG2LMXSAllocatedLocal + if tensorParametersA["localWriteInstruction"].blockWidth == 0.25: + 
self.states.a.numVgprG2L = self.states.a.numVgprG2L * 2 + self.states.a.numVgprG2LAllocated += numVgprG2LAllocatedLocal + self.states.a.numVgprG2LTailloopAllocated += numVgprG2LAllocatedLocal # double numVgprG2L if DirectToVgpr is enabled - if kernel["DirectToVgprMXSA"]: - self.states.mxsa.numVgprG2L *= 2 - self.states.mxsa.numVgprG2LAllocated *= 2 - - # num vgprs: global -> local elements : B - self.states.b.numVgprG2L = 0 - numVgprG2LAllocatedLocal = 0 - - bpeMax = tensorParametersB["bpeDS"] if kernel["ConvertAfterDS"] else max(tensorParametersB["bpeGR"], tensorParametersB["bpe"]) - statesBNumVgprG2L = roundUp((kernel["NumLoadsCoalescedB"] * kernel["NumLoadsPerpendicularB"] * \ - kernel["GlobalReadVectorWidthB"] * bpeMax) / (float)(self.states.bpr)) - tpB = self.states.bpr if bpeMax * vwb < self.states.bpr else bpeMax * vwb - tpBLocal = self.states.bpr if tensorParametersB["bpe"] * vwb < self.states.bpr else tensorParametersB["bpe"] * vwb - numVgprG2LAllocatedLocal = roundUp((kernel["NumLoadsCoalescedB"] * kernel["NumLoadsPerpendicularB"] * \ - tpBLocal) / (float)(self.states.bpr)) - - if (self.states.archCaps["HasEccHalf"] or not self.states.asmCaps["HasWMMA_V1"]): - if bpeMax * vwb < self.states.bpr: - # This check is to reserve porential usage of VGPRs for gfx12 8-bit code gen - # We should optimize the usage for better performance. - statesBNumVgprG2LAllocated = statesBNumVgprG2L * (int)(self.states.bpr/(bpeMax * vwb)) - else: - # This check is to reserve porential usage of VGPRs for gfx12 8-bit code gen - # We should optimize the usage for better performance. 
- statesBNumVgprG2LAllocated = roundUp((kernel["NumLoadsCoalescedB"] * kernel["NumLoadsPerpendicularB"] * \ - tpB) / (float)(self.states.bpr)) - if tensorParametersB["globalReadInstruction"].blockWidth == 3: - statesBNumVgprG2LAllocated = roundUp(statesBNumVgprG2LAllocated * 4 / 3) - else: - statesBNumVgprG2LAllocated = statesBNumVgprG2L - if not kernel["DirectToLdsB"] or self.do["KeepDirectToLdsAlloc"] and not kernel["enableTDMB"]: - self.states.b.numVgprG2L = statesBNumVgprG2L - self.states.b.numVgprG2LAllocated = statesBNumVgprG2LAllocated - self.states.b.numVgprG2LTailloopAllocated = statesBNumVgprG2LAllocated if tensorParametersB["globalReadInstruction"].blockWidth != 6 else roundUp(statesBNumVgprG2LAllocated * 4 / 3) - else: + if kernel["DirectToVgprA"]: + self.states.a.numVgprG2L *= 2 + self.states.a.numVgprG2LAllocated *= 2 + bpeA = tensorParametersA["bpe"] + bpeGRA = tensorParametersA["bpeGR"] + if kernel["ConvertAfterDS"] and bpeA > bpeGRA: + # DTV + covertAfterDS case, we need to allocate vgpr based on after conversion + self.states.a.numVgprG2L *= int(bpeA // bpeGRA) + self.states.a.numVgprG2LAllocated *= int(bpeA // bpeGRA) + + # num vgprs: global -> local elements : MXSA + if kernel["ProblemType"]["MXBlockA"]: + self.states.mxsa.numVgprG2L = 0 + numVgprG2LMXSAllocatedLocal = 0 + + statesMXSANumVgprG2L = roundUp((kernel["NumLoadsCoalescedMXSA"] * kernel["NumLoadsPerpendicularMXSA"] * \ + kernel["GlobalReadVectorWidthMXSA"]) / (float)(self.states.bpr)) + tpMXSALocal = self.states.bpr if vwmxsa < self.states.bpr else vwmxsa + numVgprG2LMXSAllocatedLocal = roundUp((kernel["NumLoadsCoalescedMXSA"] * kernel["NumLoadsPerpendicularMXSA"] * \ + tpMXSALocal) / (float)(self.states.bpr)) + + if (self.states.archCaps["HasEccHalf"] or not self.states.asmCaps["HasWMMA_V1"]) and (vwmxsa < self.states.bpr): + # This check is to reserve porential usage of VGPRs for gfx12 8-bit code gen + # We should optimize the usage for better performance. 
+ statesMXSANumVgprG2LAllocated = statesMXSANumVgprG2L * (int)(self.states.bpr/vwmxsa) + else: + statesMXSANumVgprG2LAllocated = statesMXSANumVgprG2L + + if (not kernel["DirectToLdsMXSA"] or self.do["KeepDirectToLdsAlloc"]) and not kernel["enableTDMA"]: + self.states.mxsa.numVgprG2L = statesMXSANumVgprG2L + self.states.mxsa.numVgprG2LAllocated = statesMXSANumVgprG2LAllocated + self.states.mxsa.numVgprG2LTailloopAllocated = self.states.mxsa.numVgprG2LAllocated + else: + self.states.mxsa.numVgprG2L = 0 + self.states.mxsa.numVgprG2LAllocated = 0 + self.states.mxsa.numVgprG2LTailloopAllocated = statesMXSANumVgprG2LAllocated + # using _ds_store_b8: need one more vgpr space to do lshr + if tensorParametersMXSA["localWriteInstruction"].blockWidth == 0.25: + self.states.mxsa.numVgprG2L = self.states.mxsa.numVgprG2L * 2 + self.states.mxsa.numVgprG2LAllocated += numVgprG2LMXSAllocatedLocal + self.states.mxsa.numVgprG2LTailloopAllocated += numVgprG2LMXSAllocatedLocal + # double numVgprG2L if DirectToVgpr is enabled + if kernel["DirectToVgprMXSA"]: + self.states.mxsa.numVgprG2L *= 2 + self.states.mxsa.numVgprG2LAllocated *= 2 + + # num vgprs: global -> local elements : B self.states.b.numVgprG2L = 0 - self.states.b.numVgprG2LAllocated = 0 - self.states.b.numVgprG2LTailloopAllocated = statesBNumVgprG2LAllocated if tensorParametersB["globalReadInstruction"].blockWidth != 6 else roundUp(statesBNumVgprG2LAllocated * 4 / 3) - # using _ds_store_b8: need one more vgpr space to do lshr - if tensorParametersB["localWriteInstruction"].blockWidth == 0.25: - self.states.b.numVgprG2L = self.states.b.numVgprG2L * 2 - self.states.b.numVgprG2LAllocated += numVgprG2LAllocatedLocal - self.states.b.numVgprG2LTailloopAllocated += numVgprG2LAllocatedLocal - # double numVgprG2L if DirectToVgpr is enabled - if kernel["DirectToVgprB"]: - self.states.b.numVgprG2L *= 2 - self.states.b.numVgprG2LAllocated *= 2 - bpeB = tensorParametersB["bpe"] - bpeGRB = tensorParametersB["bpeGR"] - if 
kernel["ConvertAfterDS"] and bpeB > bpeGRB: - # DTV + covertAfterDS case, we need to allocate vgpr based on after conversion - self.states.b.numVgprG2L *= int(bpeB // bpeGRB) - self.states.b.numVgprG2LAllocated *= int(bpeB // bpeGRB) - - # num vgprs: global -> local elements : MXSB - if kernel["ProblemType"]["MXBlockB"]: - self.states.mxsb.numVgprG2L = 0 - numVgprG2LMXSBllocatedLocal = 0 - - statesMXSBNumVgprG2L = roundUp((kernel["NumLoadsCoalescedMXSB"] * kernel["NumLoadsPerpendicularMXSB"] * \ - kernel["GlobalReadVectorWidthMXSB"]) / (float)(self.states.bpr)) - tpMXSBLocal = self.states.bpr if vwmxsb < self.states.bpr else vwmxsb - numVgprG2LMXSBllocatedLocal = roundUp((kernel["NumLoadsCoalescedMXSB"] * kernel["NumLoadsPerpendicularMXSB"] * \ - tpMXSBLocal) / (float)(self.states.bpr)) - - if (self.states.archCaps["HasEccHalf"] or not self.states.asmCaps["HasWMMA_V1"]) and (vwmxsb < self.states.bpr): - # This check is to reserve porential usage of VGPRs for gfx12 8-bit code gen - # We should optimize the usage for better performance. 
- statesMXSBNumVgprG2LAllocated = statesMXSBNumVgprG2L * (int)(self.states.bpr/vwmxsb) + numVgprG2LAllocatedLocal = 0 + + bpeMax = tensorParametersB["bpeDS"] if kernel["ConvertAfterDS"] else max(tensorParametersB["bpeGR"], tensorParametersB["bpe"]) + statesBNumVgprG2L = roundUp((kernel["NumLoadsCoalescedB"] * kernel["NumLoadsPerpendicularB"] * \ + kernel["GlobalReadVectorWidthB"] * bpeMax) / (float)(self.states.bpr)) + tpB = self.states.bpr if bpeMax * vwb < self.states.bpr else bpeMax * vwb + tpBLocal = self.states.bpr if tensorParametersB["bpe"] * vwb < self.states.bpr else tensorParametersB["bpe"] * vwb + numVgprG2LAllocatedLocal = roundUp((kernel["NumLoadsCoalescedB"] * kernel["NumLoadsPerpendicularB"] * \ + tpBLocal) / (float)(self.states.bpr)) + + if (self.states.archCaps["HasEccHalf"] or not self.states.asmCaps["HasWMMA_V1"]): + if bpeMax * vwb < self.states.bpr: + # This check is to reserve porential usage of VGPRs for gfx12 8-bit code gen + # We should optimize the usage for better performance. + statesBNumVgprG2LAllocated = statesBNumVgprG2L * (int)(self.states.bpr/(bpeMax * vwb)) + else: + # This check is to reserve porential usage of VGPRs for gfx12 8-bit code gen + # We should optimize the usage for better performance. 
+ statesBNumVgprG2LAllocated = roundUp((kernel["NumLoadsCoalescedB"] * kernel["NumLoadsPerpendicularB"] * \ + tpB) / (float)(self.states.bpr)) + if tensorParametersB["globalReadInstruction"].blockWidth == 3: + statesBNumVgprG2LAllocated = roundUp(statesBNumVgprG2LAllocated * 4 / 3) else: - statesMXSBNumVgprG2LAllocated = statesMXSBNumVgprG2L - - if (not kernel["DirectToLdsMXSB"] or self.do["KeepDirectToLdsAlloc"]) and not kernel["enableTDMB"]: - self.states.mxsb.numVgprG2L = statesMXSBNumVgprG2L - self.states.mxsb.numVgprG2LAllocated = statesMXSBNumVgprG2LAllocated - self.states.mxsb.numVgprG2LTailloopAllocated = self.states.mxsb.numVgprG2LAllocated + statesBNumVgprG2LAllocated = statesBNumVgprG2L + if not kernel["DirectToLdsB"] or self.do["KeepDirectToLdsAlloc"] and not kernel["enableTDMB"]: + self.states.b.numVgprG2L = statesBNumVgprG2L + self.states.b.numVgprG2LAllocated = statesBNumVgprG2LAllocated + self.states.b.numVgprG2LTailloopAllocated = statesBNumVgprG2LAllocated if tensorParametersB["globalReadInstruction"].blockWidth != 6 else roundUp(statesBNumVgprG2LAllocated * 4 / 3) else: - self.states.mxsb.numVgprG2L = 0 - self.states.mxsb.numVgprG2LAllocated = 0 - self.states.mxsb.numVgprG2LTailloopAllocated = statesMXSBNumVgprG2LAllocated + self.states.b.numVgprG2L = 0 + self.states.b.numVgprG2LAllocated = 0 + self.states.b.numVgprG2LTailloopAllocated = statesBNumVgprG2LAllocated if tensorParametersB["globalReadInstruction"].blockWidth != 6 else roundUp(statesBNumVgprG2LAllocated * 4 / 3) # using _ds_store_b8: need one more vgpr space to do lshr - if tensorParametersMXSB["localWriteInstruction"].blockWidth == 0.25: - self.states.mxsb.numVgprG2L = self.states.mxsb.numVgprG2L * 2 - self.states.mxsb.numVgprG2LAllocated += numVgprG2LMXSBllocatedLocal - self.states.mxsb.numVgprG2LTailloopAllocated += numVgprG2LMXSBllocatedLocal + if tensorParametersB["localWriteInstruction"].blockWidth == 0.25: + self.states.b.numVgprG2L = self.states.b.numVgprG2L * 2 + 
self.states.b.numVgprG2LAllocated += numVgprG2LAllocatedLocal + self.states.b.numVgprG2LTailloopAllocated += numVgprG2LAllocatedLocal # double numVgprG2L if DirectToVgpr is enabled - if kernel["DirectToVgprMXSB"]: - self.states.mxsb.numVgprG2L *= 2 - self.states.mxsb.numVgprG2LAllocated *= 2 + if kernel["DirectToVgprB"]: + self.states.b.numVgprG2L *= 2 + self.states.b.numVgprG2LAllocated *= 2 + bpeB = tensorParametersB["bpe"] + bpeGRB = tensorParametersB["bpeGR"] + if kernel["ConvertAfterDS"] and bpeB > bpeGRB: + # DTV + covertAfterDS case, we need to allocate vgpr based on after conversion + self.states.b.numVgprG2L *= int(bpeB // bpeGRB) + self.states.b.numVgprG2LAllocated *= int(bpeB // bpeGRB) + + # num vgprs: global -> local elements : MXSB + if kernel["ProblemType"]["MXBlockB"]: + self.states.mxsb.numVgprG2L = 0 + numVgprG2LMXSBllocatedLocal = 0 + + statesMXSBNumVgprG2L = roundUp((kernel["NumLoadsCoalescedMXSB"] * kernel["NumLoadsPerpendicularMXSB"] * \ + kernel["GlobalReadVectorWidthMXSB"]) / (float)(self.states.bpr)) + tpMXSBLocal = self.states.bpr if vwmxsb < self.states.bpr else vwmxsb + numVgprG2LMXSBllocatedLocal = roundUp((kernel["NumLoadsCoalescedMXSB"] * kernel["NumLoadsPerpendicularMXSB"] * \ + tpMXSBLocal) / (float)(self.states.bpr)) + + if (self.states.archCaps["HasEccHalf"] or not self.states.asmCaps["HasWMMA_V1"]) and (vwmxsb < self.states.bpr): + # This check is to reserve porential usage of VGPRs for gfx12 8-bit code gen + # We should optimize the usage for better performance. 
+ statesMXSBNumVgprG2LAllocated = statesMXSBNumVgprG2L * (int)(self.states.bpr/vwmxsb) + else: + statesMXSBNumVgprG2LAllocated = statesMXSBNumVgprG2L - # num vgprs: global -> local elements : Metadata - self.states.m.numVgprG2L = 0 - if kernel["ProblemType"]["Sparse"]: - if not kernel["DirectToVgprSparseMetadata"]: - self.states.m.numVgprG2L = roundUp((kernel["NumLoadsCoalescedMetadata"] * kernel["NumLoadsPerpendicularMetadata"] * \ - kernel["GlobalReadVectorWidthMetadata"] * tensorParametersM["bpeDS"]) / (float)(self.states.bpr)) - if self.states.archCaps["HasEccHalf"] or not self.states.asmCaps["HasWMMA_V1"]: - tpM = self.states.bpr if tensorParametersM["bpeDS"] * vwm < self.states.bpr else tensorParametersM["bpeDS"] * vwm - self.states.m.numVgprG2LAllocated = roundUp((kernel["NumLoadsCoalescedMetadata"] * kernel["NumLoadsPerpendicularMetadata"] * \ - tpM) / (float)(self.states.bpr)) - self.states.m.numVgprG2L = self.states.m.numVgprG2LAllocated + if (not kernel["DirectToLdsMXSB"] or self.do["KeepDirectToLdsAlloc"]) and not kernel["enableTDMB"]: + self.states.mxsb.numVgprG2L = statesMXSBNumVgprG2L + self.states.mxsb.numVgprG2LAllocated = statesMXSBNumVgprG2LAllocated + self.states.mxsb.numVgprG2LTailloopAllocated = self.states.mxsb.numVgprG2LAllocated + else: + self.states.mxsb.numVgprG2L = 0 + self.states.mxsb.numVgprG2LAllocated = 0 + self.states.mxsb.numVgprG2LTailloopAllocated = statesMXSBNumVgprG2LAllocated # using _ds_store_b8: need one more vgpr space to do lshr - if tensorParametersM["localWriteInstruction"].blockWidth == 0.25: - self.states.m.numVgprG2L = self.states.m.numVgprG2L * 2 - self.states.m.numVgprG2LAllocated = self.states.m.numVgprG2LAllocated * 2 - else: - self.states.m.numVgprG2LAllocated = 0 - #################################### - # num vgprs: local read addresses - self.states.a.numVgprLocalReadAddr = 1 * self.states.rpla - self.states.b.numVgprLocalReadAddr = 1 * self.states.rpla - self.states.m.numVgprLocalReadAddr = 1 * 
self.states.rpla - if kernel["ProblemType"]["MXBlockA"]: - self.states.mxsa.numVgprLocalReadAddr = 1 * self.states.rpla - if kernel["ProblemType"]["MXBlockB"]: - self.states.mxsb.numVgprLocalReadAddr = 1 * self.states.rpla + if tensorParametersMXSB["localWriteInstruction"].blockWidth == 0.25: + self.states.mxsb.numVgprG2L = self.states.mxsb.numVgprG2L * 2 + self.states.mxsb.numVgprG2LAllocated += numVgprG2LMXSBllocatedLocal + self.states.mxsb.numVgprG2LTailloopAllocated += numVgprG2LMXSBllocatedLocal + # double numVgprG2L if DirectToVgpr is enabled + if kernel["DirectToVgprMXSB"]: + self.states.mxsb.numVgprG2L *= 2 + self.states.mxsb.numVgprG2LAllocated *= 2 + + # num vgprs: global -> local elements : Metadata + self.states.m.numVgprG2L = 0 + if kernel["ProblemType"]["Sparse"]: + if not kernel["DirectToVgprSparseMetadata"]: + self.states.m.numVgprG2L = roundUp((kernel["NumLoadsCoalescedMetadata"] * kernel["NumLoadsPerpendicularMetadata"] * \ + kernel["GlobalReadVectorWidthMetadata"] * tensorParametersM["bpeDS"]) / (float)(self.states.bpr)) + if self.states.archCaps["HasEccHalf"] or not self.states.asmCaps["HasWMMA_V1"]: + tpM = self.states.bpr if tensorParametersM["bpeDS"] * vwm < self.states.bpr else tensorParametersM["bpeDS"] * vwm + self.states.m.numVgprG2LAllocated = roundUp((kernel["NumLoadsCoalescedMetadata"] * kernel["NumLoadsPerpendicularMetadata"] * \ + tpM) / (float)(self.states.bpr)) + self.states.m.numVgprG2L = self.states.m.numVgprG2LAllocated + # using _ds_store_b8: need one more vgpr space to do lshr + if tensorParametersM["localWriteInstruction"].blockWidth == 0.25: + self.states.m.numVgprG2L = self.states.m.numVgprG2L * 2 + self.states.m.numVgprG2LAllocated = self.states.m.numVgprG2LAllocated * 2 + else: + self.states.m.numVgprG2LAllocated = 0 + #################################### + # num vgprs: local read addresses + self.states.a.numVgprLocalReadAddr = 1 * self.states.rpla + self.states.b.numVgprLocalReadAddr = 1 * self.states.rpla + 
self.states.m.numVgprLocalReadAddr = 1 * self.states.rpla + if kernel["ProblemType"]["MXBlockA"]: + self.states.mxsa.numVgprLocalReadAddr = 1 * self.states.rpla + if kernel["ProblemType"]["MXBlockB"]: + self.states.mxsb.numVgprLocalReadAddr = 1 * self.states.rpla - self.states.a.numVgprLocalWriteAddr = 0 if kernel["LocalWriteUseSgprA"] or kernel["enableTDMA"] else 1 * self.states.rpla - self.states.b.numVgprLocalWriteAddr = 0 if kernel["LocalWriteUseSgprB"] or kernel["enableTDMB"] else 1 * self.states.rpla - self.states.m.numVgprLocalWriteAddr = 0 if kernel["ProblemType"]["Sparse"] and kernel["LocalWriteUseSgprMetadata"] else 1 * self.states.rpla - if kernel["ProblemType"]["MXBlockA"]: - self.states.mxsa.numVgprLocalWriteAddr = 0 if kernel["LocalWriteUseSgprMXSA"] or kernel["enableTDMA"] else 1 * self.states.rpla - if kernel["ProblemType"]["MXBlockB"]: - self.states.mxsb.numVgprLocalWriteAddr = 0 if kernel["LocalWriteUseSgprMXSB"] or kernel["enableTDMB"] else 1 * self.states.rpla + self.states.a.numVgprLocalWriteAddr = 0 if kernel["LocalWriteUseSgprA"] or kernel["enableTDMA"] else 1 * self.states.rpla + self.states.b.numVgprLocalWriteAddr = 0 if kernel["LocalWriteUseSgprB"] or kernel["enableTDMB"] else 1 * self.states.rpla + self.states.m.numVgprLocalWriteAddr = 0 if kernel["ProblemType"]["Sparse"] and kernel["LocalWriteUseSgprMetadata"] else 1 * self.states.rpla + if kernel["ProblemType"]["MXBlockA"]: + self.states.mxsa.numVgprLocalWriteAddr = 0 if kernel["LocalWriteUseSgprMXSA"] or kernel["enableTDMA"] else 1 * self.states.rpla + if kernel["ProblemType"]["MXBlockB"]: + self.states.mxsb.numVgprLocalWriteAddr = 0 if kernel["LocalWriteUseSgprMXSB"] or kernel["enableTDMB"] else 1 * self.states.rpla - self.states.a.numVgprLocalReadSwapAddr = 0 - self.states.b.numVgprLocalReadSwapAddr = 0 - self.states.m.numVgprLocalReadSwapAddr = 0 - if kernel["ProblemType"]["MXBlockA"]: - self.states.mxsa.numVgprLocalReadSwapAddr = 0 - if kernel["ProblemType"]["MXBlockB"]: - 
self.states.mxsb.numVgprLocalReadSwapAddr = 0 + self.states.a.numVgprLocalReadSwapAddr = 0 + self.states.b.numVgprLocalReadSwapAddr = 0 + self.states.m.numVgprLocalReadSwapAddr = 0 + if kernel["ProblemType"]["MXBlockA"]: + self.states.mxsa.numVgprLocalReadSwapAddr = 0 + if kernel["ProblemType"]["MXBlockB"]: + self.states.mxsb.numVgprLocalReadSwapAddr = 0 - self.states.a.numVgprLocalWriteSwapAddr = 0 - self.states.b.numVgprLocalWriteSwapAddr = 0 - self.states.m.numVgprLocalWriteSwapAddr = 0 - if kernel["ProblemType"]["MXBlockA"]: - self.states.mxsa.numVgprLocalWriteSwapAddr = 0 - if kernel["ProblemType"]["MXBlockB"]: - self.states.mxsb.numVgprLocalWriteSwapAddr = 0 + self.states.a.numVgprLocalWriteSwapAddr = 0 + self.states.b.numVgprLocalWriteSwapAddr = 0 + self.states.m.numVgprLocalWriteSwapAddr = 0 + if kernel["ProblemType"]["MXBlockA"]: + self.states.mxsa.numVgprLocalWriteSwapAddr = 0 + if kernel["ProblemType"]["MXBlockB"]: + self.states.mxsb.numVgprLocalWriteSwapAddr = 0 - self.states.a.numVgprLocalWriteAddrTailLoop = 0 if not (kernel["DirectToLdsA"] and kernel["NonDTLTailLoopA"]) else 1 * self.states.rpla - self.states.b.numVgprLocalWriteAddrTailLoop = 0 if not (kernel["DirectToLdsB"] and kernel["NonDTLTailLoopB"]) else 1 * self.states.rpla - if kernel["ProblemType"]["MXBlockA"]: - self.states.mxsa.numVgprLocalWriteAddrTailLoop = 0 if not (kernel["DirectToLdsMXSA"] and kernel["NonDTLTailLoopMXSA"]) else 1 * self.states.rpla - if kernel["ProblemType"]["MXBlockB"]: - self.states.mxsb.numVgprLocalWriteAddrTailLoop = 0 if not (kernel["DirectToLdsMXSB"] and kernel["NonDTLTailLoopMXSB"]) else 1 * self.states.rpla + self.states.a.numVgprLocalWriteAddrTailLoop = 0 if not (kernel["DirectToLdsA"] and kernel["NonDTLTailLoopA"]) else 1 * self.states.rpla + self.states.b.numVgprLocalWriteAddrTailLoop = 0 if not (kernel["DirectToLdsB"] and kernel["NonDTLTailLoopB"]) else 1 * self.states.rpla + if kernel["ProblemType"]["MXBlockA"]: + 
self.states.mxsa.numVgprLocalWriteAddrTailLoop = 0 if not (kernel["DirectToLdsMXSA"] and kernel["NonDTLTailLoopMXSA"]) else 1 * self.states.rpla + if kernel["ProblemType"]["MXBlockB"]: + self.states.mxsb.numVgprLocalWriteAddrTailLoop = 0 if not (kernel["DirectToLdsMXSB"] and kernel["NonDTLTailLoopMXSB"]) else 1 * self.states.rpla - numVgprMultiplierA = 1 - numVgprMultiplierB = 1 - numVgprMultiplierMXSA = 1 - numVgprMultiplierMXSB = 1 - numVgprMultiplierMetadata = 1 - if kernel["ProblemType"]["MXBlockA"]: + numVgprMultiplierA = 1 + numVgprMultiplierB = 1 numVgprMultiplierMXSA = 1 - if kernel["ProblemType"]["MXBlockB"]: numVgprMultiplierMXSB = 1 + numVgprMultiplierMetadata = 1 + if kernel["ProblemType"]["MXBlockA"]: + numVgprMultiplierMXSA = 1 + if kernel["ProblemType"]["MXBlockB"]: + numVgprMultiplierMXSB = 1 + + maxLDSConstOffset = self.states.regCaps["maxLDSConstOffset"] + if self.states.archCaps["DeviceLDS"] > maxLDSConstOffset: + hasMultipleBuffer = kernel["ExpandPointerSwap"] and not kernel["1LDSBuffer"] and not kernel["StoreSwapAddr"] + maxOffsetA = kernel["LdsNumElementsAlignedA"] + maxOffsetB = kernel["LdsNumElementsAlignedB"] + maxOffsetMXSA = kernel["LdsNumElementsAlignedMXSA"] + maxOffsetMXSB = kernel["LdsNumElementsAlignedMXSB"] + maxOffsetMetadata = kernel["LdsNumElementsAlignedMetadata"] + maxOffsetMetadata = kernel["LdsNumElementsAlignedMetadata"] + if hasMultipleBuffer: + maxOffsetA += kernel["LdsOffsetA_Blk"] + maxOffsetB += kernel["LdsOffsetA_Blk"] + maxOffsetMXSA += kernel["LdsOffsetA_Blk"] + maxOffsetMXSB += kernel["LdsOffsetA_Blk"] + maxOffsetMetadata += kernel["LdsOffsetA_Blk"] + + numVgprMultiplierA = maxOffsetA // maxLDSConstOffset + 1 + numVgprMultiplierB = maxOffsetB // maxLDSConstOffset + 1 + numVgprMultiplierMXSA = maxOffsetMXSA // maxLDSConstOffset + 1 + numVgprMultiplierMXSB = maxOffsetMXSB // maxLDSConstOffset + 1 + numVgprMultiplierMetadata = maxOffsetMetadata // maxLDSConstOffset + 1 + + self.states.a.numVgprLocalReadAddr *= 
numVgprMultiplierA + self.states.a.numVgprLocalWriteAddr *= numVgprMultiplierA + self.states.a.numVgprLocalWriteAddrTailLoop *= numVgprMultiplierA + + self.states.b.numVgprLocalReadAddr *= numVgprMultiplierB + self.states.b.numVgprLocalWriteAddr *= numVgprMultiplierB + self.states.b.numVgprLocalWriteAddrTailLoop *= numVgprMultiplierB + + self.states.m.numVgprLocalReadAddr *= numVgprMultiplierMetadata + self.states.m.numVgprLocalWriteAddr *= numVgprMultiplierMetadata + if not (kernel["ProblemType"]["Sparse"] and not kernel["DirectToVgprSparseMetadata"]): + self.states.m.numVgprLocalReadAddr = 0 - maxLDSConstOffset = self.states.regCaps["maxLDSConstOffset"] - if self.states.archCaps["DeviceLDS"] > maxLDSConstOffset: - hasMultipleBuffer = kernel["ExpandPointerSwap"] and not kernel["1LDSBuffer"] and not kernel["StoreSwapAddr"] - maxOffsetA = kernel["LdsNumElementsAlignedA"] - maxOffsetB = kernel["LdsNumElementsAlignedB"] - maxOffsetMXSA = kernel["LdsNumElementsAlignedMXSA"] - maxOffsetMXSB = kernel["LdsNumElementsAlignedMXSB"] - maxOffsetMetadata = kernel["LdsNumElementsAlignedMetadata"] - maxOffsetMetadata = kernel["LdsNumElementsAlignedMetadata"] - if hasMultipleBuffer: - maxOffsetA += kernel["LdsOffsetA_Blk"] - maxOffsetB += kernel["LdsOffsetA_Blk"] - maxOffsetMXSA += kernel["LdsOffsetA_Blk"] - maxOffsetMXSB += kernel["LdsOffsetA_Blk"] - maxOffsetMetadata += kernel["LdsOffsetA_Blk"] - - numVgprMultiplierA = maxOffsetA // maxLDSConstOffset + 1 - numVgprMultiplierB = maxOffsetB // maxLDSConstOffset + 1 - numVgprMultiplierMXSA = maxOffsetMXSA // maxLDSConstOffset + 1 - numVgprMultiplierMXSB = maxOffsetMXSB // maxLDSConstOffset + 1 - numVgprMultiplierMetadata = maxOffsetMetadata // maxLDSConstOffset + 1 - - self.states.a.numVgprLocalReadAddr *= numVgprMultiplierA - self.states.a.numVgprLocalWriteAddr *= numVgprMultiplierA - self.states.a.numVgprLocalWriteAddrTailLoop *= numVgprMultiplierA - - self.states.b.numVgprLocalReadAddr *= numVgprMultiplierB - 
self.states.b.numVgprLocalWriteAddr *= numVgprMultiplierB - self.states.b.numVgprLocalWriteAddrTailLoop *= numVgprMultiplierB - - self.states.m.numVgprLocalReadAddr *= numVgprMultiplierMetadata - self.states.m.numVgprLocalWriteAddr *= numVgprMultiplierMetadata - if not (kernel["ProblemType"]["Sparse"] and not kernel["DirectToVgprSparseMetadata"]): - self.states.m.numVgprLocalReadAddr = 0 - - if kernel["ProblemType"]["MXBlockA"]: - self.states.mxsa.numVgprLocalReadAddr *= numVgprMultiplierMXSA - self.states.mxsa.numVgprLocalWriteAddr *= numVgprMultiplierMXSA - self.states.mxsa.numVgprLocalWriteAddrTailLoop *= numVgprMultiplierMXSA - - if kernel["ProblemType"]["MXBlockB"]: - self.states.mxsb.numVgprLocalReadAddr *= numVgprMultiplierMXSB - self.states.mxsb.numVgprLocalWriteAddr *= numVgprMultiplierMXSB - self.states.mxsb.numVgprLocalWriteAddrTailLoop *= numVgprMultiplierMXSB - - # do not allocate local read address register if DirectToVgpr is enabled - if kernel["DirectToVgprA"]: - self.states.a.numVgprLocalReadAddr = 0 - if kernel["ProblemType"]["MXBlockA"] and kernel["DirectToVgprMXSA"]: - self.states.mxsa.numVgprLocalReadAddr = 0 - self.states.mxsa.numVgprLocalWriteAddr = 0 - if kernel["DirectToVgprB"]: - self.states.b.numVgprLocalReadAddr = 0 - if kernel["ProblemType"]["MXBlockB"] and kernel["DirectToVgprMXSB"]: - self.states.mxsb.numVgprLocalReadAddr = 0 - self.states.mxsb.numVgprLocalWriteAddr = 0 - if not (kernel["ProblemType"]["Sparse"] and not kernel["DirectToVgprSparseMetadata"]): - self.states.m.numVgprLocalWriteAddr = 0 - - if kernel["ProblemType"]["MXBlockA"] and kernel["DirectToVgprMXSA"]: - self.states.mxsa.numVgprLocalReadAddr = 0 - if kernel["ProblemType"]["MXBlockB"] and kernel["DirectToVgprMXSB"]: - self.states.mxsb.numVgprLocalReadAddr = 0 - - - # do not allocate local write address register if DirectToVgpr is enabled - if kernel["DirectToVgprA"] or kernel["DirectToLdsA"]: - self.states.a.numVgprLocalWriteAddr = 0 - if kernel["DirectToVgprB"] or 
kernel["DirectToLdsB"]: - self.states.b.numVgprLocalWriteAddr = 0 - - if kernel["ProblemType"]["MXBlockA"] and (kernel["DirectToVgprMXSA"] or kernel["DirectToLdsMXSA"]): - self.states.mxsa.numVgprLocalWriteAddr = 0 - if kernel["ProblemType"]["MXBlockB"] and (kernel["DirectToVgprMXSB"] or kernel["DirectToLdsMXSB"]): - self.states.mxsb.numVgprLocalWriteAddr = 0 - - if kernel["StoreSwapAddr"]: - if self.states.a.numVgprLocalReadAddr > 0: - self.states.a.numVgprLocalReadSwapAddr = 1 - if self.states.b.numVgprLocalReadAddr > 0: - self.states.b.numVgprLocalReadSwapAddr = 1 - if self.states.m.numVgprLocalReadAddr > 0: - self.states.m.numVgprLocalReadSwapAddr = 1 - if kernel["ProblemType"]["MXBlockA"] and (self.states.mxsa.numVgprLocalReadAddr > 0): - self.states.mxsa.numVgprLocalReadSwapAddr = 1 - if kernel["ProblemType"]["MXBlockB"] and (self.states.mxsb.numVgprLocalReadAddr > 0): - self.states.mxsb.numVgprLocalReadSwapAddr = 1 - - if not kernel["LocalWriteUseSgprA"] and (self.states.a.numVgprLocalWriteAddr > 0): - self.states.a.numVgprLocalWriteSwapAddr = 1 - if not kernel["LocalWriteUseSgprB"] and (self.states.b.numVgprLocalWriteAddr > 0): - self.states.b.numVgprLocalWriteSwapAddr = 1 - if kernel["ProblemType"]["Sparse"] and (not kernel["LocalWriteUseSgprMetadata"]) and (self.states.m.numVgprLocalWriteAddr > 0): - self.states.m.numVgprLocalWriteSwapAddr = 1 - if kernel["ProblemType"]["MXBlockA"] and (not kernel["LocalWriteUseSgprMXSA"]) and (self.states.mxsa.numVgprLocalWriteAddr > 0): - self.states.mxsa.numVgprLocalWriteSwapAddr = 1 - if kernel["ProblemType"]["MXBlockB"] and (not kernel["LocalWriteUseSgprMXSB"]) and (self.states.mxsb.numVgprLocalWriteAddr > 0): - self.states.mxsb.numVgprLocalWriteSwapAddr = 1 - - #################################### - # num vgprs: global read addresses A - numGlobalReadsA = kernel["NumLoadsCoalescedA"] \ - * kernel["NumLoadsPerpendicularA"] * kernel["GlobalReadVectorWidthA"] - numGlobalReadInstructionsA = int(numGlobalReadsA * 
tensorParametersA["bpeGR"])//\ - (tensorParametersA["globalReadInstruction"].blockWidth * 4) - - if kernel["enableTDMA"]: - self.states.a.numVgprGlobalReadOffsets = 0 - elif kernel["BufferLoad"]: - self.states.a.numVgprGlobalReadOffsets = roundUp(numGlobalReadInstructionsA * self.states.rpgo) - else: - numVgprGlobalReadAddressesA = numGlobalReadInstructionsA * self.states.rpga + if kernel["ProblemType"]["MXBlockA"]: + self.states.mxsa.numVgprLocalReadAddr *= numVgprMultiplierMXSA + self.states.mxsa.numVgprLocalWriteAddr *= numVgprMultiplierMXSA + self.states.mxsa.numVgprLocalWriteAddrTailLoop *= numVgprMultiplierMXSA - if self.states.globalReadIncsUseVgpr: - numVgprGlobalReadIncsA = kernel["ProblemType"]["NumIndicesSummation"] \ - * self.states.rpga - else: - numVgprGlobalReadIncsA = 0 + if kernel["ProblemType"]["MXBlockB"]: + self.states.mxsb.numVgprLocalReadAddr *= numVgprMultiplierMXSB + self.states.mxsb.numVgprLocalWriteAddr *= numVgprMultiplierMXSB + self.states.mxsb.numVgprLocalWriteAddrTailLoop *= numVgprMultiplierMXSB + + # do not allocate local read address register if DirectToVgpr is enabled + if kernel["DirectToVgprA"]: + self.states.a.numVgprLocalReadAddr = 0 + if kernel["ProblemType"]["MXBlockA"] and kernel["DirectToVgprMXSA"]: + self.states.mxsa.numVgprLocalReadAddr = 0 + self.states.mxsa.numVgprLocalWriteAddr = 0 + if kernel["DirectToVgprB"]: + self.states.b.numVgprLocalReadAddr = 0 + if kernel["ProblemType"]["MXBlockB"] and kernel["DirectToVgprMXSB"]: + self.states.mxsb.numVgprLocalReadAddr = 0 + self.states.mxsb.numVgprLocalWriteAddr = 0 + if not (kernel["ProblemType"]["Sparse"] and not kernel["DirectToVgprSparseMetadata"]): + self.states.m.numVgprLocalWriteAddr = 0 + + if kernel["ProblemType"]["MXBlockA"] and kernel["DirectToVgprMXSA"]: + self.states.mxsa.numVgprLocalReadAddr = 0 + if kernel["ProblemType"]["MXBlockB"] and kernel["DirectToVgprMXSB"]: + self.states.mxsb.numVgprLocalReadAddr = 0 + + + # do not allocate local write address register if 
DirectToVgpr is enabled + if kernel["DirectToVgprA"] or kernel["DirectToLdsA"]: + self.states.a.numVgprLocalWriteAddr = 0 + if kernel["DirectToVgprB"] or kernel["DirectToLdsB"]: + self.states.b.numVgprLocalWriteAddr = 0 + + if kernel["ProblemType"]["MXBlockA"] and (kernel["DirectToVgprMXSA"] or kernel["DirectToLdsMXSA"]): + self.states.mxsa.numVgprLocalWriteAddr = 0 + if kernel["ProblemType"]["MXBlockB"] and (kernel["DirectToVgprMXSB"] or kernel["DirectToLdsMXSB"]): + self.states.mxsb.numVgprLocalWriteAddr = 0 + + if kernel["StoreSwapAddr"]: + if self.states.a.numVgprLocalReadAddr > 0: + self.states.a.numVgprLocalReadSwapAddr = 1 + if self.states.b.numVgprLocalReadAddr > 0: + self.states.b.numVgprLocalReadSwapAddr = 1 + if self.states.m.numVgprLocalReadAddr > 0: + self.states.m.numVgprLocalReadSwapAddr = 1 + if kernel["ProblemType"]["MXBlockA"] and (self.states.mxsa.numVgprLocalReadAddr > 0): + self.states.mxsa.numVgprLocalReadSwapAddr = 1 + if kernel["ProblemType"]["MXBlockB"] and (self.states.mxsb.numVgprLocalReadAddr > 0): + self.states.mxsb.numVgprLocalReadSwapAddr = 1 + + if not kernel["LocalWriteUseSgprA"] and (self.states.a.numVgprLocalWriteAddr > 0): + self.states.a.numVgprLocalWriteSwapAddr = 1 + if not kernel["LocalWriteUseSgprB"] and (self.states.b.numVgprLocalWriteAddr > 0): + self.states.b.numVgprLocalWriteSwapAddr = 1 + if kernel["ProblemType"]["Sparse"] and (not kernel["LocalWriteUseSgprMetadata"]) and (self.states.m.numVgprLocalWriteAddr > 0): + self.states.m.numVgprLocalWriteSwapAddr = 1 + if kernel["ProblemType"]["MXBlockA"] and (not kernel["LocalWriteUseSgprMXSA"]) and (self.states.mxsa.numVgprLocalWriteAddr > 0): + self.states.mxsa.numVgprLocalWriteSwapAddr = 1 + if kernel["ProblemType"]["MXBlockB"] and (not kernel["LocalWriteUseSgprMXSB"]) and (self.states.mxsb.numVgprLocalWriteAddr > 0): + self.states.mxsb.numVgprLocalWriteSwapAddr = 1 - # num vgprs: global read addresses MXSA - if kernel["ProblemType"]["MXBlockA"]: - numGlobalReadsMXSA = 
kernel["NumLoadsCoalescedMXSA"] \ - * kernel["NumLoadsPerpendicularMXSA"] * kernel["GlobalReadVectorWidthMXSA"] - numGlobalReadInstructionsMXSA = int(numGlobalReadsMXSA / \ - (tensorParametersMXSA["globalReadInstruction"].blockWidth * 4)) + #################################### + # num vgprs: global read addresses A + numGlobalReadsA = kernel["NumLoadsCoalescedA"] \ + * kernel["NumLoadsPerpendicularA"] * kernel["GlobalReadVectorWidthA"] + numGlobalReadInstructionsA = int(numGlobalReadsA * tensorParametersA["bpeGR"])//\ + (tensorParametersA["globalReadInstruction"].blockWidth * 4) if kernel["enableTDMA"]: - self.states.mxsa.numVgprGlobalReadOffsets = 0 + self.states.a.numVgprGlobalReadOffsets = 0 elif kernel["BufferLoad"]: - self.states.mxsa.numVgprGlobalReadOffsets = roundUp(numGlobalReadInstructionsMXSA * self.states.rpgo) + self.states.a.numVgprGlobalReadOffsets = roundUp(numGlobalReadInstructionsA * self.states.rpgo) else: - numVgprGlobalReadAddressesMXSA = numGlobalReadInstructionsMXSA * self.states.rpga + numVgprGlobalReadAddressesA = numGlobalReadInstructionsA * self.states.rpga if self.states.globalReadIncsUseVgpr: - numVgprGlobalReadIncsMXSA = kernel["ProblemType"]["NumIndicesSummation"] \ + numVgprGlobalReadIncsA = kernel["ProblemType"]["NumIndicesSummation"] \ * self.states.rpga else: - numVgprGlobalReadIncsMXSA = 0 - - # num vgprs: global read addresses B - numGlobalReadsB = kernel["NumLoadsCoalescedB"] \ - * kernel["NumLoadsPerpendicularB"] * kernel["GlobalReadVectorWidthB"] - numGlobalReadInstructionsB = int(numGlobalReadsB * tensorParametersB["bpeGR"])// \ - (tensorParametersB["globalReadInstruction"].blockWidth * 4) - - if kernel["enableTDMB"]: - self.states.b.numVgprGlobalReadOffsets = 0 - elif kernel["BufferLoad"]: - self.states.b.numVgprGlobalReadOffsets = roundUp(numGlobalReadInstructionsB * self.states.rpgo) - else: - numVgprGlobalReadAddressesB = numGlobalReadInstructionsB * self.states.rpga + numVgprGlobalReadIncsA = 0 - if 
self.states.globalReadIncsUseVgpr: - numVgprGlobalReadIncsB = kernel["ProblemType"]["NumIndicesSummation"] \ - * self.states.rpga - else: - numVgprGlobalReadIncsB = 0 + # num vgprs: global read addresses MXSA + if kernel["ProblemType"]["MXBlockA"]: + numGlobalReadsMXSA = kernel["NumLoadsCoalescedMXSA"] \ + * kernel["NumLoadsPerpendicularMXSA"] * kernel["GlobalReadVectorWidthMXSA"] + numGlobalReadInstructionsMXSA = int(numGlobalReadsMXSA / \ + (tensorParametersMXSA["globalReadInstruction"].blockWidth * 4)) - if kernel["ProblemType"]["MXBlockB"]: - numGlobalReadsMXSB = kernel["NumLoadsCoalescedMXSB"] \ - * kernel["NumLoadsPerpendicularMXSB"] * kernel["GlobalReadVectorWidthMXSB"] - numGlobalReadInstructionsMXSB = int(numGlobalReadsMXSB / \ - (tensorParametersMXSB["globalReadInstruction"].blockWidth * 4)) + if kernel["enableTDMA"]: + self.states.mxsa.numVgprGlobalReadOffsets = 0 + elif kernel["BufferLoad"]: + self.states.mxsa.numVgprGlobalReadOffsets = roundUp(numGlobalReadInstructionsMXSA * self.states.rpgo) + else: + numVgprGlobalReadAddressesMXSA = numGlobalReadInstructionsMXSA * self.states.rpga + + if self.states.globalReadIncsUseVgpr: + numVgprGlobalReadIncsMXSA = kernel["ProblemType"]["NumIndicesSummation"] \ + * self.states.rpga + else: + numVgprGlobalReadIncsMXSA = 0 + + # num vgprs: global read addresses B + numGlobalReadsB = kernel["NumLoadsCoalescedB"] \ + * kernel["NumLoadsPerpendicularB"] * kernel["GlobalReadVectorWidthB"] + numGlobalReadInstructionsB = int(numGlobalReadsB * tensorParametersB["bpeGR"])// \ + (tensorParametersB["globalReadInstruction"].blockWidth * 4) if kernel["enableTDMB"]: - self.states.mxsb.numVgprGlobalReadOffsets = 0 + self.states.b.numVgprGlobalReadOffsets = 0 elif kernel["BufferLoad"]: - self.states.mxsb.numVgprGlobalReadOffsets = roundUp(numGlobalReadInstructionsMXSB * self.states.rpgo) + self.states.b.numVgprGlobalReadOffsets = roundUp(numGlobalReadInstructionsB * self.states.rpgo) else: - numVgprGlobalReadAddressesMXSB = 
numGlobalReadInstructionsMXSB * self.states.rpga + numVgprGlobalReadAddressesB = numGlobalReadInstructionsB * self.states.rpga if self.states.globalReadIncsUseVgpr: - numVgprGlobalReadIncsMXSB = kernel["ProblemType"]["NumIndicesSummation"] \ + numVgprGlobalReadIncsB = kernel["ProblemType"]["NumIndicesSummation"] \ * self.states.rpga else: - numVgprGlobalReadIncsMXSB = 0 + numVgprGlobalReadIncsB = 0 - # num vgprs: global read addresses M - if tensorParametersM is not None: - numGlobalReadsMetadata = kernel["NumLoadsCoalescedMetadata"] \ - * kernel["NumLoadsPerpendicularMetadata"] * kernel["GlobalReadVectorWidthMetadata"] - numGlobalReadInstructionsMetadata = int(numGlobalReadsMetadata * tensorParametersM["bpe"])//\ - (tensorParametersM["globalReadInstruction"].blockWidth * 4) - if kernel["BufferLoad"]: - self.states.m.numVgprGlobalReadOffsets = roundUp(numGlobalReadInstructionsMetadata * self.states.rpgo) - if self.states.globalReadIncsUseVgpr: - numVgprGlobalReadIncsMetadata = kernel["ProblemType"]["NumIndicesSummation"] \ - * self.states.rpga - else: - numVgprGlobalReadIncsMetadata = 0 + if kernel["ProblemType"]["MXBlockB"]: + numGlobalReadsMXSB = kernel["NumLoadsCoalescedMXSB"] \ + * kernel["NumLoadsPerpendicularMXSB"] * kernel["GlobalReadVectorWidthMXSB"] + numGlobalReadInstructionsMXSB = int(numGlobalReadsMXSB / \ + (tensorParametersMXSB["globalReadInstruction"].blockWidth * 4)) + + if kernel["enableTDMB"]: + self.states.mxsb.numVgprGlobalReadOffsets = 0 + elif kernel["BufferLoad"]: + self.states.mxsb.numVgprGlobalReadOffsets = roundUp(numGlobalReadInstructionsMXSB * self.states.rpgo) + else: + numVgprGlobalReadAddressesMXSB = numGlobalReadInstructionsMXSB * self.states.rpga + + if self.states.globalReadIncsUseVgpr: + numVgprGlobalReadIncsMXSB = kernel["ProblemType"]["NumIndicesSummation"] \ + * self.states.rpga + else: + numVgprGlobalReadIncsMXSB = 0 + + # num vgprs: global read addresses M + if tensorParametersM is not None: + numGlobalReadsMetadata = 
kernel["NumLoadsCoalescedMetadata"] \ + * kernel["NumLoadsPerpendicularMetadata"] * kernel["GlobalReadVectorWidthMetadata"] + numGlobalReadInstructionsMetadata = int(numGlobalReadsMetadata * tensorParametersM["bpe"])//\ + (tensorParametersM["globalReadInstruction"].blockWidth * 4) + if kernel["BufferLoad"]: + self.states.m.numVgprGlobalReadOffsets = roundUp(numGlobalReadInstructionsMetadata * self.states.rpgo) + if self.states.globalReadIncsUseVgpr: + numVgprGlobalReadIncsMetadata = kernel["ProblemType"]["NumIndicesSummation"] \ + * self.states.rpga + else: + numVgprGlobalReadIncsMetadata = 0 - def GNLCOInit(tc): - abmatrixinfo = self.states.a - if tc == 'A': + def GNLCOInit(tc): abmatrixinfo = self.states.a - elif tc == 'B': - abmatrixinfo = self.states.b - elif tc == 'MXSA': - abmatrixinfo = self.states.mxsa - elif tc == 'MXSB': - abmatrixinfo = self.states.mxsb - elif tc == 'Metadata': - abmatrixinfo = self.states.m - - if (tc in ("A", "B")) and kernel["DirectToLds%s"%tc] and kernel["UseGeneralizedNLCOne%s"%tc]: - isMixedPrec = (kernel["ProblemType"]["DataTypeA"].numBytes() != kernel["ProblemType"]["DataTypeB"].numBytes()) - lrvw = kernel["LocalReadVectorWidth"] - grvw = kernel["GlobalReadVectorWidth%c"%tc] - bpe = kernel["ProblemType"]["DataType%s"%tc].numBytes() - LdsStride = kernel["VectorWidth%s"%tc] * bpe * kernel["DepthU"] - MinLdsBlockSizePerPad = (kernel[f"GlobalReadVectorWidth%s"%tc] * bpe) * kernel["WavefrontSize"] - isM0PadEnough = LdsStride >= MinLdsBlockSizePerPad - - # Currently only supported for 16b, DTL, TLU=0 and grvw == lrvw - if kernel["ProblemType"]["DataType"].numBytes() == 2 and not isMixedPrec \ - and kernel["ProblemType"]["TLU%s"%tc] == 0 and lrvw == grvw and \ - not isM0PadEnough: - abmatrixinfo.gRDtlSwizzlePerpBlockSize = kernel["VectorWidth%s"%tc] - abmatrixinfo.gRDtlSwizzleParaBlockSize = kernel["MatrixInstK"] // (kernel["LocalReadVectorWidth"]) + if tc == 'A': + abmatrixinfo = self.states.a + elif tc == 'B': + abmatrixinfo = 
self.states.b + elif tc == 'MXSA': + abmatrixinfo = self.states.mxsa + elif tc == 'MXSB': + abmatrixinfo = self.states.mxsb + elif tc == 'Metadata': + abmatrixinfo = self.states.m + + if (tc in ("A", "B")) and kernel["DirectToLds%s"%tc] and kernel["UseGeneralizedNLCOne%s"%tc]: + isMixedPrec = (kernel["ProblemType"]["DataTypeA"].numBytes() != kernel["ProblemType"]["DataTypeB"].numBytes()) + lrvw = kernel["LocalReadVectorWidth"] + grvw = kernel["GlobalReadVectorWidth%c"%tc] + bpe = kernel["ProblemType"]["DataType%s"%tc].numBytes() + LdsStride = kernel["VectorWidth%s"%tc] * bpe * kernel["DepthU"] + MinLdsBlockSizePerPad = (kernel[f"GlobalReadVectorWidth%s"%tc] * bpe) * kernel["WavefrontSize"] + isM0PadEnough = LdsStride >= MinLdsBlockSizePerPad + + # Currently only supported for 16b, DTL, TLU=0 and grvw == lrvw + if kernel["ProblemType"]["DataType"].numBytes() == 2 and not isMixedPrec \ + and kernel["ProblemType"]["TLU%s"%tc] == 0 and lrvw == grvw and \ + not isM0PadEnough: + abmatrixinfo.gRDtlSwizzlePerpBlockSize = kernel["VectorWidth%s"%tc] + abmatrixinfo.gRDtlSwizzleParaBlockSize = kernel["MatrixInstK"] // (kernel["LocalReadVectorWidth"]) + else: + abmatrixinfo.gRDtlSwizzlePerpBlockSize = 0 + abmatrixinfo.gRDtlSwizzleParaBlockSize = 0 + + ntpl = kernel["NumTotalPackedLoads%s"%tc] + if kernel["ProblemType"]["TLU%s"%tc] == 1 and not kernel["enableLDSTr%s"%tc]: + usePerpPerm = False + elif kernel["ProblemType"]["TLU%s"%tc] == 1 and kernel["enableLDSTr%s"%tc]: + usePerpPerm = (ntpl & (ntpl-1)) == 0 + else: + # Currently only VW=1,2 is supported due to how the local read offset + # is currently computed. Supporting VW=1,2 only required small modifications + # to the offset calc. 
+ # TODO: Add support for VW=4,8, this will require more changes in LR offset + # calculations + usePerpPerm = False if kernel["VectorWidth%s"%tc] > 2 or kernel["ProblemType"]["DataType"].numBytes() == 2 else True + + permBlock = kernel["MatrixInstK"] if kernel["ProblemType"]["TLU%s"%tc] == 1 \ + else kernel["VectorWidth%s"%tc] * kernel["MatrixInstM"] + abmatrixinfo.gNLCPermBlock = permBlock + abmatrixinfo.gNLCPerpStride = min([8, 2**int(math.log(ntpl, 2)), permBlock]) if usePerpPerm else 1 else: + abmatrixinfo.gNLCPerpStride = 1 + abmatrixinfo.gNLCPermBlock = 1 abmatrixinfo.gRDtlSwizzlePerpBlockSize = 0 abmatrixinfo.gRDtlSwizzleParaBlockSize = 0 - ntpl = kernel["NumTotalPackedLoads%s"%tc] - if kernel["ProblemType"]["TLU%s"%tc] == 1 and not kernel["enableLDSTr%s"%tc]: - usePerpPerm = False - elif kernel["ProblemType"]["TLU%s"%tc] == 1 and kernel["enableLDSTr%s"%tc]: - usePerpPerm = (ntpl & (ntpl-1)) == 0 + GNLCOInit('A') + GNLCOInit('B') + GNLCOInit('MXSA') + GNLCOInit('MXSB') + GNLCOInit('Metadata') + + numVgprAddressDbg = self.states.rpga if self.debugConfig.debugKernel else 0 + + #################################### + # num vgprs: c write address + # 1 address where to write first value + # 1 tmp address where to write current value + + #################################### + # VGPR Assignment + #################################### + vgprIdx = 0 + + if bool(kernel["ProblemType"]["MXBlockA"]) ^ bool(kernel["ProblemType"]["MXBlockB"]): + self.states.startMXDummyValuVgpr = vgprIdx + vgprIdx += 2 + + # TODO: alignment hack, figure out a better solution + if kernel["ProblemType"]["MXBlockA"]: + self.states.mxsa.startVgprValu = vgprIdx + vgprIdx += self.states.mxsa.numVgprValu + numVgprValuPackMXSA = 0 + if not kernel["UnrollMajorLDSMXSA"]: + self.states.mxsa.startVgprValuPack = vgprIdx + if self.states.lrvwTileMXSA > 1: + numVgprValuPackMXSA = ceil(kernel["VectorWidthMXSA"] / self.states.bpr) * kernel["MIWaveTileMXSA"] // kernel["VectorWidthMXSA"] * 
kernel["InnerUnroll"] * self.states.numVgprBuffer * kernel["MIInputPerThreadMXSA"] + if self.states.packDTVA: + # pack DTV case, double the number + numVgprValuPackMXSA *= 2 + else: + numVgprValuPackMXSA = self.states.mxsa.numVgprValuPerBlock * kernel["InnerUnroll"] * self.states.numVgprBufferPackMXSA * int(4 - 1) + vgprIdx += numVgprValuPackMXSA + self.states.mxsa.startVgprG2L = None + if not kernel["DirectToLdsMXSA"] or self.do["KeepDirectToLdsAlloc"]: + # DirectToVgpr + pack or input conversion case, overlap G2L and ValuPack + if self.states.packDTVA: + self.states.mxsa.startVgprG2L = self.states.mxsa.startVgprValuPack + elif self.states.convDTVA: + self.states.mxsa.startVgprG2L = self.states.mxsa.startVgprValu + # if PGR = True, PAP could be possibly enabled, we move G2LA later to prevent it from being reclaimed + # otherwise, put G2L here since it can overlap valu + if (not kernel["PrefetchGlobalRead"]): # g2l can overlap valu + self.states.mxsa.startVgprG2L = self.states.mxsa.startVgprValu + vgprIdx = self.states.mxsa.startVgprValu \ + + max(self.states.mxsa.numVgprValu + numVgprValuPackMXSA, self.states.mxsa.numVgprG2LAllocated) + + if kernel["ProblemType"]["MXBlockB"]: + # TODO: alignment hack, figure out a better solution + if(self.states.archCaps["VgprBank"]): + residual = (vgprIdx % 4) + if (residual % 2) == 0: + # if 2-aligned bank(bank0 and bank2), move to bank1 or bank3. + vgprIdx += 1 + if kernel["ISA"][:2] == (12, 5): + vgprIdx = ((vgprIdx+1)//2)*2 else: - # Currently only VW=1,2 is supported due to how the local read offset - # is currently computed. Supporting VW=1,2 only required small modifications - # to the offset calc. 
- # TODO: Add support for VW=4,8, this will require more changes in LR offset - # calculations - usePerpPerm = False if kernel["VectorWidth%s"%tc] > 2 or kernel["ProblemType"]["DataType"].numBytes() == 2 else True - - permBlock = kernel["MatrixInstK"] if kernel["ProblemType"]["TLU%s"%tc] == 1 \ - else kernel["VectorWidth%s"%tc] * kernel["MatrixInstM"] - abmatrixinfo.gNLCPermBlock = permBlock - abmatrixinfo.gNLCPerpStride = min([8, 2**int(math.log(ntpl, 2)), permBlock]) if usePerpPerm else 1 + vgprIdx = ((vgprIdx+1)//2)*2 + + self.states.mxsb.startVgprValu = vgprIdx + vgprIdx += self.states.mxsb.numVgprValu + numVgprValuPackMXSB = 0 + if not kernel["UnrollMajorLDSMXSB"]: + self.states.mxsb.startVgprValuPack = vgprIdx + if self.states.lrvwTileMXSB > 1: + numVgprValuPackMXSB = ceil(kernel["VectorWidthMXSB"] / self.states.bpr) * kernel["MIWaveTileMXSB"] // kernel["VectorWidthMXSB"] * kernel["InnerUnroll"] * self.states.numVgprBuffer * kernel["MIInputPerThreadMXSB"] + if self.states.packDTVB: + # pack DTV case, double the number + numVgprValuPackMXSB *= 2 + else: + numVgprValuPackMXSB = self.states.mxsb.numVgprValuPerBlock * kernel["InnerUnroll"] * self.states.numVgprBufferPackMXSB * int(4 - 1) + vgprIdx += numVgprValuPackMXSB + self.states.mxsb.startVgprG2L = None + if not kernel["DirectToLdsMXSB"] or self.do["KeepDirectToLdsAlloc"]: + # DirectToVgpr + pack or input conversion case, overlap G2L and ValuPack + if self.states.packDTVB: + self.states.mxsb.startVgprG2L = self.states.mxsb.startVgprValuPack + elif self.states.convDTVB: + self.states.mxsb.startVgprG2L = self.states.mxsb.startVgprValu + # if PGR = True, PAP could be possibly enabled, we move G2LA later to prevent it from being reclaimed + # otherwise, put G2L here since it can overlap valu + if (not kernel["PrefetchGlobalRead"]): # g2l can overlap valu + self.states.mxsb.startVgprG2L = self.states.mxsb.startVgprValu + vgprIdx = self.states.mxsb.startVgprValu \ + + max(self.states.mxsb.numVgprValu + 
numVgprValuPackMXSB, self.states.mxsb.numVgprG2LAllocated) + + if kernel["ProblemType"]["MXBlockA"]: + if self.states.mxsa.startVgprG2L is None and self.states.mxsa.numVgprG2LAllocated > 0: + # TODO: alignment hack, figure out a better solution + vgprIdx = ((vgprIdx+1)//2)*2 + self.states.mxsa.startVgprG2L = vgprIdx; + if ("ULSGRODoubleG2L" in kernel) and kernel["ULSGRODoubleG2L"] == 1: + vgprIdx += self.states.mxsa.numVgprG2LAllocated * 2 + else: + vgprIdx += self.states.mxsa.numVgprG2LAllocated + + if kernel["ProblemType"]["MXBlockB"]: + if self.states.mxsb.startVgprG2L is None and self.states.mxsb.numVgprG2LAllocated > 0: + # TODO: alignment hack, figure out a better solution + vgprIdx = ((vgprIdx+1)//2)*2 + self.states.mxsb.startVgprG2L = vgprIdx; + if ("ULSGRODoubleG2L" in kernel) and kernel["ULSGRODoubleG2L"] == 1: + vgprIdx += self.states.mxsb.numVgprG2LAllocated * 2 + else: + vgprIdx += self.states.mxsb.numVgprG2LAllocated + + vgprIdx = (vgprIdx+1)//2*2 + self.states.lastValuMXSAB = vgprIdx + + self.states.totalAgprs = 0 + self.states.totalMixedAgprs = 0 + self.states.maxLimitAgprs = self.states.regCaps["PhysicalMaxVgpr"] - self.states.regCaps["MaxVgpr"] + self.states.c.startVgprValu = vgprIdx; vgprIdx += self.states.c.numVgprValu + + if kernel["EnableMatrixInstruction"]: + # MI kernels can overlap C-tile w/ AB-tile up until writeback. 
Illustrated below: + # |<-------------- valuC -------------->|<-->| + # |------------|-----------|------------|----| + # lastValuAB ^ ^ ^ ^ (ValuA, ValuB) + # lastVgprForReads ^ ^ ^ (localWriteAddr, globalReadOffser, G2L, localReadAddr) + # lastValuC ^ ^ (ValuC) + # vgprForStore ^ (other vgpr used in store section) + self.states.serializedStore = True + + ######################################## + # AGPR Allocation + ######################################## + if not kernel["MIArchVgpr"]: + self.states.totalAgprs = self.states.c.numVgprValu + if self.states.totalAgprs > self.states.maxLimitAgprs: + self.states.totalMixedAgprs = self.states.totalAgprs - self.states.maxLimitAgprs + self.states.totalAgprs = self.states.maxLimitAgprs + vgprIdx = self.states.c.startVgprValu + self.states.totalMixedAgprs + self.states.c.numVgprValu = self.states.totalMixedAgprs + + #---------------------------------- + # Move to the front and bypass to tail loop + self.states.startVgprMisc = vgprIdx + + # BufferLoad: + # Uses a resource descriptor (SRD) which is stored in 4 SGPRs and thus shared by all work-items. + # Each work-item also uses a unique 32-bit offset into vgprGlobalReadOffset. These offsets are set when + # the tile is initialized and stay constant through the execution of the kernel. + # The base address in the SRD is updated when the algorithm moves to a new tile + # BufferLoad disables the gptGlobalReadAddr used in flat addressing. 
+ if kernel["BufferLoad"]: + self.startVgprGlobalReadOffsetA = vgprIdx + vgprIdx += 1 if kernel["_UseSgprForGRO"] else self.states.a.numVgprGlobalReadOffsets + self.startVgprGlobalReadOffsetB = vgprIdx + vgprIdx += 1 if kernel["_UseSgprForGRO"] else self.states.b.numVgprGlobalReadOffsets + if kernel["ProblemType"]["MXBlockA"]: + self.startVgprGlobalReadOffsetMXSA = vgprIdx + vgprIdx += 1 if kernel["_UseSgprForGRO"] else self.states.mxsa.numVgprGlobalReadOffsets + if kernel["ProblemType"]["MXBlockB"]: + self.startVgprGlobalReadOffsetMXSB = vgprIdx + vgprIdx += 1 if kernel["_UseSgprForGRO"] else self.states.mxsb.numVgprGlobalReadOffsets + if kernel["ProblemType"]["Sparse"]: + self.startVgprGlobalReadOffsetMetadata = vgprIdx + if kernel["DirectToVgprSparseMetadata"]: + miWaveTile = kernel["MIWaveTileB"] if kernel["ProblemType"]["Sparse"] == 2 else kernel["MIWaveTileA"] + vgprIdx += miWaveTile + else: + vgprIdx += 1 if kernel["_UseSgprForGRO"] else self.states.m.numVgprGlobalReadOffsets else: - abmatrixinfo.gNLCPerpStride = 1 - abmatrixinfo.gNLCPermBlock = 1 - abmatrixinfo.gRDtlSwizzlePerpBlockSize = 0 - abmatrixinfo.gRDtlSwizzleParaBlockSize = 0 + # TODO: alignment hack, figure out a better solution + vgprIdx = ((vgprIdx+1)//2)*2 + self.startVgprGlobalReadAddressesA = vgprIdx + vgprIdx += numVgprGlobalReadAddressesA + self.startVgprGlobalReadAddressesB = vgprIdx + vgprIdx += numVgprGlobalReadAddressesB + if kernel["ProblemType"]["MXBlockA"]: + self.startVgprGlobalReadAddressesMXSA = vgprIdx + vgprIdx += numVgprGlobalReadAddressesMXSA + if kernel["ProblemType"]["MXBlockB"]: + self.startVgprGlobalReadAddressesMXSB = vgprIdx + vgprIdx += numVgprGlobalReadAddressesMXSB - GNLCOInit('A') - GNLCOInit('B') - GNLCOInit('MXSA') - GNLCOInit('MXSB') - GNLCOInit('Metadata') + if not kernel["LocalWriteUseSgprA"]: + self.states.a.startVgprLocalWriteAddr = vgprIdx + vgprIdx += self.states.a.numVgprLocalWriteAddr - numVgprAddressDbg = self.states.rpga if self.debugConfig.debugKernel 
else 0 + if not kernel["LocalWriteUseSgprB"]: + self.states.b.startVgprLocalWriteAddr = vgprIdx + vgprIdx += self.states.b.numVgprLocalWriteAddr - #################################### - # num vgprs: c write address - # 1 address where to write first value - # 1 tmp address where to write current value + if kernel["ProblemType"]["MXBlockA"]: + if not kernel["LocalWriteUseSgprMXSA"]: + self.states.mxsa.startVgprLocalWriteAddr = vgprIdx + vgprIdx += self.states.mxsa.numVgprLocalWriteAddr - #################################### - # VGPR Assignment - #################################### - vgprIdx = 0 + if kernel["ProblemType"]["MXBlockB"]: + if not kernel["LocalWriteUseSgprMXSB"]: + self.states.mxsb.startVgprLocalWriteAddr = vgprIdx + vgprIdx += self.states.mxsb.numVgprLocalWriteAddr + + if kernel["ProblemType"]["Sparse"] and not kernel["DirectToVgprSparseMetadata"]: + if self.states.combineLocalAddresses: + self.states.m.startVgprLocalWriteAddr = self.states.m.startVgprLocalReadAddr + else: + self.states.m.startVgprLocalWriteAddr = vgprIdx + vgprIdx += self.states.m.numVgprLocalWriteAddr - if bool(kernel["ProblemType"]["MXBlockA"]) ^ bool(kernel["ProblemType"]["MXBlockB"]): - self.states.startMXDummyValuVgpr = vgprIdx - vgprIdx += 2 + self.startVgprGlobalReadIncsA = vgprIdx + vgprIdx += numVgprGlobalReadIncsA + self.startVgprGlobalReadIncsB = vgprIdx + vgprIdx += numVgprGlobalReadIncsB + if kernel["ProblemType"]["MXBlockA"]: + self.startVgprGlobalReadIncsMXSA = vgprIdx + vgprIdx += numVgprGlobalReadIncsMXSA + if kernel["ProblemType"]["MXBlockB"]: + self.startVgprGlobalReadIncsMXSB = vgprIdx + vgprIdx += numVgprGlobalReadIncsMXSB + if kernel["ProblemType"]["Sparse"] and not kernel["DirectToVgprSparseMetadata"]: + self.startVgprGlobalReadIncsMetadata = vgprIdx + vgprIdx += numVgprGlobalReadIncsMetadata - # TODO: alignment hack, figure out a better solution - if kernel["ProblemType"]["MXBlockA"]: - self.states.mxsa.startVgprValu = vgprIdx - vgprIdx += 
self.states.mxsa.numVgprValu - numVgprValuPackMXSA = 0 - if not kernel["UnrollMajorLDSMXSA"]: - self.states.mxsa.startVgprValuPack = vgprIdx - if self.states.lrvwTileMXSA > 1: - numVgprValuPackMXSA = ceil(kernel["VectorWidthMXSA"] / self.states.bpr) * kernel["MIWaveTileMXSA"] // kernel["VectorWidthMXSA"] * kernel["InnerUnroll"] * self.states.numVgprBuffer * kernel["MIInputPerThreadMXSA"] + if kernel["ProblemType"]["Sparse"] and not kernel["DirectToVgprSparseMetadata"]: + self.states.m.startVgprLocalReadAddr = vgprIdx + vgprIdx += self.states.m.numVgprLocalReadAddr + if kernel["ProblemType"]["MXBlockA"]: + self.states.mxsa.startVgprLocalReadAddr = vgprIdx + vgprIdx += self.states.mxsa.numVgprLocalReadAddr + if kernel["ProblemType"]["MXBlockB"]: + self.states.mxsb.startVgprLocalReadAddr = vgprIdx + vgprIdx += self.states.mxsb.numVgprLocalReadAddr + self.states.a.startVgprLocalReadAddr = vgprIdx + vgprIdx += self.states.a.numVgprLocalReadAddr + self.states.b.startVgprLocalReadAddr = vgprIdx + vgprIdx += self.states.b.numVgprLocalReadAddr + + # ---------------------------- + # TODO: alignment hack, figure out a better solution + boolMoveLocalReadAddrA = False + boolMoveLocalReadAddrB = False + if (vgprIdx % 2) == 1: + if (self.states.a.numVgprLocalReadAddr % 2) == 1: + boolMoveLocalReadAddrA = True + self.states.b.startVgprLocalReadAddr -= self.states.a.numVgprLocalReadAddr + vgprIdx -= self.states.a.numVgprLocalReadAddr + elif (self.states.b.numVgprLocalReadAddr % 2) == 1: + boolMoveLocalReadAddrB = True + vgprIdx -= self.states.b.numVgprLocalReadAddr + + if self.states.IncLdsBufSwitch: + # Need backup for the first LocalReadAddr only (others will be calculated from the first one) + self.states.a.startVgprLocalReadAddrOrig = vgprIdx + vgprIdx += 1 if self.states.a.numVgprLocalReadAddr > 0 else 0 + if kernel["ProblemType"]["MXBlockA"]: + self.states.mxsa.startVgprLocalReadAddrOrig = vgprIdx + vgprIdx += 1 if self.states.mxsa.numVgprLocalReadAddr > 0 else 0 + 
self.states.b.startVgprLocalReadAddrOrig = vgprIdx + vgprIdx += 1 if self.states.b.numVgprLocalReadAddr > 0 else 0 + if kernel["ProblemType"]["MXBlockB"]: + self.states.mxsb.startVgprLocalReadAddrOrig = vgprIdx + vgprIdx += 1 if self.states.mxsb.numVgprLocalReadAddr > 0 else 0 + + # ---------------------------- + # TODO: alignment hack, figure out a better solution + vgprIdx = ((vgprIdx+1)//2)*2 + # Avoid bank conflict between VgprA and VgprC + if(self.states.archCaps["VgprBank"]): + if (self.states.c.startVgprValu % 4) != (vgprIdx % 4): + vgprIdx += 2 + # dot2: alignment hack for wider local read + if kernel["UseDotInstruction"] and kernel["InnerUnroll"] > 1: + vgprIdx = ((vgprIdx+3)//4)*4 + + self.states.startVgpr = vgprIdx + + self.states.a.startVgprValu = vgprIdx + vgprIdx += self.states.a.numVgprValu + + numVgprValuPackA = 0 + if tensorParametersA["bpe"] < 4 and not kernel["UnrollMajorLDSA"] and not kernel["enableLDSTrA"]: + self.states.a.startVgprValuPack = vgprIdx + if self.states.lrvwTileA > 1: + numVgprValuPackA = ceil(kernel["VectorWidthA"] * tensorParametersA["bpe"] / self.states.bpr) * kernel["MIWaveTileA"] // kernel["VectorWidthA"] * kernel["InnerUnroll"] * self.states.numVgprBuffer * kernel["MIInputPerThreadA"] if self.states.packDTVA: # pack DTV case, double the number - numVgprValuPackMXSA *= 2 + numVgprValuPackA *= 2 + elif (kernel["UsePLRPack"] and self.states.numItersPLR): + numVgprValuPackA //= 2 else: - numVgprValuPackMXSA = self.states.mxsa.numVgprValuPerBlock * kernel["InnerUnroll"] * self.states.numVgprBufferPackMXSA * int(4 - 1) - vgprIdx += numVgprValuPackMXSA - self.states.mxsa.startVgprG2L = None - if not kernel["DirectToLdsMXSA"] or self.do["KeepDirectToLdsAlloc"]: + numVgprValuPackA = self.states.a.numVgprValuPerBlock * kernel["InnerUnroll"] * self.states.numVgprBufferPackA * (int(4/tensorParametersA["bpeDS"]) - 1) + vgprIdx += numVgprValuPackA + self.states.a.startVgprG2L = None + if (not kernel["DirectToLdsA"] or 
self.do["KeepDirectToLdsAlloc"]) and not kernel["enableTDMA"]: # DirectToVgpr + pack or input conversion case, overlap G2L and ValuPack if self.states.packDTVA: - self.states.mxsa.startVgprG2L = self.states.mxsa.startVgprValuPack + self.states.a.startVgprG2L = self.states.a.startVgprValuPack elif self.states.convDTVA: - self.states.mxsa.startVgprG2L = self.states.mxsa.startVgprValu + self.states.a.startVgprG2L = self.states.a.startVgprValu # if PGR = True, PAP could be possibly enabled, we move G2LA later to prevent it from being reclaimed # otherwise, put G2L here since it can overlap valu if (not kernel["PrefetchGlobalRead"]): # g2l can overlap valu - self.states.mxsa.startVgprG2L = self.states.mxsa.startVgprValu - vgprIdx = self.states.mxsa.startVgprValu \ - + max(self.states.mxsa.numVgprValu + numVgprValuPackMXSA, self.states.mxsa.numVgprG2LAllocated) + self.states.a.startVgprG2L = self.states.a.startVgprValu + vgprIdx = self.states.a.startVgprValu \ + + max(self.states.a.numVgprValu + numVgprValuPackA, self.states.a.numVgprG2LAllocated) - if kernel["ProblemType"]["MXBlockB"]: # TODO: alignment hack, figure out a better solution if(self.states.archCaps["VgprBank"]): residual = (vgprIdx % 4) @@ -6890,636 +7556,420 @@ def GNLCOInit(tc): else: vgprIdx = ((vgprIdx+1)//2)*2 - self.states.mxsb.startVgprValu = vgprIdx - vgprIdx += self.states.mxsb.numVgprValu - numVgprValuPackMXSB = 0 - if not kernel["UnrollMajorLDSMXSB"]: - self.states.mxsb.startVgprValuPack = vgprIdx - if self.states.lrvwTileMXSB > 1: - numVgprValuPackMXSB = ceil(kernel["VectorWidthMXSB"] / self.states.bpr) * kernel["MIWaveTileMXSB"] // kernel["VectorWidthMXSB"] * kernel["InnerUnroll"] * self.states.numVgprBuffer * kernel["MIInputPerThreadMXSB"] + self.states.b.startVgprValu = vgprIdx + vgprIdx += self.states.b.numVgprValu + numVgprValuPackB = 0 + if tensorParametersB["bpe"] < 4 and not kernel["UnrollMajorLDSB"] and not kernel["enableLDSTrB"]: + self.states.b.startVgprValuPack = vgprIdx + if 
self.states.lrvwTileB > 1: + numVgprValuPackB = ceil(kernel["VectorWidthB"] * tensorParametersB["bpe"] / self.states.bpr) * kernel["MIWaveTileB"] // kernel["VectorWidthB"] * kernel["InnerUnroll"] * self.states.numVgprBuffer * kernel["MIInputPerThreadB"] if self.states.packDTVB: # pack DTV case, double the number - numVgprValuPackMXSB *= 2 + numVgprValuPackB *= 2 + elif (kernel["UsePLRPack"] and self.states.numItersPLR): + numVgprValuPackB //= 2 else: - numVgprValuPackMXSB = self.states.mxsb.numVgprValuPerBlock * kernel["InnerUnroll"] * self.states.numVgprBufferPackMXSB * int(4 - 1) - vgprIdx += numVgprValuPackMXSB - self.states.mxsb.startVgprG2L = None - if not kernel["DirectToLdsMXSB"] or self.do["KeepDirectToLdsAlloc"]: - # DirectToVgpr + pack or input conversion case, overlap G2L and ValuPack + numVgprValuPackB = self.states.b.numVgprValuPerBlock * kernel["InnerUnroll"] * self.states.numVgprBufferPackB * (int(4/tensorParametersB["bpeDS"]) - 1) + vgprIdx += numVgprValuPackB + self.states.b.startVgprG2L = None + if (not kernel["DirectToLdsB"] or self.do["KeepDirectToLdsAlloc"]) and not kernel["enableTDMB"]: + # DirectToVgpr + pack or input conversion case, overlap G2L and ValuPack if self.states.packDTVB: - self.states.mxsb.startVgprG2L = self.states.mxsb.startVgprValuPack + self.states.b.startVgprG2L = self.states.b.startVgprValuPack elif self.states.convDTVB: - self.states.mxsb.startVgprG2L = self.states.mxsb.startVgprValu - # if PGR = True, PAP could be possibly enabled, we move G2LA later to prevent it from being reclaimed + self.states.b.startVgprG2L = self.states.b.startVgprValu + # if PGR = True, PAP could be possibly enabled, we move G2LB later to prevent it from being reclaimed # otherwise, put G2L here since it can overlap valu if (not kernel["PrefetchGlobalRead"]): # g2l can overlap valu - self.states.mxsb.startVgprG2L = self.states.mxsb.startVgprValu - vgprIdx = self.states.mxsb.startVgprValu \ - + max(self.states.mxsb.numVgprValu + 
numVgprValuPackMXSB, self.states.mxsb.numVgprG2LAllocated) + self.states.b.startVgprG2L = self.states.b.startVgprValu + vgprIdx = self.states.b.startVgprValu \ + + max(self.states.b.numVgprValu + numVgprValuPackB, self.states.b.numVgprG2LAllocated) + + if ((tensorParametersA["bpe"] < 4 and not kernel["UnrollMajorLDSA"]) or \ + (tensorParametersB["bpe"] < 4 and not kernel["UnrollMajorLDSB"]) or \ + (kernel["ProblemType"]["Sparse"] and not kernel["UnrollMajorLDSMetadata"] and (kernel["MIInputPerThreadMetadata"] == 4))) \ + and (kernel["ProblemType"]["DataType"].isInt8() or kernel["ProblemType"]["DataType"].is8bitFloat()) or \ + (self.states.asmCaps["HasSWMMAC_gfx1250"] and kernel["ProblemType"]["Sparse"] and not kernel["UnrollMajorLDSMetadata"]): + self.states.a.startVgprValuPackTemp = vgprIdx + self.states.b.startVgprValuPackTemp = vgprIdx + vgprIdx += 1 - if kernel["ProblemType"]["MXBlockA"]: - if self.states.mxsa.startVgprG2L is None and self.states.mxsa.numVgprG2LAllocated > 0: + self.states.a.startVgprValuCvtTemp = -1 + self.states.b.startVgprValuCvtTemp = -1 + if kernel["ConvertAfterDS"]: + if ((tensorParametersA["bpe"] > tensorParametersA["bpeDS"]) and kernel["ProblemType"]["DataTypeA"].is8bitFloat()): + self.states.a.startVgprValuCvtTemp = vgprIdx + if ((tensorParametersB["bpe"] > tensorParametersB["bpeDS"]) and kernel["ProblemType"]["DataTypeB"].is8bitFloat()): + self.states.b.startVgprValuCvtTemp = vgprIdx + if self.states.a.startVgprValuCvtTemp != -1 or self.states.b.startVgprValuCvtTemp != -1: + vgprIdx += 2 + + if kernel["ProblemType"]["Sparse"]: + if kernel["DirectToVgprSparseMetadata"]: + self.states.m.startVgprValu = vgprIdx + vgprIdx += self.states.m.numVgprValu + else: + # TODO: alignment hack, figure out a better solution + vgprIdx = ((vgprIdx+1)//2)*2 + if(self.states.archCaps["VgprBank"]): + vgprIdx += 1 + # gfx1250 + if self.states.m.numVgprValu >= 2: + vgprIdx = ((vgprIdx+1)//2)*2 + self.states.m.startVgprValu = vgprIdx + vgprIdx += 
self.states.m.numVgprValu + numVgprValuPackMetadata = 0 + if not kernel["UnrollMajorLDSMetadata"] and not kernel["enableLDSTrMetadata"]: + self.states.m.startVgprValuPack = vgprIdx + if self.states.lrvwTileMetadata > 1: + numVgprValuPackMetadata = roundUp(kernel["VectorWidthMetadata"] * tensorParametersM["bpe"] / self.states.bpr) * kernel["MIWaveTileMetadata"] // kernel["VectorWidthMetadata"] * kernel["InnerUnroll"] * self.states.numVgprBuffer * kernel["MIInputPerThreadMetadata"] + else: + numVgprValuPackMetadata = (kernel["MIInputPerThreadMetadata"]-1) * kernel["MIWaveTileMetadata"] * kernel["InnerUnroll"] * self.states.numVgprBufferPackMetadata + vgprIdx += numVgprValuPackMetadata + self.states.m.startVgprG2L = None + if not kernel["PrefetchGlobalRead"]: # g2l can overlap valu + self.states.m.startVgprG2L = self.states.m.startVgprValu + vgprIdx = self.states.m.startVgprValu \ + + max(self.states.m.numVgprValu + numVgprValuPackMetadata, self.states.m.numVgprG2LAllocated) + + # Registers allocated above this point can be used as temps during setup + # Registers above here are reserved in initC, near the end of the setup + # code + self.states.lastValuAB = vgprIdx + + #----------- + self.states.firstVgprForReads = vgprIdx + if self.states.a.startVgprG2L is None and self.states.a.numVgprG2LAllocated > 0: # TODO: alignment hack, figure out a better solution vgprIdx = ((vgprIdx+1)//2)*2 - self.states.mxsa.startVgprG2L = vgprIdx; + self.states.a.startVgprG2L = vgprIdx if ("ULSGRODoubleG2L" in kernel) and kernel["ULSGRODoubleG2L"] == 1: - vgprIdx += self.states.mxsa.numVgprG2LAllocated * 2 + vgprIdx += self.states.a.numVgprG2LAllocated*2 else: - vgprIdx += self.states.mxsa.numVgprG2LAllocated + vgprIdx += self.states.a.numVgprG2LAllocated - if kernel["ProblemType"]["MXBlockB"]: - if self.states.mxsb.startVgprG2L is None and self.states.mxsb.numVgprG2LAllocated > 0: + if self.states.b.startVgprG2L is None and self.states.b.numVgprG2LAllocated > 0: # TODO: alignment hack, 
figure out a better solution vgprIdx = ((vgprIdx+1)//2)*2 - self.states.mxsb.startVgprG2L = vgprIdx; + self.states.b.startVgprG2L = vgprIdx if ("ULSGRODoubleG2L" in kernel) and kernel["ULSGRODoubleG2L"] == 1: - vgprIdx += self.states.mxsb.numVgprG2LAllocated * 2 + vgprIdx += self.states.b.numVgprG2LAllocated*2 else: - vgprIdx += self.states.mxsb.numVgprG2LAllocated - - vgprIdx = (vgprIdx+1)//2*2 - self.states.lastValuMXSAB = vgprIdx - - self.states.totalAgprs = 0 - self.states.totalMixedAgprs = 0 - self.states.maxLimitAgprs = self.states.regCaps["PhysicalMaxVgpr"] - self.states.regCaps["MaxVgpr"] - self.states.c.startVgprValu = vgprIdx; vgprIdx += self.states.c.numVgprValu + vgprIdx += self.states.b.numVgprG2LAllocated - if kernel["EnableMatrixInstruction"]: - # MI kernels can overlap C-tile w/ AB-tile up until writeback. Illustrated below: - # |<-------------- valuC -------------->|<-->| - # |------------|-----------|------------|----| - # lastValuAB ^ ^ ^ ^ (ValuA, ValuB) - # lastVgprForReads ^ ^ ^ (localWriteAddr, globalReadOffser, G2L, localReadAddr) - # lastValuC ^ ^ (ValuC) - # vgprForStore ^ (other vgpr used in store section) - self.states.serializedStore = True - - ######################################## - # AGPR Allocation - ######################################## - if not kernel["MIArchVgpr"]: - self.states.totalAgprs = self.states.c.numVgprValu - if self.states.totalAgprs > self.states.maxLimitAgprs: - self.states.totalMixedAgprs = self.states.totalAgprs - self.states.maxLimitAgprs - self.states.totalAgprs = self.states.maxLimitAgprs - vgprIdx = self.states.c.startVgprValu + self.states.totalMixedAgprs - self.states.c.numVgprValu = self.states.totalMixedAgprs - - #---------------------------------- - # Move to the front and bypass to tail loop - self.states.startVgprMisc = vgprIdx - - # BufferLoad: - # Uses a resource descriptor (SRD) which is stored in 4 SGPRs and thus shared by all work-items. 
- # Each work-item also uses a unique 32-bit offset into vgprGlobalReadOffset. These offsets are set when - # the tile is initialized and stay constant through the execution of the kernel. - # The base address in the SRD is updated when the algorithm moves to a new tile - # BufferLoad disables the gptGlobalReadAddr used in flat addressing. - if kernel["BufferLoad"]: - self.startVgprGlobalReadOffsetA = vgprIdx - vgprIdx += 1 if kernel["_UseSgprForGRO"] else self.states.a.numVgprGlobalReadOffsets - self.startVgprGlobalReadOffsetB = vgprIdx - vgprIdx += 1 if kernel["_UseSgprForGRO"] else self.states.b.numVgprGlobalReadOffsets - if kernel["ProblemType"]["MXBlockA"]: - self.startVgprGlobalReadOffsetMXSA = vgprIdx - vgprIdx += 1 if kernel["_UseSgprForGRO"] else self.states.mxsa.numVgprGlobalReadOffsets - if kernel["ProblemType"]["MXBlockB"]: - self.startVgprGlobalReadOffsetMXSB = vgprIdx - vgprIdx += 1 if kernel["_UseSgprForGRO"] else self.states.mxsb.numVgprGlobalReadOffsets - if kernel["ProblemType"]["Sparse"]: - self.startVgprGlobalReadOffsetMetadata = vgprIdx - if kernel["DirectToVgprSparseMetadata"]: - miWaveTile = kernel["MIWaveTileB"] if kernel["ProblemType"]["Sparse"] == 2 else kernel["MIWaveTileA"] - vgprIdx += miWaveTile + if kernel["ProblemType"]["Sparse"] and not kernel["DirectToVgprSparseMetadata"]: + if self.states.m.startVgprG2L is None: + # TODO: alignment hack, figure out a better solution + vgprIdx = ((vgprIdx+1)//2)*2 + self.states.m.startVgprG2L = vgprIdx; vgprIdx += self.states.m.numVgprG2LAllocated + + # GlobalRead, LocalWrite, LocalRead, G2L can be reclaimed, extend the "lastVgprForReads" value + self.states.lastVgprForReads = vgprIdx + + if boolMoveLocalReadAddrA: + self.states.a.startVgprLocalReadAddr = vgprIdx + vgprIdx += self.states.a.numVgprLocalReadAddr + elif boolMoveLocalReadAddrB: + self.states.b.startVgprLocalReadAddr = vgprIdx + vgprIdx += self.states.b.numVgprLocalReadAddr + + #----------- + if kernel["ProblemType"]["Gradient"] and 
kernel["ProblemType"]["UseBias"]: + if kernel["ProblemType"]["BiasSrc"] == "A": + self.states.bias.numVgprValu = kernel["MIWaveTile"][0] + elif kernel["ProblemType"]["BiasSrc"] == "B": + self.states.bias.numVgprValu = kernel["MIWaveTile"][1] else: - vgprIdx += 1 if kernel["_UseSgprForGRO"] else self.states.m.numVgprGlobalReadOffsets - else: - # TODO: alignment hack, figure out a better solution - vgprIdx = ((vgprIdx+1)//2)*2 - self.startVgprGlobalReadAddressesA = vgprIdx - vgprIdx += numVgprGlobalReadAddressesA - self.startVgprGlobalReadAddressesB = vgprIdx - vgprIdx += numVgprGlobalReadAddressesB - if kernel["ProblemType"]["MXBlockA"]: - self.startVgprGlobalReadAddressesMXSA = vgprIdx - vgprIdx += numVgprGlobalReadAddressesMXSA - if kernel["ProblemType"]["MXBlockB"]: - self.startVgprGlobalReadAddressesMXSB = vgprIdx - vgprIdx += numVgprGlobalReadAddressesMXSB - - if not kernel["LocalWriteUseSgprA"]: - self.states.a.startVgprLocalWriteAddr = vgprIdx - vgprIdx += self.states.a.numVgprLocalWriteAddr - - if not kernel["LocalWriteUseSgprB"]: - self.states.b.startVgprLocalWriteAddr = vgprIdx - vgprIdx += self.states.b.numVgprLocalWriteAddr + self.states.bias.numVgprValu = 0 + self.states.bias.numVgprValu *= max(kernel["ProblemType"]["ComputeDataType"].numRegisters(), 1) + else: + self.states.bias.numVgprValu = 0 + self.states.bias.startVgprValu = vgprIdx + vgprIdx += self.states.bias.numVgprValu - if kernel["ProblemType"]["MXBlockA"]: - if not kernel["LocalWriteUseSgprMXSA"]: - self.states.mxsa.startVgprLocalWriteAddr = vgprIdx - vgprIdx += self.states.mxsa.numVgprLocalWriteAddr + #----------- + if kernel["ProblemType"]["OutputAmaxD"]: + self.startVgprAmaxOut = vgprIdx + self.startVgprAmaxOutB = vgprIdx + 1 + vgprIdx += 2 - if kernel["ProblemType"]["MXBlockB"]: - if not kernel["LocalWriteUseSgprMXSB"]: - self.states.mxsb.startVgprLocalWriteAddr = vgprIdx - vgprIdx += self.states.mxsb.numVgprLocalWriteAddr + self.states.startVgprAddressDbg = vgprIdx + vgprIdx += 
numVgprAddressDbg - if kernel["ProblemType"]["Sparse"] and not kernel["DirectToVgprSparseMetadata"]: - if self.states.combineLocalAddresses: - self.states.m.startVgprLocalWriteAddr = self.states.m.startVgprLocalReadAddr - else: - self.states.m.startVgprLocalWriteAddr = vgprIdx - vgprIdx += self.states.m.numVgprLocalWriteAddr + # for cgemm or zgemm + MIAV case, allocate 2 or 4 vgpr for alpha calculation (cannot use tmp vgpr in write batch) + if kernel["ProblemType"]["DataType"].isComplex() \ + and kernel["MIArchVgpr"]: - self.startVgprGlobalReadIncsA = vgprIdx - vgprIdx += numVgprGlobalReadIncsA - self.startVgprGlobalReadIncsB = vgprIdx - vgprIdx += numVgprGlobalReadIncsB - if kernel["ProblemType"]["MXBlockA"]: - self.startVgprGlobalReadIncsMXSA = vgprIdx - vgprIdx += numVgprGlobalReadIncsMXSA - if kernel["ProblemType"]["MXBlockB"]: - self.startVgprGlobalReadIncsMXSB = vgprIdx - vgprIdx += numVgprGlobalReadIncsMXSB - if kernel["ProblemType"]["Sparse"] and not kernel["DirectToVgprSparseMetadata"]: - self.startVgprGlobalReadIncsMetadata = vgprIdx - vgprIdx += numVgprGlobalReadIncsMetadata + # need proper alignment + vgprIdx = ((vgprIdx+2 - 1)//2)*2 + self.states.startVgprAlphaTmp = vgprIdx + vgprIdx += kernel["ProblemType"]["DataType"].numRegisters() - if kernel["ProblemType"]["Sparse"] and not kernel["DirectToVgprSparseMetadata"]: - self.states.m.startVgprLocalReadAddr = vgprIdx - vgprIdx += self.states.m.numVgprLocalReadAddr - if kernel["ProblemType"]["MXBlockA"]: - self.states.mxsa.startVgprLocalReadAddr = vgprIdx - vgprIdx += self.states.mxsa.numVgprLocalReadAddr - if kernel["ProblemType"]["MXBlockB"]: - self.states.mxsb.startVgprLocalReadAddr = vgprIdx - vgprIdx += self.states.mxsb.numVgprLocalReadAddr - self.states.a.startVgprLocalReadAddr = vgprIdx - vgprIdx += self.states.a.numVgprLocalReadAddr - self.states.b.startVgprLocalReadAddr = vgprIdx - vgprIdx += self.states.b.numVgprLocalReadAddr - - # ---------------------------- - # TODO: alignment hack, figure out 
a better solution - boolMoveLocalReadAddrA = False - boolMoveLocalReadAddrB = False - if (vgprIdx % 2) == 1: - if (self.states.a.numVgprLocalReadAddr % 2) == 1: - boolMoveLocalReadAddrA = True - self.states.b.startVgprLocalReadAddr -= self.states.a.numVgprLocalReadAddr - vgprIdx -= self.states.a.numVgprLocalReadAddr - elif (self.states.b.numVgprLocalReadAddr % 2) == 1: - boolMoveLocalReadAddrB = True - vgprIdx -= self.states.b.numVgprLocalReadAddr - - if self.states.IncLdsBufSwitch: - # Need backup for the first LocalReadAddr only (others will be calculated from the first one) - self.states.a.startVgprLocalReadAddrOrig = vgprIdx - vgprIdx += 1 if self.states.a.numVgprLocalReadAddr > 0 else 0 + # for swapping vgpr offsets of different lds buffers + if self.states.a.numVgprLocalReadSwapAddr > 0: + self.states.a.startVgprLocalReadSwapAddr = vgprIdx + vgprIdx += 1 + if self.states.mxsa.numVgprLocalReadSwapAddr > 0: + self.states.mxsa.startVgprLocalReadSwapAddr = vgprIdx + vgprIdx += 1 + if self.states.m.numVgprLocalReadSwapAddr > 0: + self.states.m.startVgprLocalReadSwapAddr = vgprIdx + vgprIdx += 1 + if self.states.b.numVgprLocalReadSwapAddr > 0: + self.states.b.startVgprLocalReadSwapAddr = vgprIdx + vgprIdx += 1 + if self.states.mxsb.numVgprLocalReadSwapAddr > 0: + self.states.mxsb.startVgprLocalReadSwapAddr = vgprIdx + vgprIdx += 1 + if self.states.a.numVgprLocalWriteSwapAddr > 0: + self.states.a.startVgprLocalWriteSwapAddr = vgprIdx + vgprIdx += 1 + if self.states.m.numVgprLocalWriteSwapAddr > 0: + self.states.m.startVgprLocalWriteSwapAddr = vgprIdx + vgprIdx += 1 + if self.states.b.numVgprLocalWriteSwapAddr > 0: + self.states.b.startVgprLocalWriteSwapAddr = vgprIdx + vgprIdx += 1 if kernel["ProblemType"]["MXBlockA"]: - self.states.mxsa.startVgprLocalReadAddrOrig = vgprIdx - vgprIdx += 1 if self.states.mxsa.numVgprLocalReadAddr > 0 else 0 - self.states.b.startVgprLocalReadAddrOrig = vgprIdx - vgprIdx += 1 if self.states.b.numVgprLocalReadAddr > 0 else 0 + if 
self.states.mxsa.numVgprLocalWriteSwapAddr > 0: + self.states.mxsa.startVgprLocalWriteSwapAddr = vgprIdx + vgprIdx += 1 if kernel["ProblemType"]["MXBlockB"]: - self.states.mxsb.startVgprLocalReadAddrOrig = vgprIdx - vgprIdx += 1 if self.states.mxsb.numVgprLocalReadAddr > 0 else 0 - - # ---------------------------- - # TODO: alignment hack, figure out a better solution - vgprIdx = ((vgprIdx+1)//2)*2 - # Avoid bank conflict between VgprA and VgprC - if(self.states.archCaps["VgprBank"]): - if (self.states.c.startVgprValu % 4) != (vgprIdx % 4): - vgprIdx += 2 - # dot2: alignment hack for wider local read - if kernel["UseDotInstruction"] and kernel["InnerUnroll"] > 1: - vgprIdx = ((vgprIdx+3)//4)*4 + if self.states.mxsb.numVgprLocalWriteSwapAddr > 0: + self.states.mxsb.startVgprLocalWriteSwapAddr = vgprIdx + vgprIdx += 1 + + # X32F Emulation initializations + # meaning of variables + # useDirect32XEmulation (separate values for A and B): + # True: allocate extra buffer (either full (tranpose only) or interleave) to eliminate extra v_mov + # False: use temp Treg only for conversion (need some v_mov) + # TF32EmuUseTransposeCode (separate values for A and B. For wider local read(lrvwTile>1) only): + # True: Generate extra transpose code (with v_swap) + # False: Use index tranpose and no tranpose code + # This is for cvt + sub only (means not dot2, not mfma) + # TF32EmuInterleaveTreg: + # True: Allocate T reg with interleaving X regs for dest of local read + # T0-3 + # X4-7 + # T4-7 + # X8-11 + # .... 
+ # This works with useDirect32XEmulation=True + # Wider local read case, we need TransposeCode=True + # False: Does not use interleave layout + # Wider local read + index transpose case, this needs to be False + def initTF32Emu(): + # for UseF32XEmulation only + if not kernel["UseF32XEmulation"]: + return 0, 0 + self.states.a.useDirect32XEmulationThis = self.states.a.useDirect32XEmulationNext = kernel["UseDirect32XEmulation"] + self.states.b.useDirect32XEmulationThis = self.states.b.useDirect32XEmulationNext = kernel["UseDirect32XEmulation"] + self.states.mxsa.useDirect32XEmulationThis = False + self.states.mxsb.useDirect32XEmulationThis = False + self.states.a.TF32EmuUseTransposeCode = False + self.states.b.TF32EmuUseTransposeCode = False + self.states.mxsa.TF32EmuUseTransposeCode = False + self.states.mxsb.TF32EmuUseTransposeCode = False + self.states.a.TF32EmuInterleaveTreg = False + self.states.b.TF32EmuInterleaveTreg = False + self.states.mxsa.TF32EmuInterleaveTreg = False + self.states.mxsb.TF32EmuInterleaveTreg = False + # do prefetch and scheduling for full pack code + # this scheduling opt is for non CMS. 
No need to enable it for CMS + self.states.doFullPackCodePrefetch = kernel["UsePLRPack"] and not kernel["UseCustomMainLoopSchedule"] + # prefetch pack/prePack scheduling for non CMS only + # We do not enable any ppack scheduling optimizations for PLR=0 + if (not kernel["UseCustomMainLoopSchedule"]) and self.states.numItersPLR: + # enable prepack scheduling for this loop only for DTLA + B + if kernel["DirectToLds"] == 1: + # do packPre scheduling for This loop only not CLR or SubIter + self.states.doPackPreSchedulingThisLoop = (not kernel["ClusterLocalRead"]) or kernel["ForceUnrollSubIter"] + self.states.doPackPreSchedulingNextLoop = True + if self.states.tailloopInNll: + # disable all TF32 scheduling if tailloopInNll is enabled + self.states.doFullPackCodePrefetch = False + self.states.doPackPreSchedulingThisLoop = False + self.states.doPackPreSchedulingNextLoop = False + numVgprsEmuA = initTF32EmuAB(self.states.a, self.states.lrvwTileA) + numVgprsEmuB = initTF32EmuAB(self.states.b, self.states.lrvwTileB) + return numVgprsEmuA, numVgprsEmuB + def initTF32EmuAB(sAorB: ABMatrixInfo, lrvwTile): + # for UseF32XEmulation only + if not kernel["UseF32XEmulation"]: + return 0 + # number of Vreg for interleaveTreg. Half of ValuA or B. 
Need same block number as Valu + numVForInterleave = sAorB.numVgprValu // 2 + numVForIndexTranspose = sAorB.numVgprValuPerBlock + if kernel["ForceUnrollSubIter"]: + # SubIter case, we divide local read into half at each prefetch + numVForIndexTranspose //= 2 + # full prefetch pack case, we need to allocate full ValuA/B buffers + if self.states.doFullPackCodePrefetch: + if kernel["UseDirect32XEmulationInterleaveTreg"]: + # use conventional Treg allocation (interleaved Treg and Xreg) + numV = numVForInterleave + # enable TF32EmuInterleaveTreg + sAorB.TF32EmuInterleaveTreg = True + else: + # allocate single full buffer as dest of local read + numV = numVForIndexTranspose + sAorB.useDirect32XEmulationThis = True + sAorB.useDirect32XEmulationNext = True + if kernel["UseMFMAF32XEmulation"]: + # use transpose code for MFMA + sAorB.useTransposeCodeThis = True + sAorB.useTransposeCodeNext = True + return numV + # reg layout setting + # At init stage, setting is same for this and next + if sAorB.useDirect32XEmulationThis: + # enable TF32EmuInterleaveTreg + sAorB.TF32EmuInterleaveTreg = True + numV = numVForInterleave + if lrvwTile > 1: + # useDirect32XEmulation case + # Use wider local read + transpose code + sAorB.useTransposeCodeThis = True + sAorB.useTransposeCodeNext = True + else: + # no useDirect32XEmulation case, use temp reg version + numV = adjustNumVForTF32Emu(sAorB, lrvwTile) + return numV - self.states.startVgpr = vgprIdx + def adjustNumVForTF32Emu(sAorB: ABMatrixInfo, lrvwTile): + # for UseF32XEmulation only + if not kernel["UseF32XEmulation"]: + return 0 + # no T reg for both This and Next Loop + if lrvwTile > 1: + # use transpose code for wider local read + sAorB.useTransposeCodeThis = True + sAorB.useTransposeCodeNext = True + numV = 0 + # disable TF32EmuInterleaveTreg + sAorB.TF32EmuInterleaveTreg = False + sAorB.useDirect32XEmulationThis = False + sAorB.useDirect32XEmulationNext = False + return numV - self.states.a.startVgprValu = vgprIdx - vgprIdx += 
self.states.a.numVgprValu + def checkVregOverflowTF32Emu(vgprIdx, numV): + # for UseF32XEmulation only + if not kernel["UseF32XEmulation"]: + return False + # Do not allow adjustment for CMS or doFullPackCodePrefetch + if kernel["UseCustomMainLoopSchedule"] or self.states.doFullPackCodePrefetch: + return False + # We need to consider 2 more vreg (Serial tmp) + # Looks like we need more tmp vreg at tailloop + # So far, max 32 tmp vregs might be used. + # set 2 + 32 as buffer (tentative) + # MFMA case, need 2 more + bufferVregNum = 2 + 32 + if kernel["UseMFMAF32XEmulation"]: + bufferVregNum += 2 + return vgprIdx + bufferVregNum + numV > self.states.regCaps["MaxVgpr"] + + # initial TF32Emu setting + numVgprsEmuA, numVgprsEmuB = initTF32Emu() + # numVreg adjustment + # step 1 Adjustment for lrvwTileA/B==1 + # start from B + needAdjustment = checkVregOverflowTF32Emu(vgprIdx, numVgprsEmuA + numVgprsEmuB) + if needAdjustment and self.states.lrvwTileB == 1: + numVgprsEmuB = adjustNumVForTF32Emu(self.states.b, self.states.lrvwTileB) + # then, check A + needAdjustment = checkVregOverflowTF32Emu(vgprIdx, numVgprsEmuA + numVgprsEmuB) + if needAdjustment and self.states.lrvwTileA == 1 and not self.states.doFullPackCodePrefetch: + numVgprsEmuA = adjustNumVForTF32Emu(self.states.a, self.states.lrvwTileA) + # step 2 Adjustment for lrvwTileA/B>1 + # start from B + needAdjustment = checkVregOverflowTF32Emu(vgprIdx, numVgprsEmuA + numVgprsEmuB) + if needAdjustment and self.states.lrvwTileB > 1 and not self.states.doFullPackCodePrefetch: + numVgprsEmuB = adjustNumVForTF32Emu(self.states.b, self.states.lrvwTileB) + # then, checkA + needAdjustment = checkVregOverflowTF32Emu(vgprIdx, numVgprsEmuA + numVgprsEmuB) + if needAdjustment and self.states.lrvwTileA > 1 and not self.states.doFullPackCodePrefetch: + numVgprsEmuA = adjustNumVForTF32Emu(self.states.a, self.states.lrvwTileA) + # final adjustment + # disable UseMFMAF32XEmulation and save 2 vregs + if kernel["UseMFMAF32XEmulation"]: + 
if checkVregOverflowTF32Emu(vgprIdx, numVgprsEmuA + numVgprsEmuB): + # disable UseMFMAF32XEmulation and use Dot2 instead + kernel["UseMFMAF32XEmulation"] = False + kernel["UseDot2F32XEmulation"] = True - numVgprValuPackA = 0 - if tensorParametersA["bpe"] < 4 and not kernel["UnrollMajorLDSA"] and not kernel["enableLDSTrA"]: - self.states.a.startVgprValuPack = vgprIdx - if self.states.lrvwTileA > 1: - numVgprValuPackA = ceil(kernel["VectorWidthA"] * tensorParametersA["bpe"] / self.states.bpr) * kernel["MIWaveTileA"] // kernel["VectorWidthA"] * kernel["InnerUnroll"] * self.states.numVgprBuffer * kernel["MIInputPerThreadA"] - if self.states.packDTVA: - # pack DTV case, double the number - numVgprValuPackA *= 2 - elif (kernel["UsePLRPack"] and self.states.numItersPLR): - numVgprValuPackA //= 2 - else: - numVgprValuPackA = self.states.a.numVgprValuPerBlock * kernel["InnerUnroll"] * self.states.numVgprBufferPackA * (int(4/tensorParametersA["bpeDS"]) - 1) - vgprIdx += numVgprValuPackA - self.states.a.startVgprG2L = None - if (not kernel["DirectToLdsA"] or self.do["KeepDirectToLdsAlloc"]) and not kernel["enableTDMA"]: - # DirectToVgpr + pack or input conversion case, overlap G2L and ValuPack - if self.states.packDTVA: - self.states.a.startVgprG2L = self.states.a.startVgprValuPack - elif self.states.convDTVA: - self.states.a.startVgprG2L = self.states.a.startVgprValu - # if PGR = True, PAP could be possibly enabled, we move G2LA later to prevent it from being reclaimed - # otherwise, put G2L here since it can overlap valu - if (not kernel["PrefetchGlobalRead"]): # g2l can overlap valu - self.states.a.startVgprG2L = self.states.a.startVgprValu - vgprIdx = self.states.a.startVgprValu \ - + max(self.states.a.numVgprValu + numVgprValuPackA, self.states.a.numVgprG2LAllocated) - - # TODO: alignment hack, figure out a better solution - if(self.states.archCaps["VgprBank"]): - residual = (vgprIdx % 4) - if (residual % 2) == 0: - # if 2-aligned bank(bank0 and bank2), move to bank1 or 
bank3. - vgprIdx += 1 - if kernel["ISA"][:2] == (12, 5): + # vreg allocation for UseMFMAF32XEmulation + if kernel["UseMFMAF32XEmulation"]: + vgprIdx = ((vgprIdx+1)//2)*2 #align 64 bit + self.states.startVgprIdentityMatrix = vgprIdx + vgprIdx+=2 + numVgprsEmu = numVgprsEmuA + numVgprsEmuB + self.states.a.numVgprEmu = numVgprsEmuA + self.states.b.numVgprEmu = numVgprsEmuB + if numVgprsEmu > 0: + #align 64 bit vgprIdx = ((vgprIdx+1)//2)*2 - else: - vgprIdx = ((vgprIdx+1)//2)*2 + self.states.a.startVgprCvt = vgprIdx + vgprIdx += numVgprsEmuA # for vgpr 32XEmulation A + self.states.b.startVgprCvt = vgprIdx + vgprIdx += numVgprsEmuB # for vgpr 32XEmulation B + + if kernel["StreamK"] and self.isStreamKConstantsToVgprEnabled(kernel): + numSKConsts = 5 # ItersPerTile, MagicNumberItersPerTile, MagicShiftItersPerTile, SKItersPerWG, StreamKIdx + if kernel["StreamK"] >= 2: + numSKConsts += 2 # skGrid, skTiles + self.states.startVgprSKConsts = vgprIdx + self.states.numVgprSKConsts = numSKConsts + vgprIdx += numSKConsts + + # TODO: Serial is always the first/last register in the pool so the store + # code doesn't have to deal with fragmentation + self.states.startVgprSerial = vgprIdx + vgprIdx += 1 # for vgpr serial id + + self.states.totalVgprs = max(vgprIdx, self.states.c.numVgprValu) + if self.states.totalVgprs < 0 or self.states.totalVgprs > self.states.regCaps["MaxVgpr"]: + raise RuntimeError("Generating asm kernel error: total vgpr: %u not in [0, %u].\n" % (self.states.totalVgprs, self.states.regCaps["MaxVgpr"])) + + agprLimit = self.states.regCaps["PhysicalMaxVgpr"] - self.states.regCaps["MaxVgpr"] + if self.states.totalAgprs > agprLimit: + raise RuntimeError("Generating asm kernel error: total agpr: %u not in [0, %u].\n" % (self.states.totalAgprs, agprLimit) ) + + # VGPR alloc marker + + + def vgprAllocationImplSubtile(): + self.states.maxLimitAgprs = self.states.regCaps["PhysicalMaxVgpr"] - self.states.regCaps["MaxVgpr"] + if kernel["EnableMatrixInstruction"]: + #jgolds 
bpeCinternal because we are allocating accumulation registers here + self.states.c.numVgprValu = (kernel["ThreadTile0"]*kernel["ThreadTile1"]*self.states.bpeCinternal)//self.states.bpr - self.states.b.startVgprValu = vgprIdx - vgprIdx += self.states.b.numVgprValu - numVgprValuPackB = 0 - if tensorParametersB["bpe"] < 4 and not kernel["UnrollMajorLDSB"] and not kernel["enableLDSTrB"]: - self.states.b.startVgprValuPack = vgprIdx - if self.states.lrvwTileB > 1: - numVgprValuPackB = ceil(kernel["VectorWidthB"] * tensorParametersB["bpe"] / self.states.bpr) * kernel["MIWaveTileB"] // kernel["VectorWidthB"] * kernel["InnerUnroll"] * self.states.numVgprBuffer * kernel["MIInputPerThreadB"] - if self.states.packDTVB: - # pack DTV case, double the number - numVgprValuPackB *= 2 - elif (kernel["UsePLRPack"] and self.states.numItersPLR): - numVgprValuPackB //= 2 - else: - numVgprValuPackB = self.states.b.numVgprValuPerBlock * kernel["InnerUnroll"] * self.states.numVgprBufferPackB * (int(4/tensorParametersB["bpeDS"]) - 1) - vgprIdx += numVgprValuPackB - self.states.b.startVgprG2L = None - if (not kernel["DirectToLdsB"] or self.do["KeepDirectToLdsAlloc"]) and not kernel["enableTDMB"]: - # DirectToVgpr + pack or input conversion case, overlap G2L and ValuPack - if self.states.packDTVB: - self.states.b.startVgprG2L = self.states.b.startVgprValuPack - elif self.states.convDTVB: - self.states.b.startVgprG2L = self.states.b.startVgprValu - # if PGR = True, PAP could be possibly enabled, we move G2LB later to prevent it from being reclaimed - # otherwise, put G2L here since it can overlap valu - if (not kernel["PrefetchGlobalRead"]): # g2l can overlap valu - self.states.b.startVgprG2L = self.states.b.startVgprValu - vgprIdx = self.states.b.startVgprValu \ - + max(self.states.b.numVgprValu + numVgprValuPackB, self.states.b.numVgprG2LAllocated) - - if ((tensorParametersA["bpe"] < 4 and not kernel["UnrollMajorLDSA"]) or \ - (tensorParametersB["bpe"] < 4 and not kernel["UnrollMajorLDSB"]) 
or \ - (kernel["ProblemType"]["Sparse"] and not kernel["UnrollMajorLDSMetadata"] and (kernel["MIInputPerThreadMetadata"] == 4))) \ - and (kernel["ProblemType"]["DataType"].isInt8() or kernel["ProblemType"]["DataType"].is8bitFloat()) or \ - (self.states.asmCaps["HasSWMMAC_gfx1250"] and kernel["ProblemType"]["Sparse"] and not kernel["UnrollMajorLDSMetadata"]): - self.states.a.startVgprValuPackTemp = vgprIdx - self.states.b.startVgprValuPackTemp = vgprIdx + vgprIdx = 0 + self.states.startVgprSerial = vgprIdx; vgprIdx += 1 + #self.states.c.startVgprValu = vgprIdx; - self.states.a.startVgprValuCvtTemp = -1 - self.states.b.startVgprValuCvtTemp = -1 - if kernel["ConvertAfterDS"]: - if ((tensorParametersA["bpe"] > tensorParametersA["bpeDS"]) and kernel["ProblemType"]["DataTypeA"].is8bitFloat()): - self.states.a.startVgprValuCvtTemp = vgprIdx - if ((tensorParametersB["bpe"] > tensorParametersB["bpeDS"]) and kernel["ProblemType"]["DataTypeB"].is8bitFloat()): - self.states.b.startVgprValuCvtTemp = vgprIdx - if self.states.a.startVgprValuCvtTemp != -1 or self.states.b.startVgprValuCvtTemp != -1: - vgprIdx += 2 + #vgprIdx += self.states.c.numVgprValu + self.states.totalVgprs = vgprIdx - if kernel["ProblemType"]["Sparse"]: - if kernel["DirectToVgprSparseMetadata"]: - self.states.m.startVgprValu = vgprIdx - vgprIdx += self.states.m.numVgprValu - else: - # TODO: alignment hack, figure out a better solution - vgprIdx = ((vgprIdx+1)//2)*2 - if(self.states.archCaps["VgprBank"]): - vgprIdx += 1 - # gfx1250 - if self.states.m.numVgprValu >= 2: - vgprIdx = ((vgprIdx+1)//2)*2 - self.states.m.startVgprValu = vgprIdx - vgprIdx += self.states.m.numVgprValu - numVgprValuPackMetadata = 0 - if not kernel["UnrollMajorLDSMetadata"] and not kernel["enableLDSTrMetadata"]: - self.states.m.startVgprValuPack = vgprIdx - if self.states.lrvwTileMetadata > 1: - numVgprValuPackMetadata = roundUp(kernel["VectorWidthMetadata"] * tensorParametersM["bpe"] / self.states.bpr) * kernel["MIWaveTileMetadata"] // 
kernel["VectorWidthMetadata"] * kernel["InnerUnroll"] * self.states.numVgprBuffer * kernel["MIInputPerThreadMetadata"] - else: - numVgprValuPackMetadata = (kernel["MIInputPerThreadMetadata"]-1) * kernel["MIWaveTileMetadata"] * kernel["InnerUnroll"] * self.states.numVgprBufferPackMetadata - vgprIdx += numVgprValuPackMetadata - self.states.m.startVgprG2L = None - if not kernel["PrefetchGlobalRead"]: # g2l can overlap valu - self.states.m.startVgprG2L = self.states.m.startVgprValu - vgprIdx = self.states.m.startVgprValu \ - + max(self.states.m.numVgprValu + numVgprValuPackMetadata, self.states.m.numVgprG2LAllocated) - - # Registers allocated above this point can be used as temps during setup - # Registers above here are reserved in initC, near the end of the setup - # code - self.states.lastValuAB = vgprIdx - - #----------- - self.states.firstVgprForReads = vgprIdx - if self.states.a.startVgprG2L is None and self.states.a.numVgprG2LAllocated > 0: - # TODO: alignment hack, figure out a better solution - vgprIdx = ((vgprIdx+1)//2)*2 - self.states.a.startVgprG2L = vgprIdx - if ("ULSGRODoubleG2L" in kernel) and kernel["ULSGRODoubleG2L"] == 1: - vgprIdx += self.states.a.numVgprG2LAllocated*2 - else: - vgprIdx += self.states.a.numVgprG2LAllocated - if self.states.b.startVgprG2L is None and self.states.b.numVgprG2LAllocated > 0: - # TODO: alignment hack, figure out a better solution - vgprIdx = ((vgprIdx+1)//2)*2 - self.states.b.startVgprG2L = vgprIdx - if ("ULSGRODoubleG2L" in kernel) and kernel["ULSGRODoubleG2L"] == 1: - vgprIdx += self.states.b.numVgprG2LAllocated*2 - else: - vgprIdx += self.states.b.numVgprG2LAllocated - if kernel["ProblemType"]["Sparse"] and not kernel["DirectToVgprSparseMetadata"]: - if self.states.m.startVgprG2L is None: - # TODO: alignment hack, figure out a better solution - vgprIdx = ((vgprIdx+1)//2)*2 - self.states.m.startVgprG2L = vgprIdx; vgprIdx += self.states.m.numVgprG2LAllocated + #self.states.totalVgprs += 
self.states.a.tileInfo.numGRPerSubtile + #self.states.totalVgprs += self.states.b.tileInfo.numGRPerSubtile - # GlobalRead, LocalWrite, LocalRead, G2L can be reclaimed, extend the "lastVgprForReads" value - self.states.lastVgprForReads = vgprIdx + #self.states.totalVgprs += self.states.a.tileInfo.numLRPerSubtile + #self.states.totalVgprs += self.states.b.tileInfo.numLRPerSubtile - if boolMoveLocalReadAddrA: - self.states.a.startVgprLocalReadAddr = vgprIdx - vgprIdx += self.states.a.numVgprLocalReadAddr - elif boolMoveLocalReadAddrB: - self.states.b.startVgprLocalReadAddr = vgprIdx - vgprIdx += self.states.b.numVgprLocalReadAddr + return - #----------- - if kernel["ProblemType"]["Gradient"] and kernel["ProblemType"]["UseBias"]: - if kernel["ProblemType"]["BiasSrc"] == "A": - self.states.bias.numVgprValu = kernel["MIWaveTile"][0] - elif kernel["ProblemType"]["BiasSrc"] == "B": - self.states.bias.numVgprValu = kernel["MIWaveTile"][1] - else: - self.states.bias.numVgprValu = 0 - self.states.bias.numVgprValu *= max(kernel["ProblemType"]["ComputeDataType"].numRegisters(), 1) - else: - self.states.bias.numVgprValu = 0 - self.states.bias.startVgprValu = vgprIdx - vgprIdx += self.states.bias.numVgprValu - - #----------- - if kernel["ProblemType"]["OutputAmaxD"]: - self.startVgprAmaxOut = vgprIdx - self.startVgprAmaxOutB = vgprIdx + 1 - vgprIdx += 2 - - self.states.startVgprAddressDbg = vgprIdx - vgprIdx += numVgprAddressDbg - - # for cgemm or zgemm + MIAV case, allocate 2 or 4 vgpr for alpha calculation (cannot use tmp vgpr in write batch) - if kernel["ProblemType"]["DataType"].isComplex() \ - and kernel["MIArchVgpr"]: - - # need proper alignment - vgprIdx = ((vgprIdx+2 - 1)//2)*2 - self.states.startVgprAlphaTmp = vgprIdx - vgprIdx += kernel["ProblemType"]["DataType"].numRegisters() - - # for swapping vgpr offsets of different lds buffers - if self.states.a.numVgprLocalReadSwapAddr > 0: - self.states.a.startVgprLocalReadSwapAddr = vgprIdx - vgprIdx += 1 - if 
self.states.mxsa.numVgprLocalReadSwapAddr > 0: - self.states.mxsa.startVgprLocalReadSwapAddr = vgprIdx - vgprIdx += 1 - if self.states.m.numVgprLocalReadSwapAddr > 0: - self.states.m.startVgprLocalReadSwapAddr = vgprIdx - vgprIdx += 1 - if self.states.b.numVgprLocalReadSwapAddr > 0: - self.states.b.startVgprLocalReadSwapAddr = vgprIdx - vgprIdx += 1 - if self.states.mxsb.numVgprLocalReadSwapAddr > 0: - self.states.mxsb.startVgprLocalReadSwapAddr = vgprIdx - vgprIdx += 1 - if self.states.a.numVgprLocalWriteSwapAddr > 0: - self.states.a.startVgprLocalWriteSwapAddr = vgprIdx - vgprIdx += 1 - if self.states.m.numVgprLocalWriteSwapAddr > 0: - self.states.m.startVgprLocalWriteSwapAddr = vgprIdx - vgprIdx += 1 - if self.states.b.numVgprLocalWriteSwapAddr > 0: - self.states.b.startVgprLocalWriteSwapAddr = vgprIdx - vgprIdx += 1 - if kernel["ProblemType"]["MXBlockA"]: - if self.states.mxsa.numVgprLocalWriteSwapAddr > 0: - self.states.mxsa.startVgprLocalWriteSwapAddr = vgprIdx - vgprIdx += 1 - if kernel["ProblemType"]["MXBlockB"]: - if self.states.mxsb.numVgprLocalWriteSwapAddr > 0: - self.states.mxsb.startVgprLocalWriteSwapAddr = vgprIdx - vgprIdx += 1 - # X32F Emulation initializations - # meaning of variables - # useDirect32XEmulation (separate values for A and B): - # True: allocate extra buffer (either full (tranpose only) or interleave) to eliminate extra v_mov - # False: use temp Treg only for conversion (need some v_mov) - # TF32EmuUseTransposeCode (separate values for A and B. For wider local read(lrvwTile>1) only): - # True: Generate extra transpose code (with v_swap) - # False: Use index tranpose and no tranpose code - # This is for cvt + sub only (means not dot2, not mfma) - # TF32EmuInterleaveTreg: - # True: Allocate T reg with interleaving X regs for dest of local read - # T0-3 - # X4-7 - # T4-7 - # X8-11 - # .... 
- # This works with useDirect32XEmulation=Trie - # Wider local read case, we need TransposeCode=True - # False: Does not use interleave layout - # ider local read + index transpose case, this needs to be False - def initTF32Emu(): - # for UseF32XEmulation only - if not kernel["UseF32XEmulation"]: - return 0, 0 - self.states.a.useDirect32XEmulationThis = self.states.a.useDirect32XEmulationNext = kernel["UseDirect32XEmulation"] - self.states.b.useDirect32XEmulationThis = self.states.b.useDirect32XEmulationNext = kernel["UseDirect32XEmulation"] - self.states.mxsa.useDirect32XEmulationThis = False - self.states.mxsb.useDirect32XEmulationThis = False - self.states.a.TF32EmuUseTransposeCode = False - self.states.b.TF32EmuUseTransposeCode = False - self.states.mxsa.TF32EmuUseTransposeCode = False - self.states.mxsb.TF32EmuUseTransposeCode = False - self.states.a.TF32EmuInterleaveTreg = False - self.states.b.TF32EmuInterleaveTreg = False - self.states.mxsa.TF32EmuInterleaveTreg = False - self.states.mxsb.TF32EmuInterleaveTreg = False - # do prefetch and scheduling for full pack code - # this sceduling opt is for non CMS. 
No need to enable it for CMS - self.states.doFullPackCodePrefetch = kernel["UsePLRPack"] and not kernel["UseCustomMainLoopSchedule"] - # prefetch pack/prePack scheduling for non CMS only - # We do not enable any ppack scheduling optimizations for PLR=0 - if (not kernel["UseCustomMainLoopSchedule"]) and self.states.numItersPLR: - # enabhe prepack scheduling for this loop only for DTLA + B - if kernel["DirectToLds"] == 1: - # do packPre scheduling for This loop only not CLR or SubIter - self.states.doPackPreSchedulingThisLoop = (not kernel["ClusterLocalRead"]) or kernel["ForceUnrollSubIter"] - self.states.doPackPreSchedulingNextLoop = True - if self.states.tailloopInNll: - # disable all TF32 scheduling if tailloopInNll is enabled - self.states.doFullPackCodePrefetch = False - self.states.doPackPreSchedulingThisLoop = False - self.states.doPackPreSchedulingNextLoop = False - numVgprsEmuA = initTF32EmuAB(self.states.a, self.states.lrvwTileA) - numVgprsEmuB = initTF32EmuAB(self.states.b, self.states.lrvwTileB) - return numVgprsEmuA, numVgprsEmuB - def initTF32EmuAB(sAorB: ABMatrixInfo, lrvwTile): - # for UseF32XEmulation only - if not kernel["UseF32XEmulation"]: - return 0 - # number of Vreg for interleaveTreg. Half of ValuA or B. 
Need same block number as Valu - numVForInterleave = sAorB.numVgprValu // 2 - numVForIndexTranspose = sAorB.numVgprValuPerBlock - if kernel["ForceUnrollSubIter"]: - # SubIter case, we devide local read into half at each prefetch - numVForIndexTranspose //= 2 - # full prefetch pack case, we need to allocate full ValuA/B buffers - if self.states.doFullPackCodePrefetch: - if kernel["UseDirect32XEmulationInterleaveTreg"]: - # use conventional Treg allocatin (interleaved Treg and Xreg) - numV = numVForInterleave - # enable TF32EmuInterleaveTreg - sAorB.TF32EmuInterleaveTreg = True - else: - # allocate single full buffer as dest of local read - numV = numVForIndexTranspose - sAorB.useDirect32XEmulationThis = True - sAorB.useDirect32XEmulationNext = True - if kernel["UseMFMAF32XEmulation"]: - # use transpose code for MFMA - sAorB.useTransposeCodeThis = True - sAorB.useTransposeCodeNext = True - return numV - # reg layout setting - # At init stage, seting is same for this and next - if sAorB.useDirect32XEmulationThis: - # enable TF32EmuInterleaveTreg - sAorB.TF32EmuInterleaveTreg = True - numV = numVForInterleave - if lrvwTile > 1: - # useDirect32XEmulation case - # Use wider local read + transpose code - sAorB.useTransposeCodeThis = True - sAorB.useTransposeCodeNext = True - else: - # no useDirect32XEmulation case, use temp reg version - numV = adjustNumVForTF32Emu(sAorB, lrvwTile) - return numV - - def adjustNumVForTF32Emu(sAorB: ABMatrixInfo, lrvwTile): - # for UseF32XEmulation only - if not kernel["UseF32XEmulation"]: - return 0 - # no T reg for both This and Next Loop - if lrvwTile > 1: - # use tranpose code for wider local read - sAorB.useTransposeCodeThis = True - sAorB.useTransposeCodeNext = True - numV = 0 - # disable TF32EmuInterleaveTreg - sAorB.TF32EmuInterleaveTreg = False - sAorB.useDirect32XEmulationThis = False - sAorB.useDirect32XEmulationNext = False - return numV - - def checkVregOverflowTF32Emu(vgprIdx, numV): - # for UseF32XEmulation only - if not 
kernel["UseF32XEmulation"]: - return False - # Do not allow adjustment for CMS or doFullPackCodePrefetch - if kernel["UseCustomMainLoopSchedule"] or self.states.doFullPackCodePrefetch: - return False - # We need to consider 2 more vreg (Serial tmp) - # Looks like we need more tmp vreg at tailloop - # So far, max 32 tmp vregs might be used. - # set 2 + 32 as buffer (tentative) - # MFMA case, need 2 more - bufferVregNum = 2 + 32 - if kernel["UseMFMAF32XEmulation"]: - bufferVregNum += 2 - return vgprIdx + bufferVregNum + numV > self.states.regCaps["MaxVgpr"] - - # initial TF32Emu setting - numVgprsEmuA, numVgprsEmuB = initTF32Emu() - # numVreg adjustment - # step 1 Adjustment for lrvwTileA/B==1 - # start from B - needAdjustment = checkVregOverflowTF32Emu(vgprIdx, numVgprsEmuA + numVgprsEmuB) - if needAdjustment and self.states.lrvwTileB == 1: - numVgprsEmuB = adjustNumVForTF32Emu(self.states.b, self.states.lrvwTileB) - # then, check A - needAdjustment = checkVregOverflowTF32Emu(vgprIdx, numVgprsEmuA + numVgprsEmuB) - if needAdjustment and self.states.lrvwTileA == 1 and not self.states.doFullPackCodePrefetch: - numVgprsEmuA = adjustNumVForTF32Emu(self.states.a, self.states.lrvwTileA) - # step 2 Adjustment for lrvwTileA/B>1 - # start from B - needAdjustment = checkVregOverflowTF32Emu(vgprIdx, numVgprsEmuA + numVgprsEmuB) - if needAdjustment and self.states.lrvwTileB > 1 and not self.states.doFullPackCodePrefetch: - numVgprsEmuB = adjustNumVForTF32Emu(self.states.b, self.states.lrvwTileB) - # then, checkA - needAdjustment = checkVregOverflowTF32Emu(vgprIdx, numVgprsEmuA + numVgprsEmuB) - if needAdjustment and self.states.lrvwTileA > 1 and not self.states.doFullPackCodePrefetch: - numVgprsEmuA = adjustNumVForTF32Emu(self.states.a, self.states.lrvwTileA) - # final adjustment - # disable UseMFMAF32XEmulation and save 2 vregs - if kernel["UseMFMAF32XEmulation"]: - if checkVregOverflowTF32Emu(vgprIdx, numVgprsEmuA + numVgprsEmuB): - # disable UseMFMAF32XEmulation and use Dot2 
instead - kernel["UseMFMAF32XEmulation"] = False - kernel["UseDot2F32XEmulation"] = True - # vreg allocation for UseMFMAF32XEmulation - if kernel["UseMFMAF32XEmulation"]: - vgprIdx = ((vgprIdx+1)//2)*2 #align 64 bit - self.states.startVgprIdentityMatrix = vgprIdx - vgprIdx+=2 - numVgprsEmu = numVgprsEmuA + numVgprsEmuB - self.states.a.numVgprEmu = numVgprsEmuA - self.states.b.numVgprEmu = numVgprsEmuB - if numVgprsEmu > 0: - #align 64 bit - vgprIdx = ((vgprIdx+1)//2)*2 - self.states.a.startVgprCvt = vgprIdx - vgprIdx += numVgprsEmuA # for vgpr 32XEmulation A - self.states.b.startVgprCvt = vgprIdx - vgprIdx += numVgprsEmuB # for vgpr 32XEmulation B - - if kernel["StreamK"] and self.isStreamKConstantsToVgprEnabled(kernel): - numSKConsts = 5 # ItersPerTile, MagicNumberItersPerTile, MagicShiftItersPerTile, SKItersPerWG, StreamKIdx - if kernel["StreamK"] >= 2: - numSKConsts += 2 # skGrid, skTiles - self.states.startVgprSKConsts = vgprIdx - self.states.numVgprSKConsts = numSKConsts - vgprIdx += numSKConsts - - # TODO: Serial is always the first/last register in the pool so the store - # code doesn't have to deal with fragmentation - self.states.startVgprSerial = vgprIdx - vgprIdx += 1 # for vgpr serial id - - self.states.totalVgprs = max(vgprIdx, self.states.c.numVgprValu) - if self.states.totalVgprs < 0 or self.states.totalVgprs > self.states.regCaps["MaxVgpr"]: - raise RuntimeError("Generating asm kernel error: total vgpr: %u not in [0, %u].\n" % (self.states.totalVgprs, self.states.regCaps["MaxVgpr"])) - - agprLimit = self.states.regCaps["PhysicalMaxVgpr"] - self.states.regCaps["MaxVgpr"] - if self.states.totalAgprs > agprLimit: - raise RuntimeError("Generating asm kernel error: total agpr: %u not in [0, %u].\n" % (self.states.totalAgprs, agprLimit) ) + # Dispatch to different VGPR allocation logic for subtile-based impl + if kernel["UseSubtileImpl"]: + vgprAllocationImplSubtile() + else: + vgprAllocationImplClassic() ######################################## # SGPR 
Allocation @@ -7695,6 +8145,7 @@ def checkVregOverflowTF32Emu(vgprIdx, numV): self.defineSgpr("DebugKernelItems", 1) # the sgprs overlap with wg ids + # TODO: For subtileimpl, consider shadowInit param as well if self.states.doShadowInit and kernel["BufferStore"]: self.defineSgpr("SrdD", 4, 4) self.defineSgpr("SrdC", 4, 4) @@ -7791,19 +8242,20 @@ def checkVregOverflowTF32Emu(vgprIdx, numV): self.defineSgpr("skTiles", 1) self.states.numSgprStreamK += 2 - if kernel["LocalWriteUseSgprA"]: + if not kernel["UseSubtileImpl"]: + if kernel["LocalWriteUseSgprA"]: self.defineSgpr("LocalWriteAddrA", 1) - if kernel["LocalWriteUseSgprB"]: + if kernel["LocalWriteUseSgprB"]: self.defineSgpr("LocalWriteAddrB", 1) - if kernel["ProblemType"]["MXBlockA"] and kernel["LocalWriteUseSgprMXSA"]: - self.defineSgpr("LocalWriteAddrMXSA", 1) - if kernel["ProblemType"]["MXBlockB"] and kernel["LocalWriteUseSgprMXSB"]: - self.defineSgpr("LocalWriteAddrMXSB", 1) + if kernel["ProblemType"]["MXBlockA"] and kernel["LocalWriteUseSgprMXSA"]: + self.defineSgpr("LocalWriteAddrMXSA", 1) + if kernel["ProblemType"]["MXBlockB"] and kernel["LocalWriteUseSgprMXSB"]: + self.defineSgpr("LocalWriteAddrMXSB", 1) # Allocate registers to swap between lds buffers - if self.states.useCommonSgprSwap: + if self.states.useCommonSgprSwap and not kernel["UseSubtileImpl"]: self.defineSgpr("SwapCommon", 1) - elif kernel["StoreSwapAddr"]: + elif not kernel["UseSubtileImpl"] and (kernel["StoreSwapAddr"]): if kernel["LocalWriteUseSgprA"]: self.defineSgpr("SwapA", 1) if kernel["LocalWriteUseSgprB"]: @@ -7826,23 +8278,65 @@ def checkVregOverflowTF32Emu(vgprIdx, numV): if kernel["GlobalSplitU"] != 0: self.defineSgpr("GSU", 1) # Can't move to the front because of the preload arguments + # Collect SGPRs to allocate via the deferred interleaved loop. + # Using a list allows allocation order to be controlled, minimising + # alignment holes (e.g. 
keeping the pool on a 4-aligned boundary + # immediately before any 4-aligned SrdWS allocation). + requiredUnalignedSgprVar = [] + requiredAligned4SgprVar = [] + if kernel["StreamK"]: - # StreamK vars. if not self.isStreamKConstantsToVgprEnabled(kernel): - self.defineSgpr("StreamKIdx", 1) - self.defineSgpr("StreamKIter", 1) - self.defineSgpr("StreamKIterEnd", 1) - self.defineSgpr("StreamKLocalStart", 1) - self.defineSgpr("StreamKLocalEnd", 1) + requiredUnalignedSgprVar.append("StreamKIdx") + requiredUnalignedSgprVar += [ + "StreamKIter", + "StreamKIterEnd", + "StreamKLocalStart", + "StreamKLocalEnd", + ] if len(kernel["SpaceFillingAlgo"]): - self.defineSgpr("StreamKTileID", 1) + requiredUnalignedSgprVar.append("StreamKTileID") if kernel["StreamKAtomic"] == 0: - self.defineSgpr("SrdWS", 4, 4) + requiredAligned4SgprVar.append("SrdWS") + + if kernel["UseSubtileImpl"]: + requiredUnalignedSgprVar.append("LocalWriteBaseAddrA") + requiredUnalignedSgprVar.append("LocalWriteBaseAddrB") + if kernel["ProblemType"]["MXBlockA"]: + requiredUnalignedSgprVar.append("LocalWriteBaseAddrMXSA") + if kernel["ProblemType"]["MXBlockB"]: + requiredUnalignedSgprVar.append("LocalWriteBaseAddrMXSB") + requiredUnalignedSgprVar.append("SwapA") + requiredUnalignedSgprVar.append("SwapB") + if kernel["ProblemType"]["MXBlockA"]: + requiredUnalignedSgprVar.append("SwapMXSA") + if kernel["ProblemType"]["MXBlockB"]: + requiredUnalignedSgprVar.append("SwapMXSB") + if kernel["ProblemType"]["Sparse"] and kernel["LocalWriteUseSgprMetadata"]: + requiredUnalignedSgprVar.append("SwapMetadata") + + # Actual allocation: prioritise 4-aligned SGPRs whenever the pool is + # already on a 4-aligned boundary, otherwise consume unaligned ones. 
+ while len(requiredUnalignedSgprVar) or len(requiredAligned4SgprVar): + if self.sgprPool.size() % 4 == 0 and len(requiredAligned4SgprVar): + var = requiredAligned4SgprVar.pop() + self.defineSgpr(var, 4, 4) + elif len(requiredUnalignedSgprVar): + var = requiredUnalignedSgprVar.pop() + self.defineSgpr(var, 1) + # These SGPRs aren't used right away, add them to sgpr pool temporarily if self.states.doShadowInit and kernel["BufferStore"]: self.addSgprVarToPool("SrdC") if kernel["StreamK"] and kernel["StreamKAtomic"] == 0: self.addSgprVarToPool("SrdWS") + + if kernel["BAddrInterleave"]: + self.defineSgpr("BInterleaveG", 1) + self.addSgprVarToPool("BInterleaveG") + if kernel["KRingShift"]: + self.defineSgpr("KRingShift", 1) + self.addSgprVarToPool("KRingShift") # gfx1250 frees the SK constant SGPRs later in moveStreamKConstantsToVgpr # after their values have been copied to VGPRs. Freeing them here would let # temp allocs clobber kernel arguments before they are copied. @@ -8824,13 +9318,16 @@ def _getKernelSource(self, kernel: Solution): """ Returns the source of the kernel, either C++ or assembly. 
""" - fileString = "" tensorParametersA = {} tensorParametersB = {} self._initKernel(kernel, tensorParametersA, tensorParametersB) self.stringIdx = 0 - (error, kb) = self.kernelBody(kernel, tensorParametersA, tensorParametersB) + if not kernel["UseSubtileImpl"]: + (error, kb) = self.kernelBody(kernel, tensorParametersA, tensorParametersB) + else: + (error, kb) = self.kernelBodySubtile(kernel, tensorParametersA, tensorParametersB) + fileString += str(kb) if error != 0: diff --git a/projects/hipblaslt/tensilelite/Tensile/KernelWriterAssembly.py b/projects/hipblaslt/tensilelite/Tensile/KernelWriterAssembly.py index 83bfc7ec360..63e2c22fed7 100644 --- a/projects/hipblaslt/tensilelite/Tensile/KernelWriterAssembly.py +++ b/projects/hipblaslt/tensilelite/Tensile/KernelWriterAssembly.py @@ -31,7 +31,7 @@ MUBUFModifiers, SMEMModifiers, EXEC, VCC, RegisterContainer, \ DPPModifiers, vgpr, sgpr, accvgpr, mgpr, ContinuousRegister, \ HWRegContainer, GLOBALModifiers, MemTokenData -from rocisa.instruction import SGetPositivePCOffset, SLongBranchPositive, SCLongBranchScc0, SCLongBranchScc1, SCLongBranchVccnz, \ +from rocisa.instruction import SGetPositivePCOffset, SLongBranch, SLongBranchPositive, SLongBranchNegative, SCLongBranchScc0, SCLongBranchScc1, SCLongBranchVccnz, \ SMulInt64to32, VCvtBF16toFP32 from rocisa.functions import vectorStaticDivide, vectorStaticRemainder, vectorUInt32CeilDivideAndRemainder, \ vectorStaticDivideAndRemainder, scalarStaticDivideAndRemainder, scalarStaticCeilDivide, \ @@ -591,6 +591,20 @@ def removeGRSrdVariableSgprsFromPool(self, kernel): return module + def undefineSubtileMainLoopSgprs(self, kernel): + """Undefine SGPRs used only during the main loop that are not needed in the post-loop. 
+ Called for subtile kernels after deallocOffsetRegisters and before endSummation/post-loop.""" + module = Module("UndefineSubtileMainLoopSgprs") + sgprsToUndefine = [ + "LocalWriteBaseAddrA", "LocalWriteBaseAddrB", + "LocalWriteBaseAddrMXSA", "LocalWriteBaseAddrMXSB", + "SwapA", "SwapB", "SwapMXSA", "SwapMXSB", + ] + for name in sgprsToUndefine: + if name in self.sgprs: + module.add(self.undefineSgpr(name)) + return module + def removeGROffsetsVariableSgprsFromPool(self, kernel): module = Module("RemoveGROffsetSgprsFromPool") @@ -873,511 +887,524 @@ def macroAndSet(self, kernel, tPA, tPB) -> Module: ######################################## # VGPR Macros ######################################## - module.addComment2("VGPR Assignments for MX") - module.add(RegSet("v", "vgprMXSBase", 0)) - - moduleVgprMacroMXS = Module("VALU/G2L Vgpr Macro") - moduleVgprMacroValuMXSDummy = Module("VALU Dummy Vgpr Macro") - moduleVgprMacroValuMXSA = Module("VALUA Vgpr Macro") - moduleVgprMacroValuMXSB = Module("VALUB Vgpr Macro") - moduleVgprMacroValuMXSAPack = Module("VALUA Pack Vgpr Macro") - moduleVgprMacroValuMXSBPack = Module("VALUB Pack Vgpr Macro") - moduleVgprMacroG2LMXSA = Module("G2LA Vgpr Macro") - moduleVgprMacroG2LMXSB = Module("G2LB Vgpr Macro") + def macroAndSetImplClassic(): - if bool(kernel["ProblemType"]["MXBlockA"]) ^ bool(kernel["ProblemType"]["MXBlockB"]): - moduleVgprMacroValuMXSDummy.add(RegSet("v", "vgprValuMXSDummy", "vgprMXSBase", 0)) + module.addComment2("VGPR Assignments for MX") + module.add(RegSet("v", "vgprMXSBase", 0)) + + moduleVgprMacroMXS = Module("VALU/G2L Vgpr Macro") + moduleVgprMacroValuMXSDummy = Module("VALU Dummy Vgpr Macro") + moduleVgprMacroValuMXSA = Module("VALUA Vgpr Macro") + moduleVgprMacroValuMXSB = Module("VALUB Vgpr Macro") + moduleVgprMacroValuMXSAPack = Module("VALUA Pack Vgpr Macro") + moduleVgprMacroValuMXSBPack = Module("VALUB Pack Vgpr Macro") + moduleVgprMacroG2LMXSA = Module("G2LA Vgpr Macro") + moduleVgprMacroG2LMXSB = 
Module("G2LB Vgpr Macro") + + if bool(kernel["ProblemType"]["MXBlockA"]) ^ bool(kernel["ProblemType"]["MXBlockB"]): + moduleVgprMacroValuMXSDummy.add(RegSet("v", "vgprValuMXSDummy", "vgprMXSBase", 0)) + + if kernel["ProblemType"]["MXBlockA"]: + ri = 0 + if self.states.mxsa.numVgprValu > 0: # Do not generate vgprValuMXSA if numVgprValuA is 0 + numBiFactor = numBi + if kernel["DirectToVgprA"] and (self.states.packDTVA or self.states.convDTVA): + # DirectToVgpr case, we need LoopIters * 2 buffers + numBiFactor = kernel["LoopIters"] * 2 + if self.states.lrvwTileMXSA > 1: + moduleVgprMacroMXS.add(RegSet("v", "vgprValuMXSA_X0_I0_BASE", "vgprMXSBase", self.states.mxsa.startVgprValu)) + for bi in range(0,numBiFactor): # buffer indices + for iui in range(0, kernel["InnerUnroll"]): + moduleVgprMacroValuMXSA.add(RegSet("v", "vgprValuMXSA_X%u_I%u"%(bi,iui), "vgprValuMXSA_X0_I0_BASE", ri)) + ri += self.states.mxsa.numVgprValuPerBlock + if not kernel["UnrollMajorLDSA"]: + ri = 0 + ri = 0 + if not kernel["UnrollMajorLDSA"]: + moduleVgprMacroMXS.add(RegSet("v", "vgprValuMXSA_X0_I0_D0_PACK", "vgprMXSBase", self.states.mxsa.startVgprValuPack)) + for bi in range(0,numBiFactor): # buffer indices + for iui in range(0, kernel["InnerUnroll"]): + for data in range(0,kernel["MIInputPerThreadMXSA"]): + moduleVgprMacroValuMXSAPack.add(RegSet("v", "vgprValuMXSA_X%u_I%u_D%u"%(bi,iui,data),"vgprValuMXSA_X0_I0_D0_PACK", ri)) + ri += ceil(kernel["VectorWidthMXSA"] / self.states.bpr) * kernel["MIWaveTileA"] // kernel["VectorWidthMXSA"] + else: + moduleVgprMacroMXS.add(RegSet("v", "vgprValuMXSA_X0_I0_BASE", "vgprMXSBase", self.states.mxsa.startVgprValu)) + for bi in range(0,numBiFactor): # buffer indices + for iui in range(0, kernel["InnerUnroll"]): + moduleVgprMacroValuMXSA.add(RegSet("v", "vgprValuMXSA_X%u_I%u"%(bi,iui), "vgprValuMXSA_X0_I0_BASE", ri)) + ri += self.states.mxsa.numVgprValuPerBlock + ri = 0 + if not kernel["UnrollMajorLDSA"]: + moduleVgprMacroMXS.add(RegSet("v", 
"vgprValuMXSA_X0_I0_D0_PACK", "vgprMXSBase", self.states.mxsa.startVgprValuPack)) + for data in range(1,int(self.states.bpr)): + for bi in range(0,numBiFactor): # buffer indices + if bi % self.states.numVgprBufferPackMXSA == 0: + ri = (data-1) * kernel["InnerUnroll"] * self.states.numVgprBufferPackMXSA * self.states.mxsa.numVgprValuPerBlock + for iui in range(0, kernel["InnerUnroll"]): + moduleVgprMacroValuMXSAPack.add(RegSet("v", "vgprValuMXSA_X%u_I%u_D%u"%(bi,iui,data),"vgprValuMXSA_X0_I0_D0_PACK", ri)) + ri += self.states.mxsa.numVgprValuPerBlock + + if kernel["ProblemType"]["MXBlockB"]: + ri = 0 + if self.states.mxsb.numVgprValu > 0: # Do not generate vgprValuMXSB if numVgprValuB is 0 + numBiFactor = numBi + if kernel["DirectToVgprB"] and (self.states.packDTVB or self.states.convDTVB): + # DirectToVgpr case, we need LoopIters * 2 buffers + numBiFactor = kernel["LoopIters"] * 2 + if self.states.lrvwTileMXSB > 1: + moduleVgprMacroMXS.add(RegSet("v", "vgprValuMXSB_X0_I0_BASE", "vgprMXSBase", self.states.mxsb.startVgprValu)) + for bi in range(0,numBiFactor): # buffer indices + for iui in range(0, kernel["InnerUnroll"]): + moduleVgprMacroValuMXSB.add(RegSet("v", "vgprValuMXSB_X%u_I%u"%(bi,iui), "vgprValuMXSB_X0_I0_BASE", ri)) + ri += self.states.mxsb.numVgprValuPerBlock + if not kernel["UnrollMajorLDSB"]: + ri = 0 + ri = 0 + if not kernel["UnrollMajorLDSB"]: + moduleVgprMacroMXS.add(RegSet("v", "vgprValuMXSB_X0_I0_D0_PACK", "vgprMXSBase", self.states.mxsb.startVgprValuPack)) + for bi in range(0,numBiFactor): # buffer indices + for iui in range(0, kernel["InnerUnroll"]): + for data in range(0,kernel["MIInputPerThreadMXSB"]): + moduleVgprMacroValuMXSBPack.add(RegSet("v", "vgprValuMXSB_X%u_I%u_D%u"%(bi,iui,data), "vgprValuMXSB_X0_I0_D0_PACK", ri)) + ri += ceil(kernel["VectorWidthMXSB"] / self.states.bpr) * kernel["MIWaveTileB"] // kernel["VectorWidthMXSB"] + else: + moduleVgprMacroMXS.add(RegSet("v", "vgprValuMXSB_X0_I0_BASE", "vgprMXSBase", 
self.states.mxsb.startVgprValu)) + for bi in range(0,numBiFactor): # buffer indices + for iui in range(0, kernel["InnerUnroll"]): + moduleVgprMacroValuMXSB.add(RegSet("v", "vgprValuMXSB_X%u_I%u"%(bi,iui), "vgprValuMXSB_X0_I0_BASE", ri)) + ri += self.states.mxsb.numVgprValuPerBlock + ri = 0 + if not kernel["UnrollMajorLDSB"]: + moduleVgprMacroMXS.add(RegSet("v", "vgprValuMXSB_X0_I0_D0_PACK", "vgprMXSBase", self.states.mxsb.startVgprValuPack)) + for data in range(1,int(self.states.bpr)): + for bi in range(0,numBiFactor): # buffer indices + if bi % self.states.numVgprBufferPackMXSB == 0: + ri = (data-1) * kernel["InnerUnroll"] * self.states.numVgprBufferPackMXSB * self.states.mxsb.numVgprValuPerBlock + for iui in range(0, kernel["InnerUnroll"]): + moduleVgprMacroValuMXSBPack.add(RegSet("v", "vgprValuMXSB_X%u_I%u_D%u"%(bi,iui,data), "vgprValuMXSB_X0_I0_D0_PACK", ri)) + ri += self.states.mxsb.numVgprValuPerBlock + + if kernel["ProblemType"]["MXBlockA"] and (self.states.mxsa.startVgprG2L is not None): + moduleVgprMacroMXS.add(RegSet("v", "vgprG2LMXSA_BASE", "vgprMXSBase", self.states.mxsa.startVgprG2L)) + if kernel["ProblemType"]["MXBlockB"] and (self.states.mxsb.startVgprG2L is not None): + moduleVgprMacroMXS.add(RegSet("v", "vgprG2LMXSB_BASE", "vgprMXSBase", self.states.mxsb.startVgprG2L)) + + if kernel["ProblemType"]["MXBlockA"]: + if not kernel["DirectToLdsMXSA"] or self.do["KeepDirectToLdsAlloc"]: + moduleVgprMacroG2LMXSA.add(RegSet("v", "vgprG2LMXSA", "vgprG2LMXSA_BASE", 0)) + if kernel["DirectToVgprMXSA"]: + # additional definition G2LA2 for swapping register sets + moduleVgprMacroG2LMXSA.add(RegSet("v", "vgprG2LMXSA2", "vgprG2LMXSA_BASE", self.states.mxsa.numVgprG2LAllocated//2)) + + if kernel["ProblemType"]["MXBlockB"]: + if not kernel["DirectToLdsMXSB"] or self.do["KeepDirectToLdsAlloc"]: + moduleVgprMacroG2LMXSB.add(RegSet("v", "vgprG2LMXSB", "vgprG2LMXSB_BASE", 0)) + if kernel["DirectToVgprB"]: + # additional definition G2LB2 for swapping register sets + 
moduleVgprMacroG2LMXSB.add(RegSet("v", "vgprG2LMXSB2", "vgprG2LMXSB_BASE", self.states.mxsb.numVgprG2LAllocated//2)) + + self.moduleVgprMacroMXS = moduleVgprMacroMXS + self.moduleVgprMacroValuMXSDummy = moduleVgprMacroValuMXSDummy + self.moduleVgprMacroValuMXSA = moduleVgprMacroValuMXSA + self.moduleVgprMacroValuMXSB = moduleVgprMacroValuMXSB + self.moduleVgprMacroValuMXSAPack = moduleVgprMacroValuMXSAPack + self.moduleVgprMacroValuMXSBPack = moduleVgprMacroValuMXSBPack + self.moduleVgprMacroG2LMXSA = moduleVgprMacroG2LMXSA + self.moduleVgprMacroG2LMXSB = moduleVgprMacroG2LMXSB + module.addComment2("VGPR Macro Assignments for MX") + module.add(self.moduleVgprMacroMXS) + module.add(moduleVgprMacroValuMXSDummy) + module.add(self.moduleVgprMacroValuMXSA) + module.add(self.moduleVgprMacroValuMXSB) + module.add(self.moduleVgprMacroValuMXSAPack) + module.add(self.moduleVgprMacroValuMXSBPack) + module.add(self.moduleVgprMacroG2LMXSA) + module.add(self.moduleVgprMacroG2LMXSB) + + module.addComment2("VGPR Assignments") + module.addComment0("ValuC range: [%u-%u), %s"%(self.states.c.startVgprValu, self.states.c.startVgprValu+self.states.c.numVgprValu, \ + "serializedStore enabled" if self.states.serializedStore else "")) + module.add(RegSet("v", "vgprValuC", self.states.c.startVgprValu)) + + module.addComment0("ValuA/B Xn=PLR buffer idx, In=InnerUnroll idx") + # PLR index: from X0 to X (at most) -> VGPRs will be duplicated LoopIters times (at most) + # eg, if LoopIters = 4, there would be at most 4*VGPRs + moduleVgprMacro = Module("VALU/G2L Vgpr Macro") + moduleVgprMacroValuA = Module("VALUA Vgpr Macro") + moduleVgprMacroValuB = Module("VALUB Vgpr Macro") + moduleVgprMacroValuAPack = Module("VALUA Pack Vgpr Macro") + moduleVgprMacroValuBPack = Module("VALUB Pack Vgpr Macro") + moduleVgprMacroValuM = Module("VALUMetadata Vgpr Macro") + moduleVgprMacroValuMPack = Module("VALUMetadata Pack Vgpr Macro") + moduleVgprMacroG2LA = Module("G2LA Vgpr Macro") + moduleVgprMacroG2LB = 
Module("G2LB Vgpr Macro") + module.add(RegSet("v", "vgprBase", self.states.startVgpr)) - if kernel["ProblemType"]["MXBlockA"]: ri = 0 - if self.states.mxsa.numVgprValu > 0: # Do not generate vgprValuMXSA if numVgprValuA is 0 + if self.states.a.numVgprValu > 0: # Do not generate vgprValuA if numVgprValuA is 0 numBiFactor = numBi if kernel["DirectToVgprA"] and (self.states.packDTVA or self.states.convDTVA): # DirectToVgpr case, we need LoopIters * 2 buffers numBiFactor = kernel["LoopIters"] * 2 - if self.states.lrvwTileMXSA > 1: - moduleVgprMacroMXS.add(RegSet("v", "vgprValuMXSA_X0_I0_BASE", "vgprMXSBase", self.states.mxsa.startVgprValu)) + if self.states.lrvwTileA > 1: + moduleVgprMacro.add(RegSet("v", "vgprValuA_X0_I0_BASE", "vgprBase", self.states.a.startVgprValu - self.states.startVgpr)) for bi in range(0,numBiFactor): # buffer indices for iui in range(0, kernel["InnerUnroll"]): - moduleVgprMacroValuMXSA.add(RegSet("v", "vgprValuMXSA_X%u_I%u"%(bi,iui), "vgprValuMXSA_X0_I0_BASE", ri)) - ri += self.states.mxsa.numVgprValuPerBlock - if not kernel["UnrollMajorLDSA"]: + moduleVgprMacroValuA.add(RegSet("v", "vgprValuA_X%u_I%u"%(bi,iui), "vgprValuA_X0_I0_BASE", ri)) + ri += self.states.a.numVgprValuPerBlock + if tPA["bpe"] < 4 and not kernel["UnrollMajorLDSA"] and not (kernel["UsePLRPack"] and self.states.numItersPLR): ri = 0 ri = 0 - if not kernel["UnrollMajorLDSA"]: - moduleVgprMacroMXS.add(RegSet("v", "vgprValuMXSA_X0_I0_D0_PACK", "vgprMXSBase", self.states.mxsa.startVgprValuPack)) + if tPA["bpe"] < 4 and not kernel["UnrollMajorLDSA"] and not kernel["enableLDSTrA"]: + moduleVgprMacro.add(RegSet("v", "vgprValuA_X0_I0_D0_PACK", "vgprBase", self.states.a.startVgprValuPack - self.states.startVgpr)) for bi in range(0,numBiFactor): # buffer indices for iui in range(0, kernel["InnerUnroll"]): - for data in range(0,kernel["MIInputPerThreadMXSA"]): - moduleVgprMacroValuMXSAPack.add(RegSet("v", "vgprValuMXSA_X%u_I%u_D%u"%(bi,iui,data),"vgprValuMXSA_X0_I0_D0_PACK", ri)) - ri += 
ceil(kernel["VectorWidthMXSA"] / self.states.bpr) * kernel["MIWaveTileA"] // kernel["VectorWidthMXSA"] + for data in range(0,kernel["MIInputPerThreadA"]): + moduleVgprMacroValuAPack.add(RegSet("v", "vgprValuA_X%u_I%u_D%u"%(bi,iui,data),"vgprValuA_X0_I0_D0_PACK", ri)) + ri += ceil(kernel["VectorWidthA"] * tPA["bpe"] / self.states.bpr) * kernel["MIWaveTileA"] // kernel["VectorWidthA"] + if (kernel["UsePLRPack"] and self.states.numItersPLR): + ri = 0 else: - moduleVgprMacroMXS.add(RegSet("v", "vgprValuMXSA_X0_I0_BASE", "vgprMXSBase", self.states.mxsa.startVgprValu)) + moduleVgprMacro.add(RegSet("v", "vgprValuA_X0_I0_BASE", "vgprBase", self.states.a.startVgprValu - self.states.startVgpr)) for bi in range(0,numBiFactor): # buffer indices for iui in range(0, kernel["InnerUnroll"]): - moduleVgprMacroValuMXSA.add(RegSet("v", "vgprValuMXSA_X%u_I%u"%(bi,iui), "vgprValuMXSA_X0_I0_BASE", ri)) - ri += self.states.mxsa.numVgprValuPerBlock + moduleVgprMacroValuA.add(RegSet("v", "vgprValuA_X%u_I%u"%(bi,iui), "vgprValuA_X0_I0_BASE", ri)) + ri += self.states.a.numVgprValuPerBlock ri = 0 - if not kernel["UnrollMajorLDSA"]: - moduleVgprMacroMXS.add(RegSet("v", "vgprValuMXSA_X0_I0_D0_PACK", "vgprMXSBase", self.states.mxsa.startVgprValuPack)) - for data in range(1,int(self.states.bpr)): + if tPA["bpe"] < 4 and not kernel["UnrollMajorLDSA"] and not kernel["enableLDSTrA"]: + moduleVgprMacro.add(RegSet("v", "vgprValuA_X0_I0_D0_PACK", "vgprBase", self.states.a.startVgprValuPack - self.states.startVgpr)) + for data in range(1,int(self.states.bpr/tPA["bpeDS"])): for bi in range(0,numBiFactor): # buffer indices - if bi % self.states.numVgprBufferPackMXSA == 0: - ri = (data-1) * kernel["InnerUnroll"] * self.states.numVgprBufferPackMXSA * self.states.mxsa.numVgprValuPerBlock + if bi % self.states.numVgprBufferPackA == 0: + ri = (data-1) * kernel["InnerUnroll"] * self.states.numVgprBufferPackA * self.states.a.numVgprValuPerBlock for iui in range(0, kernel["InnerUnroll"]): - 
moduleVgprMacroValuMXSAPack.add(RegSet("v", "vgprValuMXSA_X%u_I%u_D%u"%(bi,iui,data),"vgprValuMXSA_X0_I0_D0_PACK", ri)) - ri += self.states.mxsa.numVgprValuPerBlock + moduleVgprMacroValuAPack.add(RegSet("v", "vgprValuA_X%u_I%u_D%u"%(bi,iui,data),"vgprValuA_X0_I0_D0_PACK", ri)) + ri += self.states.a.numVgprValuPerBlock - if kernel["ProblemType"]["MXBlockB"]: ri = 0 - if self.states.mxsb.numVgprValu > 0: # Do not generate vgprValuMXSB if numVgprValuB is 0 + if self.states.b.numVgprValu > 0: # Do not generate vgprValuB if numVgprValuB is 0 numBiFactor = numBi if kernel["DirectToVgprB"] and (self.states.packDTVB or self.states.convDTVB): # DirectToVgpr case, we need LoopIters * 2 buffers numBiFactor = kernel["LoopIters"] * 2 - if self.states.lrvwTileMXSB > 1: - moduleVgprMacroMXS.add(RegSet("v", "vgprValuMXSB_X0_I0_BASE", "vgprMXSBase", self.states.mxsb.startVgprValu)) + if self.states.lrvwTileB > 1: + moduleVgprMacro.add(RegSet("v", "vgprValuB_X0_I0_BASE", "vgprBase", self.states.b.startVgprValu - self.states.startVgpr)) for bi in range(0,numBiFactor): # buffer indices for iui in range(0, kernel["InnerUnroll"]): - moduleVgprMacroValuMXSB.add(RegSet("v", "vgprValuMXSB_X%u_I%u"%(bi,iui), "vgprValuMXSB_X0_I0_BASE", ri)) - ri += self.states.mxsb.numVgprValuPerBlock - if not kernel["UnrollMajorLDSB"]: + moduleVgprMacroValuB.add(RegSet("v", "vgprValuB_X%u_I%u"%(bi,iui), "vgprValuB_X0_I0_BASE", ri)) + ri += self.states.b.numVgprValuPerBlock + if (tPB["bpe"] < 4 and not kernel["UnrollMajorLDSB"]) and not (kernel["UsePLRPack"] and self.states.numItersPLR): ri = 0 ri = 0 - if not kernel["UnrollMajorLDSB"]: - moduleVgprMacroMXS.add(RegSet("v", "vgprValuMXSB_X0_I0_D0_PACK", "vgprMXSBase", self.states.mxsb.startVgprValuPack)) + if tPB["bpe"] < 4 and not kernel["UnrollMajorLDSB"] and not kernel["enableLDSTrB"]: + moduleVgprMacro.add(RegSet("v", "vgprValuB_X0_I0_D0_PACK", "vgprBase", self.states.b.startVgprValuPack - self.states.startVgpr)) for bi in range(0,numBiFactor): # buffer 
indices for iui in range(0, kernel["InnerUnroll"]): - for data in range(0,kernel["MIInputPerThreadMXSB"]): - moduleVgprMacroValuMXSBPack.add(RegSet("v", "vgprValuMXSB_X%u_I%u_D%u"%(bi,iui,data), "vgprValuMXSB_X0_I0_D0_PACK", ri)) - ri += ceil(kernel["VectorWidthMXSB"] / self.states.bpr) * kernel["MIWaveTileB"] // kernel["VectorWidthMXSB"] + for data in range(0,kernel["MIInputPerThreadB"]): + moduleVgprMacroValuBPack.add(RegSet("v", "vgprValuB_X%u_I%u_D%u"%(bi,iui,data), "vgprValuB_X0_I0_D0_PACK", ri)) + ri += ceil(kernel["VectorWidthB"] * tPB["bpe"] / self.states.bpr) * kernel["MIWaveTileB"] // kernel["VectorWidthB"] + if (kernel["UsePLRPack"] and self.states.numItersPLR): + ri = 0 else: - moduleVgprMacroMXS.add(RegSet("v", "vgprValuMXSB_X0_I0_BASE", "vgprMXSBase", self.states.mxsb.startVgprValu)) + moduleVgprMacro.add(RegSet("v", "vgprValuB_X0_I0_BASE", "vgprBase", self.states.b.startVgprValu - self.states.startVgpr)) for bi in range(0,numBiFactor): # buffer indices for iui in range(0, kernel["InnerUnroll"]): - moduleVgprMacroValuMXSB.add(RegSet("v", "vgprValuMXSB_X%u_I%u"%(bi,iui), "vgprValuMXSB_X0_I0_BASE", ri)) - ri += self.states.mxsb.numVgprValuPerBlock + moduleVgprMacroValuB.add(RegSet("v", "vgprValuB_X%u_I%u"%(bi,iui), "vgprValuB_X0_I0_BASE", ri)) + ri += self.states.b.numVgprValuPerBlock ri = 0 - if not kernel["UnrollMajorLDSB"]: - moduleVgprMacroMXS.add(RegSet("v", "vgprValuMXSB_X0_I0_D0_PACK", "vgprMXSBase", self.states.mxsb.startVgprValuPack)) - for data in range(1,int(self.states.bpr)): + if tPB["bpe"] < 4 and not kernel["UnrollMajorLDSB"] and not kernel["enableLDSTrB"]: + moduleVgprMacro.add(RegSet("v", "vgprValuB_X0_I0_D0_PACK", "vgprBase", self.states.b.startVgprValuPack - self.states.startVgpr)) + for data in range(1,int(self.states.bpr/tPB["bpeDS"])): for bi in range(0,numBiFactor): # buffer indices - if bi % self.states.numVgprBufferPackMXSB == 0: - ri = (data-1) * kernel["InnerUnroll"] * self.states.numVgprBufferPackMXSB * 
self.states.mxsb.numVgprValuPerBlock + if bi % self.states.numVgprBufferPackB == 0: + ri = (data-1) * kernel["InnerUnroll"] * self.states.numVgprBufferPackB * self.states.b.numVgprValuPerBlock for iui in range(0, kernel["InnerUnroll"]): - moduleVgprMacroValuMXSBPack.add(RegSet("v", "vgprValuMXSB_X%u_I%u_D%u"%(bi,iui,data), "vgprValuMXSB_X0_I0_D0_PACK", ri)) - ri += self.states.mxsb.numVgprValuPerBlock + moduleVgprMacroValuBPack.add(RegSet("v", "vgprValuB_X%u_I%u_D%u"%(bi,iui,data), "vgprValuB_X0_I0_D0_PACK", ri)) + ri += self.states.b.numVgprValuPerBlock - if kernel["ProblemType"]["MXBlockA"] and (self.states.mxsa.startVgprG2L is not None): - moduleVgprMacroMXS.add(RegSet("v", "vgprG2LMXSA_BASE", "vgprMXSBase", self.states.mxsa.startVgprG2L)) - if kernel["ProblemType"]["MXBlockB"] and (self.states.mxsb.startVgprG2L is not None): - moduleVgprMacroMXS.add(RegSet("v", "vgprG2LMXSB_BASE", "vgprMXSBase", self.states.mxsb.startVgprG2L)) - if kernel["ProblemType"]["MXBlockA"]: - if not kernel["DirectToLdsMXSA"] or self.do["KeepDirectToLdsAlloc"]: - moduleVgprMacroG2LMXSA.add(RegSet("v", "vgprG2LMXSA", "vgprG2LMXSA_BASE", 0)) - if kernel["DirectToVgprMXSA"]: - # additional definition G2LA2 for swapping register sets - moduleVgprMacroG2LMXSA.add(RegSet("v", "vgprG2LMXSA2", "vgprG2LMXSA_BASE", self.states.mxsa.numVgprG2LAllocated//2)) + # T reg definition for F32XEmu + self.macroAndSetF32XEmuTreg(kernel, tPA, tPB) - if kernel["ProblemType"]["MXBlockB"]: - if not kernel["DirectToLdsMXSB"] or self.do["KeepDirectToLdsAlloc"]: - moduleVgprMacroG2LMXSB.add(RegSet("v", "vgprG2LMXSB", "vgprG2LMXSB_BASE", 0)) - if kernel["DirectToVgprB"]: - # additional definition G2LB2 for swapping register sets - moduleVgprMacroG2LMXSB.add(RegSet("v", "vgprG2LMXSB2", "vgprG2LMXSB_BASE", self.states.mxsb.numVgprG2LAllocated//2)) - - self.moduleVgprMacroMXS = moduleVgprMacroMXS - self.moduleVgprMacroValuMXSDummy = moduleVgprMacroValuMXSDummy - self.moduleVgprMacroValuMXSA = moduleVgprMacroValuMXSA - 
self.moduleVgprMacroValuMXSB = moduleVgprMacroValuMXSB - self.moduleVgprMacroValuMXSAPack = moduleVgprMacroValuMXSAPack - self.moduleVgprMacroValuMXSBPack = moduleVgprMacroValuMXSBPack - self.moduleVgprMacroG2LMXSA = moduleVgprMacroG2LMXSA - self.moduleVgprMacroG2LMXSB = moduleVgprMacroG2LMXSB - module.addComment2("VGPR Macro Assignments for MX") - module.add(self.moduleVgprMacroMXS) - module.add(moduleVgprMacroValuMXSDummy) - module.add(self.moduleVgprMacroValuMXSA) - module.add(self.moduleVgprMacroValuMXSB) - module.add(self.moduleVgprMacroValuMXSAPack) - module.add(self.moduleVgprMacroValuMXSBPack) - module.add(self.moduleVgprMacroG2LMXSA) - module.add(self.moduleVgprMacroG2LMXSB) - - module.addComment2("VGPR Assignments") - module.addComment0("ValuC range: [%u-%u), %s"%(self.states.c.startVgprValu, self.states.c.startVgprValu+self.states.c.numVgprValu, \ - "serializedStore enabled" if self.states.serializedStore else "")) - module.add(RegSet("v", "vgprValuC", self.states.c.startVgprValu)) - - module.addComment0("ValuA/B Xn=PLR buffer idx, In=InnerUnroll idx") - # PLR index: from X0 to X (at most) -> VGPRs will be duplicated LoopIters times (at most) - # eg, if LoopIters = 4, there would be at most 4*VGPRs - moduleVgprMacro = Module("VALU/G2L Vgpr Macro") - moduleVgprMacroValuA = Module("VALUA Vgpr Macro") - moduleVgprMacroValuB = Module("VALUB Vgpr Macro") - moduleVgprMacroValuAPack = Module("VALUA Pack Vgpr Macro") - moduleVgprMacroValuBPack = Module("VALUB Pack Vgpr Macro") - moduleVgprMacroValuM = Module("VALUMetadata Vgpr Macro") - moduleVgprMacroValuMPack = Module("VALUMetadata Pack Vgpr Macro") - moduleVgprMacroG2LA = Module("G2LA Vgpr Macro") - moduleVgprMacroG2LB = Module("G2LB Vgpr Macro") - module.add(RegSet("v", "vgprBase", self.states.startVgpr)) - - ri = 0 - if self.states.a.numVgprValu > 0: # Do not generate vgprValuA if numVgprValuA is 0 - numBiFactor = numBi - if kernel["DirectToVgprA"] and (self.states.packDTVA or self.states.convDTVA): - # 
DirectToVgpr case, we need LoopIters * 2 buffers - numBiFactor = kernel["LoopIters"] * 2 - if self.states.lrvwTileA > 1: - moduleVgprMacro.add(RegSet("v", "vgprValuA_X0_I0_BASE", "vgprBase", self.states.a.startVgprValu - self.states.startVgpr)) - for bi in range(0,numBiFactor): # buffer indices - for iui in range(0, kernel["InnerUnroll"]): - moduleVgprMacroValuA.add(RegSet("v", "vgprValuA_X%u_I%u"%(bi,iui), "vgprValuA_X0_I0_BASE", ri)) - ri += self.states.a.numVgprValuPerBlock - if tPA["bpe"] < 4 and not kernel["UnrollMajorLDSA"] and not (kernel["UsePLRPack"] and self.states.numItersPLR): - ri = 0 - ri = 0 - if tPA["bpe"] < 4 and not kernel["UnrollMajorLDSA"] and not kernel["enableLDSTrA"]: - moduleVgprMacro.add(RegSet("v", "vgprValuA_X0_I0_D0_PACK", "vgprBase", self.states.a.startVgprValuPack - self.states.startVgpr)) - for bi in range(0,numBiFactor): # buffer indices - for iui in range(0, kernel["InnerUnroll"]): - for data in range(0,kernel["MIInputPerThreadA"]): - moduleVgprMacroValuAPack.add(RegSet("v", "vgprValuA_X%u_I%u_D%u"%(bi,iui,data),"vgprValuA_X0_I0_D0_PACK", ri)) - ri += ceil(kernel["VectorWidthA"] * tPA["bpe"] / self.states.bpr) * kernel["MIWaveTileA"] // kernel["VectorWidthA"] - if (kernel["UsePLRPack"] and self.states.numItersPLR): - ri = 0 - else: - moduleVgprMacro.add(RegSet("v", "vgprValuA_X0_I0_BASE", "vgprBase", self.states.a.startVgprValu - self.states.startVgpr)) - for bi in range(0,numBiFactor): # buffer indices - for iui in range(0, kernel["InnerUnroll"]): - moduleVgprMacroValuA.add(RegSet("v", "vgprValuA_X%u_I%u"%(bi,iui), "vgprValuA_X0_I0_BASE", ri)) - ri += self.states.a.numVgprValuPerBlock - ri = 0 - if tPA["bpe"] < 4 and not kernel["UnrollMajorLDSA"] and not kernel["enableLDSTrA"]: - moduleVgprMacro.add(RegSet("v", "vgprValuA_X0_I0_D0_PACK", "vgprBase", self.states.a.startVgprValuPack - self.states.startVgpr)) - for data in range(1,int(self.states.bpr/tPA["bpeDS"])): - for bi in range(0,numBiFactor): # buffer indices - if bi % 
self.states.numVgprBufferPackA == 0: - ri = (data-1) * kernel["InnerUnroll"] * self.states.numVgprBufferPackA * self.states.a.numVgprValuPerBlock - for iui in range(0, kernel["InnerUnroll"]): - moduleVgprMacroValuAPack.add(RegSet("v", "vgprValuA_X%u_I%u_D%u"%(bi,iui,data),"vgprValuA_X0_I0_D0_PACK", ri)) - ri += self.states.a.numVgprValuPerBlock - - ri = 0 - if self.states.b.numVgprValu > 0: # Do not generate vgprValuB if numVgprValuB is 0 - numBiFactor = numBi - if kernel["DirectToVgprB"] and (self.states.packDTVB or self.states.convDTVB): - # DirectToVgpr case, we need LoopIters * 2 buffers - numBiFactor = kernel["LoopIters"] * 2 - if self.states.lrvwTileB > 1: - moduleVgprMacro.add(RegSet("v", "vgprValuB_X0_I0_BASE", "vgprBase", self.states.b.startVgprValu - self.states.startVgpr)) - for bi in range(0,numBiFactor): # buffer indices - for iui in range(0, kernel["InnerUnroll"]): - moduleVgprMacroValuB.add(RegSet("v", "vgprValuB_X%u_I%u"%(bi,iui), "vgprValuB_X0_I0_BASE", ri)) - ri += self.states.b.numVgprValuPerBlock - if (tPB["bpe"] < 4 and not kernel["UnrollMajorLDSB"]) and not (kernel["UsePLRPack"] and self.states.numItersPLR): - ri = 0 - ri = 0 - if tPB["bpe"] < 4 and not kernel["UnrollMajorLDSB"] and not kernel["enableLDSTrB"]: - moduleVgprMacro.add(RegSet("v", "vgprValuB_X0_I0_D0_PACK", "vgprBase", self.states.b.startVgprValuPack - self.states.startVgpr)) - for bi in range(0,numBiFactor): # buffer indices - for iui in range(0, kernel["InnerUnroll"]): - for data in range(0,kernel["MIInputPerThreadB"]): - moduleVgprMacroValuBPack.add(RegSet("v", "vgprValuB_X%u_I%u_D%u"%(bi,iui,data), "vgprValuB_X0_I0_D0_PACK", ri)) - ri += ceil(kernel["VectorWidthB"] * tPB["bpe"] / self.states.bpr) * kernel["MIWaveTileB"] // kernel["VectorWidthB"] - if (kernel["UsePLRPack"] and self.states.numItersPLR): - ri = 0 - else: - moduleVgprMacro.add(RegSet("v", "vgprValuB_X0_I0_BASE", "vgprBase", self.states.b.startVgprValu - self.states.startVgpr)) - for bi in range(0,numBiFactor): # 
buffer indices - for iui in range(0, kernel["InnerUnroll"]): - moduleVgprMacroValuB.add(RegSet("v", "vgprValuB_X%u_I%u"%(bi,iui), "vgprValuB_X0_I0_BASE", ri)) - ri += self.states.b.numVgprValuPerBlock - ri = 0 - if tPB["bpe"] < 4 and not kernel["UnrollMajorLDSB"] and not kernel["enableLDSTrB"]: - moduleVgprMacro.add(RegSet("v", "vgprValuB_X0_I0_D0_PACK", "vgprBase", self.states.b.startVgprValuPack - self.states.startVgpr)) - for data in range(1,int(self.states.bpr/tPB["bpeDS"])): - for bi in range(0,numBiFactor): # buffer indices - if bi % self.states.numVgprBufferPackB == 0: - ri = (data-1) * kernel["InnerUnroll"] * self.states.numVgprBufferPackB * self.states.b.numVgprValuPerBlock - for iui in range(0, kernel["InnerUnroll"]): - moduleVgprMacroValuBPack.add(RegSet("v", "vgprValuB_X%u_I%u_D%u"%(bi,iui,data), "vgprValuB_X0_I0_D0_PACK", ri)) - ri += self.states.b.numVgprValuPerBlock + if kernel["ConvertAfterDS"]: + cvtTemp = max(self.states.a.startVgprValuCvtTemp, self.states.b.startVgprValuCvtTemp) + if (cvtTemp != -1): + moduleVgprMacro.add(RegSet("v", "vgprCvtTemp", "vgprBase", cvtTemp - self.states.startVgpr)) + if kernel["ProblemType"]["Gradient"] and kernel["ProblemType"]["UseBias"] and (kernel["ProblemType"]["BiasSrc"] == "A" or kernel["ProblemType"]["BiasSrc"] == "B"): + moduleVgprMacro.add(RegSet("v", "vgprValuSum", "vgprBase", self.states.bias.startVgprValu - self.states.startVgpr)) - # T reg definition for F32XEmu - self.macroAndSetF32XEmuTreg(kernel, tPA, tPB) - - if kernel["ConvertAfterDS"]: - cvtTemp = max(self.states.a.startVgprValuCvtTemp, self.states.b.startVgprValuCvtTemp) - if (cvtTemp != -1): - moduleVgprMacro.add(RegSet("v", "vgprCvtTemp", "vgprBase", cvtTemp - self.states.startVgpr)) - - if kernel["ProblemType"]["Gradient"] and kernel["ProblemType"]["UseBias"] and (kernel["ProblemType"]["BiasSrc"] == "A" or kernel["ProblemType"]["BiasSrc"] == "B"): - moduleVgprMacro.add(RegSet("v", "vgprValuSum", "vgprBase", self.states.bias.startVgprValu - 
self.states.startVgpr)) - - if kernel["ProblemType"]["Sparse"]: - if kernel["DirectToVgprSparseMetadata"]: - moduleVgprMacro.add(RegSet("v", "vgprValuMetadata_X0_I0_BASE", "vgprBase", self.states.m.startVgprValu - self.states.startVgpr)) - moduleVgprMacroValuM.add(RegSet("v", "vgprValuMetadata", "vgprValuMetadata_X0_I0_BASE", 0)) - else: - ri = 0 - if self.states.m.numVgprValu > 0: # Do not generate vgprValu if numVgprValu is 0 - if self.states.lrvwTileMetadata > 1: - moduleVgprMacro.add(RegSet("v", "vgprValuMetadata_X0_I0_BASE", "vgprBase", self.states.m.startVgprValu - self.states.startVgpr)) - for bi in range(0,PLR): # buffer indices - for iui in range(0, kernel["InnerUnroll"]): - moduleVgprMacroValuM.add(RegSet("v", "vgprValuMetadata_X%u_I%u"%(bi,iui), "vgprValuMetadata_X0_I0_BASE", ri)) - ri += self.states.m.numVgprValuPerBlock - if not kernel["UnrollMajorLDSMetadata"]: - ri = 0 - ri = 0 - if not kernel["UnrollMajorLDSMetadata"] and not kernel["enableLDSTrMetadata"]: - miWaveTile = kernel["MIWaveTileB"] if kernel["ProblemType"]["Sparse"] == 2 else kernel["MIWaveTileA"] - moduleVgprMacro.add(RegSet("v", "vgprValuMetadata_X0_I0_D0_PACK", "vgprBase", self.states.m.startVgprValuPack - self.states.startVgpr)) - for data in range(0,kernel["MIInputPerThreadMetadata"]): - for bi in range(0,PLR): # buffer indices - for iui in range(0, kernel["InnerUnroll"]): - moduleVgprMacroValuMPack.add(RegSet("v", "vgprValuMetadata_X%u_I%u_D%u"%(bi,iui,data), "vgprValuMetadata_X0_I0_D0_PACK", ri)) - ri += ceil(kernel["VectorWidthMetadata"] * tPM["bpe"] / self.states.bpr) * miWaveTile // kernel["VectorWidthMetadata"] - else: - moduleVgprMacro.add(RegSet("v", "vgprValuMetadata_X0_I0_BASE", "vgprBase", self.states.m.startVgprValu - self.states.startVgpr)) - for bi in range(0,PLR): # buffer indices - for iui in range(0, kernel["InnerUnroll"]): - moduleVgprMacroValuM.add(RegSet("v", "vgprValuMetadata_X%u_I%u"%(bi,iui), "vgprValuMetadata_X0_I0_BASE", ri)) - ri += 
self.states.m.numVgprValuPerBlock - if not kernel["UnrollMajorLDSMetadata"] and not kernel["enableLDSTrMetadata"]: - moduleVgprMacro.add(RegSet("v", "vgprValuMetadata_X0_I0_D0_PACK", "vgprBase", self.states.m.startVgprValuPack - self.states.startVgpr)) - for data in range(1,kernel["MIInputPerThreadMetadata"]): - for bi in range(0,PLR): # buffer indices - if bi % self.states.numVgprBufferPackMetadata == 0: - ri = (data-1) * kernel["InnerUnroll"] * self.states.numVgprBufferPackMetadata * kernel["MIWaveTileMetadata"] - for iui in range(0, kernel["InnerUnroll"]): - moduleVgprMacroValuMPack.add(RegSet("v", "vgprValuMetadata_X%u_I%u_D%u"%(bi,iui,data),"vgprValuMetadata_X0_I0_D0_PACK", ri)) - ri += kernel["MIWaveTileMetadata"] - - if not kernel["LocalWriteUseSgprA"] and self.states.a.numVgprLocalWriteAddr > 0: - module.add(RegSet("v", "vgprLocalWriteAddrA", \ - self.states.a.startVgprLocalWriteAddr)) - if self.states.a.numVgprLocalWriteAddr > 1: - module.add(RegSet("v", "vgprLocalWriteAddrOverhangA", \ - self.states.a.startVgprLocalWriteAddr+1)) - if kernel["ProblemType"]["MXBlockA"]: - if not kernel["LocalWriteUseSgprMXSA"] and self.states.mxsa.numVgprLocalWriteAddr > 0: - module.add(RegSet("v", "vgprLocalWriteAddrMXSA", \ - self.states.mxsa.startVgprLocalWriteAddr)) - if self.states.mxsa.numVgprLocalWriteAddr > 1: - module.add(RegSet("v", "vgprLocalWriteAddrOverhangMXSA", \ - self.states.mxsa.startVgprLocalWriteAddr+1)) - if not kernel["LocalWriteUseSgprB"] and self.states.b.numVgprLocalWriteAddr > 0: - module.add(RegSet("v", "vgprLocalWriteAddrB", \ - self.states.b.startVgprLocalWriteAddr)) - if self.states.b.numVgprLocalWriteAddr > 1: - module.add(RegSet("v", "vgprLocalWriteAddrOverhangB", \ - self.states.b.startVgprLocalWriteAddr+1)) - if kernel["ProblemType"]["MXBlockB"]: - if not kernel["LocalWriteUseSgprMXSB"] and self.states.mxsb.numVgprLocalWriteAddr > 0: - module.add(RegSet("v", "vgprLocalWriteAddrMXSB", \ - self.states.mxsb.startVgprLocalWriteAddr)) - if 
self.states.mxsb.numVgprLocalWriteAddr > 1: - module.add(RegSet("v", "vgprLocalWriteAddrOverhangMXSB", \ - self.states.mxsb.startVgprLocalWriteAddr+1)) - if self.states.m.numVgprLocalWriteAddr > 0: - module.add(RegSet("v", "vgprLocalWriteAddrMetadata", \ - self.states.m.startVgprLocalWriteAddr)) - if self.states.m.numVgprLocalWriteAddr > 1: - module.add(RegSet("v", "vgprLocalWriteAddrOverhangMetadata", \ - self.states.m.startVgprLocalWriteAddr+1)) - if kernel["BufferLoad"]: - module.add(RegSet("v", "vgprGlobalReadOffsetA", \ - self.startVgprGlobalReadOffsetA)) - if kernel["ProblemType"]["MXBlockA"]: - module.add(RegSet("v", "vgprGlobalReadOffsetMXSA", \ - self.startVgprGlobalReadOffsetMXSA)) - module.add(RegSet("v", "vgprGlobalReadOffsetB", \ - self.startVgprGlobalReadOffsetB)) - if kernel["ProblemType"]["MXBlockB"]: - module.add(RegSet("v", "vgprGlobalReadOffsetMXSB", \ - self.startVgprGlobalReadOffsetMXSB)) if kernel["ProblemType"]["Sparse"]: - module.add(RegSet("v", "vgprGlobalReadOffsetMetadata", \ - self.startVgprGlobalReadOffsetMetadata)) - else: - module.add(RegSet("v", "vgprGlobalReadAddrA", \ - self.startVgprGlobalReadAddressesA)) + if kernel["DirectToVgprSparseMetadata"]: + moduleVgprMacro.add(RegSet("v", "vgprValuMetadata_X0_I0_BASE", "vgprBase", self.states.m.startVgprValu - self.states.startVgpr)) + moduleVgprMacroValuM.add(RegSet("v", "vgprValuMetadata", "vgprValuMetadata_X0_I0_BASE", 0)) + else: + ri = 0 + if self.states.m.numVgprValu > 0: # Do not generate vgprValu if numVgprValu is 0 + if self.states.lrvwTileMetadata > 1: + moduleVgprMacro.add(RegSet("v", "vgprValuMetadata_X0_I0_BASE", "vgprBase", self.states.m.startVgprValu - self.states.startVgpr)) + for bi in range(0,PLR): # buffer indices + for iui in range(0, kernel["InnerUnroll"]): + moduleVgprMacroValuM.add(RegSet("v", "vgprValuMetadata_X%u_I%u"%(bi,iui), "vgprValuMetadata_X0_I0_BASE", ri)) + ri += self.states.m.numVgprValuPerBlock + if not kernel["UnrollMajorLDSMetadata"]: + ri = 0 + ri = 0 
+ if not kernel["UnrollMajorLDSMetadata"] and not kernel["enableLDSTrMetadata"]: + miWaveTile = kernel["MIWaveTileB"] if kernel["ProblemType"]["Sparse"] == 2 else kernel["MIWaveTileA"] + moduleVgprMacro.add(RegSet("v", "vgprValuMetadata_X0_I0_D0_PACK", "vgprBase", self.states.m.startVgprValuPack - self.states.startVgpr)) + for data in range(0,kernel["MIInputPerThreadMetadata"]): + for bi in range(0,PLR): # buffer indices + for iui in range(0, kernel["InnerUnroll"]): + moduleVgprMacroValuMPack.add(RegSet("v", "vgprValuMetadata_X%u_I%u_D%u"%(bi,iui,data), "vgprValuMetadata_X0_I0_D0_PACK", ri)) + ri += ceil(kernel["VectorWidthMetadata"] * tPM["bpe"] / self.states.bpr) * miWaveTile // kernel["VectorWidthMetadata"] + else: + moduleVgprMacro.add(RegSet("v", "vgprValuMetadata_X0_I0_BASE", "vgprBase", self.states.m.startVgprValu - self.states.startVgpr)) + for bi in range(0,PLR): # buffer indices + for iui in range(0, kernel["InnerUnroll"]): + moduleVgprMacroValuM.add(RegSet("v", "vgprValuMetadata_X%u_I%u"%(bi,iui), "vgprValuMetadata_X0_I0_BASE", ri)) + ri += self.states.m.numVgprValuPerBlock + if not kernel["UnrollMajorLDSMetadata"] and not kernel["enableLDSTrMetadata"]: + moduleVgprMacro.add(RegSet("v", "vgprValuMetadata_X0_I0_D0_PACK", "vgprBase", self.states.m.startVgprValuPack - self.states.startVgpr)) + for data in range(1,kernel["MIInputPerThreadMetadata"]): + for bi in range(0,PLR): # buffer indices + if bi % self.states.numVgprBufferPackMetadata == 0: + ri = (data-1) * kernel["InnerUnroll"] * self.states.numVgprBufferPackMetadata * kernel["MIWaveTileMetadata"] + for iui in range(0, kernel["InnerUnroll"]): + moduleVgprMacroValuMPack.add(RegSet("v", "vgprValuMetadata_X%u_I%u_D%u"%(bi,iui,data),"vgprValuMetadata_X0_I0_D0_PACK", ri)) + ri += kernel["MIWaveTileMetadata"] + + if not kernel["LocalWriteUseSgprA"] and self.states.a.numVgprLocalWriteAddr > 0: + module.add(RegSet("v", "vgprLocalWriteAddrA", \ + self.states.a.startVgprLocalWriteAddr)) + if 
self.states.a.numVgprLocalWriteAddr > 1: + module.add(RegSet("v", "vgprLocalWriteAddrOverhangA", \ + self.states.a.startVgprLocalWriteAddr+1)) if kernel["ProblemType"]["MXBlockA"]: - module.add(RegSet("v", "vgprGlobalReadAddrMXSA", \ - self.startVgprGlobalReadAddressesMXSA)) - module.add(RegSet("v", "vgprGlobalReadAddrB", \ - self.startVgprGlobalReadAddressesB)) + if not kernel["LocalWriteUseSgprMXSA"] and self.states.mxsa.numVgprLocalWriteAddr > 0: + module.add(RegSet("v", "vgprLocalWriteAddrMXSA", \ + self.states.mxsa.startVgprLocalWriteAddr)) + if self.states.mxsa.numVgprLocalWriteAddr > 1: + module.add(RegSet("v", "vgprLocalWriteAddrOverhangMXSA", \ + self.states.mxsa.startVgprLocalWriteAddr+1)) + if not kernel["LocalWriteUseSgprB"] and self.states.b.numVgprLocalWriteAddr > 0: + module.add(RegSet("v", "vgprLocalWriteAddrB", \ + self.states.b.startVgprLocalWriteAddr)) + if self.states.b.numVgprLocalWriteAddr > 1: + module.add(RegSet("v", "vgprLocalWriteAddrOverhangB", \ + self.states.b.startVgprLocalWriteAddr+1)) if kernel["ProblemType"]["MXBlockB"]: - module.add(RegSet("v", "vgprGlobalReadAddrMXSB", \ - self.startVgprGlobalReadAddressesMXSB)) - - if self.states.a.startVgprG2L is not None: - moduleVgprMacro.add(RegSet("v", "vgprG2LA_BASE", "vgprBase", self.states.a.startVgprG2L - self.states.startVgpr)) - if self.states.b.startVgprG2L is not None: - moduleVgprMacro.add(RegSet("v", "vgprG2LB_BASE", "vgprBase", self.states.b.startVgprG2L - self.states.startVgpr)) - - if not kernel["DirectToLdsA"] or self.do["KeepDirectToLdsAlloc"]: - moduleVgprMacroG2LA.add(RegSet("v", "vgprG2LA", "vgprG2LA_BASE", 0)) - if kernel["DirectToVgprA"]: - # additional definition G2LA2 for swapping register sets - moduleVgprMacroG2LA.add(RegSet("v", "vgprG2LA2", "vgprG2LA_BASE", self.states.a.numVgprG2LAllocated//2)) - - if not kernel["DirectToLdsB"] or self.do["KeepDirectToLdsAlloc"]: - moduleVgprMacroG2LB.add(RegSet("v", "vgprG2LB", "vgprG2LB_BASE", 0)) - if kernel["DirectToVgprB"]: - 
# additional definition G2LB2 for swapping register sets - moduleVgprMacroG2LB.add(RegSet("v", "vgprG2LB2", "vgprG2LB_BASE", self.states.b.numVgprG2LAllocated//2)) - - if kernel["UnrollLoopSwapGlobalReadOrder"] and not kernel["DirectToLdsA"] and not kernel["DirectToLdsB"]: - if kernel["ULSGRODoubleG2L"] == 0: - moduleVgprMacroG2LA.add(RegSet("v", "vgprG2LB2", "vgprG2LA_BASE", 0)) - moduleVgprMacroG2LA.add(RegSet("v", "vgprG2LA2", "vgprG2LA_BASE", self.states.b.numVgprG2LAllocated)) + if not kernel["LocalWriteUseSgprMXSB"] and self.states.mxsb.numVgprLocalWriteAddr > 0: + module.add(RegSet("v", "vgprLocalWriteAddrMXSB", \ + self.states.mxsb.startVgprLocalWriteAddr)) + if self.states.mxsb.numVgprLocalWriteAddr > 1: + module.add(RegSet("v", "vgprLocalWriteAddrOverhangMXSB", \ + self.states.mxsb.startVgprLocalWriteAddr+1)) + if self.states.m.numVgprLocalWriteAddr > 0: + module.add(RegSet("v", "vgprLocalWriteAddrMetadata", \ + self.states.m.startVgprLocalWriteAddr)) + if self.states.m.numVgprLocalWriteAddr > 1: + module.add(RegSet("v", "vgprLocalWriteAddrOverhangMetadata", \ + self.states.m.startVgprLocalWriteAddr+1)) + if kernel["BufferLoad"]: + module.add(RegSet("v", "vgprGlobalReadOffsetA", \ + self.startVgprGlobalReadOffsetA)) + if kernel["ProblemType"]["MXBlockA"]: + module.add(RegSet("v", "vgprGlobalReadOffsetMXSA", \ + self.startVgprGlobalReadOffsetMXSA)) + module.add(RegSet("v", "vgprGlobalReadOffsetB", \ + self.startVgprGlobalReadOffsetB)) + if kernel["ProblemType"]["MXBlockB"]: + module.add(RegSet("v", "vgprGlobalReadOffsetMXSB", \ + self.startVgprGlobalReadOffsetMXSB)) + if kernel["ProblemType"]["Sparse"]: + module.add(RegSet("v", "vgprGlobalReadOffsetMetadata", \ + self.startVgprGlobalReadOffsetMetadata)) else: - moduleVgprMacroG2LA.add(RegSet("v", "vgprG2LA2", "vgprG2LA_BASE", self.states.a.numVgprG2LAllocated)) - moduleVgprMacroG2LB.add(RegSet("v", "vgprG2LB2", "vgprG2LB_BASE", self.states.b.numVgprG2LAllocated)) + module.add(RegSet("v", 
"vgprGlobalReadAddrA", \ + self.startVgprGlobalReadAddressesA)) + if kernel["ProblemType"]["MXBlockA"]: + module.add(RegSet("v", "vgprGlobalReadAddrMXSA", \ + self.startVgprGlobalReadAddressesMXSA)) + module.add(RegSet("v", "vgprGlobalReadAddrB", \ + self.startVgprGlobalReadAddressesB)) + if kernel["ProblemType"]["MXBlockB"]: + module.add(RegSet("v", "vgprGlobalReadAddrMXSB", \ + self.startVgprGlobalReadAddressesMXSB)) - if kernel["ProblemType"]["Sparse"] and not kernel["DirectToVgprSparseMetadata"]: - moduleVgprMacro.add(RegSet("v", "vgprG2LMetadata", "vgprBase", self.states.m.startVgprG2L - self.states.startVgpr)) + if self.states.a.startVgprG2L is not None: + moduleVgprMacro.add(RegSet("v", "vgprG2LA_BASE", "vgprBase", self.states.a.startVgprG2L - self.states.startVgpr)) + if self.states.b.startVgprG2L is not None: + moduleVgprMacro.add(RegSet("v", "vgprG2LB_BASE", "vgprBase", self.states.b.startVgprG2L - self.states.startVgpr)) - if ((tPA["bpe"] < 4 and not kernel["UnrollMajorLDSA"]) or \ - (tPB["bpe"] < 4 and not kernel["UnrollMajorLDSB"]) or \ - (kernel["ProblemType"]["Sparse"] and not kernel["UnrollMajorLDSMetadata"] and (kernel["MIInputPerThreadMetadata"] == 4))) \ - and (kernel["ProblemType"]["DataType"].isInt8() or kernel["ProblemType"]["DataType"].is8bitFloat()) or \ - (self.states.asmCaps["HasSWMMAC_gfx1250"] and kernel["ProblemType"]["Sparse"] and not kernel["UnrollMajorLDSMetadata"]): - moduleVgprMacro.add(RegSet("v", "vgprPackTemp", "vgprBase", self.states.a.startVgprValuPackTemp - self.states.startVgpr)) + if not kernel["DirectToLdsA"] or self.do["KeepDirectToLdsAlloc"]: + moduleVgprMacroG2LA.add(RegSet("v", "vgprG2LA", "vgprG2LA_BASE", 0)) + if kernel["DirectToVgprA"]: + # additional definition G2LA2 for swapping register sets + moduleVgprMacroG2LA.add(RegSet("v", "vgprG2LA2", "vgprG2LA_BASE", self.states.a.numVgprG2LAllocated//2)) + + if not kernel["DirectToLdsB"] or self.do["KeepDirectToLdsAlloc"]: + moduleVgprMacroG2LB.add(RegSet("v", 
"vgprG2LB", "vgprG2LB_BASE", 0)) + if kernel["DirectToVgprB"]: + # additional definition G2LB2 for swapping register sets + moduleVgprMacroG2LB.add(RegSet("v", "vgprG2LB2", "vgprG2LB_BASE", self.states.b.numVgprG2LAllocated//2)) + + if kernel["UnrollLoopSwapGlobalReadOrder"] and not kernel["DirectToLdsA"] and not kernel["DirectToLdsB"]: + if kernel["ULSGRODoubleG2L"] == 0: + moduleVgprMacroG2LA.add(RegSet("v", "vgprG2LB2", "vgprG2LA_BASE", 0)) + moduleVgprMacroG2LA.add(RegSet("v", "vgprG2LA2", "vgprG2LA_BASE", self.states.b.numVgprG2LAllocated)) + else: + moduleVgprMacroG2LA.add(RegSet("v", "vgprG2LA2", "vgprG2LA_BASE", self.states.a.numVgprG2LAllocated)) + moduleVgprMacroG2LB.add(RegSet("v", "vgprG2LB2", "vgprG2LB_BASE", self.states.b.numVgprG2LAllocated)) - if self.states.globalReadIncsUseVgpr: - module.add(RegSet("v", "vgprGlobalReadIncsA", \ - self.startVgprGlobalReadIncsA)) - if kernel["ProblemType"]["MXBlockA"]: - module.add(RegSet("v", "vgprGlobalReadIncsMXSA", \ - self.startVgprGlobalReadIncsMXSA)) - module.add(RegSet("v", "vgprGlobalReadIncsB", \ - self.startVgprGlobalReadIncsB)) - if kernel["ProblemType"]["MXBlockB"]: - module.add(RegSet("v", "vgprGlobalReadIncsMXSB", \ - self.startVgprGlobalReadIncsMXSB)) if kernel["ProblemType"]["Sparse"] and not kernel["DirectToVgprSparseMetadata"]: - module.add(RegSet("v", "vgprGlobalReadIncsMetadata", \ - self.startVgprGlobalReadIncsMetadata)) + moduleVgprMacro.add(RegSet("v", "vgprG2LMetadata", "vgprBase", self.states.m.startVgprG2L - self.states.startVgpr)) + + if ((tPA["bpe"] < 4 and not kernel["UnrollMajorLDSA"]) or \ + (tPB["bpe"] < 4 and not kernel["UnrollMajorLDSB"]) or \ + (kernel["ProblemType"]["Sparse"] and not kernel["UnrollMajorLDSMetadata"] and (kernel["MIInputPerThreadMetadata"] == 4))) \ + and (kernel["ProblemType"]["DataType"].isInt8() or kernel["ProblemType"]["DataType"].is8bitFloat()) or \ + (self.states.asmCaps["HasSWMMAC_gfx1250"] and kernel["ProblemType"]["Sparse"] and not 
kernel["UnrollMajorLDSMetadata"]): + moduleVgprMacro.add(RegSet("v", "vgprPackTemp", "vgprBase", self.states.a.startVgprValuPackTemp - self.states.startVgpr)) + + if self.states.globalReadIncsUseVgpr: + module.add(RegSet("v", "vgprGlobalReadIncsA", \ + self.startVgprGlobalReadIncsA)) + if kernel["ProblemType"]["MXBlockA"]: + module.add(RegSet("v", "vgprGlobalReadIncsMXSA", \ + self.startVgprGlobalReadIncsMXSA)) + module.add(RegSet("v", "vgprGlobalReadIncsB", \ + self.startVgprGlobalReadIncsB)) + if kernel["ProblemType"]["MXBlockB"]: + module.add(RegSet("v", "vgprGlobalReadIncsMXSB", \ + self.startVgprGlobalReadIncsMXSB)) + if kernel["ProblemType"]["Sparse"] and not kernel["DirectToVgprSparseMetadata"]: + module.add(RegSet("v", "vgprGlobalReadIncsMetadata", \ + self.startVgprGlobalReadIncsMetadata)) - if self.states.a.numVgprLocalReadAddr > 0: - module.add(RegSet("v", "vgprLocalReadAddrA", \ - self.states.a.startVgprLocalReadAddr)) - if kernel["ProblemType"]["MXBlockA"]: - if self.states.mxsa.numVgprLocalReadAddr > 0: - module.add(RegSet("v", "vgprLocalReadAddrMXSA", \ - self.states.mxsa.startVgprLocalReadAddr)) - if self.states.b.numVgprLocalReadAddr > 0: - module.add(RegSet("v", "vgprLocalReadAddrB", \ - self.states.b.startVgprLocalReadAddr)) - if kernel["ProblemType"]["MXBlockB"]: - if self.states.mxsb.numVgprLocalReadAddr > 0: - module.add(RegSet("v", "vgprLocalReadAddrMXSB", \ - self.states.mxsb.startVgprLocalReadAddr)) - if self.states.IncLdsBufSwitch: - # 3 or more LDS buffer case. 
Need to keep original vgprLocalReadAdd if self.states.a.numVgprLocalReadAddr > 0: - module.add(RegSet("v", "vgprLocalReadAddrOrigA", \ - self.states.a.startVgprLocalReadAddrOrig)) + module.add(RegSet("v", "vgprLocalReadAddrA", \ + self.states.a.startVgprLocalReadAddr)) if kernel["ProblemType"]["MXBlockA"]: if self.states.mxsa.numVgprLocalReadAddr > 0: - module.add(RegSet("v", "vgprLocalReadAddrOrigMXSA", \ - self.states.mxsa.startVgprLocalReadAddrOrig)) + module.add(RegSet("v", "vgprLocalReadAddrMXSA", \ + self.states.mxsa.startVgprLocalReadAddr)) if self.states.b.numVgprLocalReadAddr > 0: - module.add(RegSet("v", "vgprLocalReadAddrOrigB", \ - self.states.b.startVgprLocalReadAddrOrig)) + module.add(RegSet("v", "vgprLocalReadAddrB", \ + self.states.b.startVgprLocalReadAddr)) if kernel["ProblemType"]["MXBlockB"]: if self.states.mxsb.numVgprLocalReadAddr > 0: - module.add(RegSet("v", "vgprLocalReadAddrOrigMXSB", \ - self.states.mxsb.startVgprLocalReadAddrOrig)) - if self.states.m.numVgprLocalReadAddr > 0: - module.add(RegSet("v", "vgprLocalReadAddrMetadata", \ - self.states.m.startVgprLocalReadAddr)) - if self.states.a.numVgprLocalReadSwapAddr > 0: - module.add(RegSet("v", "vgprLocalReadSwapAddrA", \ - self.states.a.startVgprLocalReadSwapAddr)) - if self.states.b.numVgprLocalReadSwapAddr > 0: - module.add(RegSet("v", "vgprLocalReadSwapAddrB", \ - self.states.b.startVgprLocalReadSwapAddr)) - if self.states.mxsa.numVgprLocalReadSwapAddr > 0: - module.add(RegSet("v", "vgprLocalReadSwapAddrMXSA", \ - self.states.mxsa.startVgprLocalReadSwapAddr)) - if self.states.mxsb.numVgprLocalReadSwapAddr > 0: - module.add(RegSet("v", "vgprLocalReadSwapAddrMXSB", \ - self.states.mxsb.startVgprLocalReadSwapAddr)) - if self.states.m.numVgprLocalReadSwapAddr > 0: - module.add(RegSet("v", "vgprLocalReadSwapAddrMetadata", \ - self.states.m.startVgprLocalReadSwapAddr)) - if kernel["ProblemType"]["MXBlockA"]: + module.add(RegSet("v", "vgprLocalReadAddrMXSB", \ + 
self.states.mxsb.startVgprLocalReadAddr)) + if self.states.IncLdsBufSwitch: + # 3 or more LDS buffer case. Need to keep original vgprLocalReadAdd + if self.states.a.numVgprLocalReadAddr > 0: + module.add(RegSet("v", "vgprLocalReadAddrOrigA", \ + self.states.a.startVgprLocalReadAddrOrig)) + if kernel["ProblemType"]["MXBlockA"]: + if self.states.mxsa.numVgprLocalReadAddr > 0: + module.add(RegSet("v", "vgprLocalReadAddrOrigMXSA", \ + self.states.mxsa.startVgprLocalReadAddrOrig)) + if self.states.b.numVgprLocalReadAddr > 0: + module.add(RegSet("v", "vgprLocalReadAddrOrigB", \ + self.states.b.startVgprLocalReadAddrOrig)) + if kernel["ProblemType"]["MXBlockB"]: + if self.states.mxsb.numVgprLocalReadAddr > 0: + module.add(RegSet("v", "vgprLocalReadAddrOrigMXSB", \ + self.states.mxsb.startVgprLocalReadAddrOrig)) + if self.states.m.numVgprLocalReadAddr > 0: + module.add(RegSet("v", "vgprLocalReadAddrMetadata", \ + self.states.m.startVgprLocalReadAddr)) + if self.states.a.numVgprLocalReadSwapAddr > 0: + module.add(RegSet("v", "vgprLocalReadSwapAddrA", \ + self.states.a.startVgprLocalReadSwapAddr)) + if self.states.b.numVgprLocalReadSwapAddr > 0: + module.add(RegSet("v", "vgprLocalReadSwapAddrB", \ + self.states.b.startVgprLocalReadSwapAddr)) if self.states.mxsa.numVgprLocalReadSwapAddr > 0: module.add(RegSet("v", "vgprLocalReadSwapAddrMXSA", \ self.states.mxsa.startVgprLocalReadSwapAddr)) - if kernel["ProblemType"]["MXBlockB"]: if self.states.mxsb.numVgprLocalReadSwapAddr > 0: module.add(RegSet("v", "vgprLocalReadSwapAddrMXSB", \ self.states.mxsb.startVgprLocalReadSwapAddr)) - if self.states.a.numVgprLocalWriteSwapAddr > 0: - module.add(RegSet("v", "vgprLocalWriteSwapAddrA", \ - self.states.a.startVgprLocalWriteSwapAddr)) - if self.states.b.numVgprLocalWriteSwapAddr > 0: - module.add(RegSet("v", "vgprLocalWriteSwapAddrB", \ - self.states.b.startVgprLocalWriteSwapAddr)) - if self.states.m.numVgprLocalWriteSwapAddr > 0: - module.add(RegSet("v", 
"vgprLocalWriteSwapAddrMetadata", \ - self.states.m.startVgprLocalWriteSwapAddr)) - if kernel["ProblemType"]["MXBlockA"]: - if self.states.mxsa.numVgprLocalWriteSwapAddr > 0: - module.add(RegSet("v", "vgprLocalWriteSwapAddrMXSA", \ - self.states.mxsa.startVgprLocalWriteSwapAddr)) - if kernel["ProblemType"]["MXBlockB"]: - if self.states.mxsb.numVgprLocalWriteSwapAddr > 0: - module.add(RegSet("v", "vgprLocalWriteSwapAddrMXSB", \ - self.states.mxsb.startVgprLocalWriteSwapAddr)) - - if kernel["ProblemType"]["OutputAmaxD"]: - module.add(RegSet("v", "vgprAmaxOut", self.startVgprAmaxOut)) - module.add(RegSet("v", "vgprAmaxOutB", self.startVgprAmaxOutB)) - - if kernel["ProblemType"]["DataType"].isDoubleComplex() and kernel["MIArchVgpr"]: - module.add(RegSet("v", "vgprAlphaTmp", \ - self.states.startVgprAlphaTmp)) - - module.add(RegSet("v", "vgprSerial", self.states.startVgprSerial)) + if self.states.m.numVgprLocalReadSwapAddr > 0: + module.add(RegSet("v", "vgprLocalReadSwapAddrMetadata", \ + self.states.m.startVgprLocalReadSwapAddr)) + if kernel["ProblemType"]["MXBlockA"]: + if self.states.mxsa.numVgprLocalReadSwapAddr > 0: + module.add(RegSet("v", "vgprLocalReadSwapAddrMXSA", \ + self.states.mxsa.startVgprLocalReadSwapAddr)) + if kernel["ProblemType"]["MXBlockB"]: + if self.states.mxsb.numVgprLocalReadSwapAddr > 0: + module.add(RegSet("v", "vgprLocalReadSwapAddrMXSB", \ + self.states.mxsb.startVgprLocalReadSwapAddr)) + if self.states.a.numVgprLocalWriteSwapAddr > 0: + module.add(RegSet("v", "vgprLocalWriteSwapAddrA", \ + self.states.a.startVgprLocalWriteSwapAddr)) + if self.states.b.numVgprLocalWriteSwapAddr > 0: + module.add(RegSet("v", "vgprLocalWriteSwapAddrB", \ + self.states.b.startVgprLocalWriteSwapAddr)) + if self.states.m.numVgprLocalWriteSwapAddr > 0: + module.add(RegSet("v", "vgprLocalWriteSwapAddrMetadata", \ + self.states.m.startVgprLocalWriteSwapAddr)) + if kernel["ProblemType"]["MXBlockA"]: + if self.states.mxsa.numVgprLocalWriteSwapAddr > 0: + 
module.add(RegSet("v", "vgprLocalWriteSwapAddrMXSA", \ + self.states.mxsa.startVgprLocalWriteSwapAddr)) + if kernel["ProblemType"]["MXBlockB"]: + if self.states.mxsb.numVgprLocalWriteSwapAddr > 0: + module.add(RegSet("v", "vgprLocalWriteSwapAddrMXSB", \ + self.states.mxsb.startVgprLocalWriteSwapAddr)) - if self.debugConfig.debugKernel: - module.add(RegSet("v", "vgprAddressDbg", \ - self.states.startVgprAddressDbg)) - #module.addComment0("Occu: %u waves/simd" % self.numWavesPerSimd ) - # module.addComment0("Num VGPR=%u"%self.vgprPool.size()) - # module.addComment0("Num AccVGPR=%u"%self.agprPool.size()) - self.moduleVgprMacro = moduleVgprMacro - self.moduleVgprMacroValuA = moduleVgprMacroValuA - self.moduleVgprMacroValuB = moduleVgprMacroValuB - self.moduleVgprMacroValuAPack = moduleVgprMacroValuAPack - self.moduleVgprMacroValuBPack = moduleVgprMacroValuBPack - self.moduleVgprMacroValuM = moduleVgprMacroValuM - self.moduleVgprMacroValuMPack = moduleVgprMacroValuMPack - self.moduleVgprMacroG2LA = moduleVgprMacroG2LA - self.moduleVgprMacroG2LB = moduleVgprMacroG2LB - module.addComment2("VGPR Macro Assignments") - module.add(self.moduleVgprMacro) - module.add(self.moduleVgprMacroValuA) - module.add(self.moduleVgprMacroValuA_T) - module.add(self.moduleVgprMacroValuB) - module.add(self.moduleVgprMacroValuB_T) - module.add(self.moduleVgprMacroValuAPack) - module.add(self.moduleVgprMacroValuBPack) - module.add(self.moduleVgprMacroValuM) - module.add(self.moduleVgprMacroValuMPack) - module.add(self.moduleVgprMacroG2LA) - module.add(self.moduleVgprMacroG2LB) + if kernel["ProblemType"]["OutputAmaxD"]: + module.add(RegSet("v", "vgprAmaxOut", self.startVgprAmaxOut)) + module.add(RegSet("v", "vgprAmaxOutB", self.startVgprAmaxOutB)) + + if kernel["ProblemType"]["DataType"].isDoubleComplex() and kernel["MIArchVgpr"]: + module.add(RegSet("v", "vgprAlphaTmp", \ + self.states.startVgprAlphaTmp)) + + module.add(RegSet("v", "vgprSerial", self.states.startVgprSerial)) + + if 
self.debugConfig.debugKernel: + module.add(RegSet("v", "vgprAddressDbg", \ + self.states.startVgprAddressDbg)) + #module.addComment0("Occu: %u waves/simd" % self.numWavesPerSimd ) + # module.addComment0("Num VGPR=%u"%self.vgprPool.size()) + # module.addComment0("Num AccVGPR=%u"%self.agprPool.size()) + self.moduleVgprMacro = moduleVgprMacro + self.moduleVgprMacroValuA = moduleVgprMacroValuA + self.moduleVgprMacroValuB = moduleVgprMacroValuB + self.moduleVgprMacroValuAPack = moduleVgprMacroValuAPack + self.moduleVgprMacroValuBPack = moduleVgprMacroValuBPack + self.moduleVgprMacroValuM = moduleVgprMacroValuM + self.moduleVgprMacroValuMPack = moduleVgprMacroValuMPack + self.moduleVgprMacroG2LA = moduleVgprMacroG2LA + self.moduleVgprMacroG2LB = moduleVgprMacroG2LB + module.addComment2("VGPR Macro Assignments") + module.add(self.moduleVgprMacro) + module.add(self.moduleVgprMacroValuA) + module.add(self.moduleVgprMacroValuA_T) + module.add(self.moduleVgprMacroValuB) + module.add(self.moduleVgprMacroValuB_T) + module.add(self.moduleVgprMacroValuAPack) + module.add(self.moduleVgprMacroValuBPack) + module.add(self.moduleVgprMacroValuM) + module.add(self.moduleVgprMacroValuMPack) + module.add(self.moduleVgprMacroG2LA) + module.add(self.moduleVgprMacroG2LB) + + def macroAndSetVgprImplSubtile(): + module.add(RegSet("v", "vgprSerial", self.states.startVgprSerial)) + #self.vgprPool.remove(self.states.startVgprSerial, 1) + #module.addComment0("Need %u vgprs for GR A"%(self.states.a.tileInfo.numGRPerSubtile)) + return + if not kernel["UseSubtileImpl"]: + macroAndSetImplClassic() + else: + macroAndSetVgprImplSubtile() + ######################################## # SGPR Macros ######################################## @@ -1509,7 +1536,8 @@ def macroAndSet(self, kernel, tPA, tPB) -> Module: for (tc, indices, justOffset32, tP, isSwizzled) in GOList: # BufferStore does not use this macro so don't generate it: - if tc == "C" and kernel["BufferStore"]: + # Subtile impl does not use these 
macros either + if (tc == "C" and kernel["BufferStore"]) or kernel["UseSubtileImpl"]: continue # function name and comment @@ -1814,6 +1842,8 @@ def checkResources(self, kernel, mkb: KernelBody): msg = "SIA2 better with occupancy 2" elif self.states.overflowedResources == 7: msg = "invalid LSU code due to assertion fail" + elif self.states.overflowedResources == 8: + msg = "not enough LDS space" else: msg = "unknown" @@ -2221,20 +2251,17 @@ def defineAndResources(self, kernel, tPA, tPB, tPM): # B address interleave (restricted) - compute runtime G once and reuse later. if kernel["BAddrInterleave"]: moduleRegInit.addComment1("Interleave: define SGPR and init runtime G once") - sgprG = self.defineSgprIdx("BInterleaveG", 1) + self.removeSgprVarFromPool("BInterleaveG") if "BInterleaveG" not in self.states.nonPostLoopSgpr: self.states.nonPostLoopSgpr.append("BInterleaveG") - moduleRegInit.add(RegSet("s", "sgprBInterleaveG", sgprG)) moduleRegInit.addModuleAsFlatItems(self.initBInterleaveG(kernel)) # K ring-shift (restricted) - compute per-WG shift once and reuse later. if kernel["KRingShift"]: moduleRegInit.addComment1("KRS: KRingShift define SGPR and init per-WG shift once") - sgprShift = self.defineSgprIdx("KRingShift", 1) - # Keep this SGPR live into post-loop (tail + store) - prevent endSummation from undefining it. 
+ self.removeSgprVarFromPool("KRingShift") if "KRingShift" not in self.states.nonPostLoopSgpr: self.states.nonPostLoopSgpr.append("KRingShift") - moduleRegInit.add(RegSet("s", "sgprKRingShift", sgprShift)) self.sgprPool.checkIn(sgprPackedArgs) @@ -2292,28 +2319,29 @@ def defineAndResources(self, kernel, tPA, tPB, tPM): # C regs are not used during initialization so mark them as available - # we will claim then just before the start of the unroll loop: - if self.states.lastValuMXSAB: - self.vgprPool.add(0 , \ - self.states.lastValuMXSAB, "ValuMXSAB") # Add as available + if not kernel["UseSubtileImpl"]: + if self.states.lastValuMXSAB: + self.vgprPool.add(0 , \ + self.states.lastValuMXSAB, "ValuMXSAB") # Add as available + moduleWg.addComment0("init: add vgpr [%u...%u) to pool" % \ + (self.states.mxsa.startVgprValu, self.states.lastValuMXSAB+self.states.mxsa.startVgprValu)) + + self.vgprPool.add(self.states.a.startVgprValu , \ + self.states.lastValuAB - self.states.a.startVgprValu , "ValuAB") # Add as available moduleWg.addComment0("init: add vgpr [%u...%u) to pool" % \ - (self.states.mxsa.startVgprValu, self.states.lastValuMXSAB+self.states.mxsa.startVgprValu)) + (self.states.a.startVgprValu, self.states.lastValuAB+self.states.a.startVgprValu)) - self.vgprPool.add(self.states.a.startVgprValu , \ - self.states.lastValuAB - self.states.a.startVgprValu , "ValuAB") # Add as available - moduleWg.addComment0("init: add vgpr [%u...%u) to pool" % \ - (self.states.a.startVgprValu, self.states.lastValuAB+self.states.a.startVgprValu)) - - self.vgprPool.add(self.states.c.startVgprValu, \ - self.states.c.numVgprValu, "ValuC-Block") # Add as available - moduleWg.addComment0("init: add vgpr [%u...%u) to pool" % \ - (self.states.c.startVgprValu, self.states.c.startVgprValu+self.states.c.numVgprValu)) + self.vgprPool.add(self.states.c.startVgprValu, \ + self.states.c.numVgprValu, "ValuC-Block") # Add as available + moduleWg.addComment0("init: add vgpr [%u...%u) to pool" % \ + 
(self.states.c.startVgprValu, self.states.c.startVgprValu+self.states.c.numVgprValu)) - numAccvgprs = self.states.totalAgprs - self.agprPool.add(0, numAccvgprs, "ValuC-Block") - moduleWg.addComment0("init: add agpr [%u...%u) to pool" % \ - (0, numAccvgprs)) + numAccvgprs = self.states.totalAgprs + self.agprPool.add(0, numAccvgprs, "ValuC-Block") + moduleWg.addComment0("init: add agpr [%u...%u) to pool" % \ + (0, numAccvgprs)) - if kernel["StreamK"] == 0: + if kernel["StreamK"] == 0 and not kernel["UseSubtileImpl"]: moduleWg.add(self.localReadAddresses(kernel, tPA, tPB, tPM)) moduleWg.add(self.localWriteAddresses(kernel, tPA, tPB, tPM)) @@ -2631,13 +2659,13 @@ def calculateWG(): module.add(SMovB32(dst=sgpr("PackKFor%sV3"%tPackM), src="0x0c0c0703", comment="")) # self.states.groOffsetInMacroTile == 1 case, subtract pre-pad here - if self.states.groOffsetInMacroTile: + if self.states.groOffsetInMacroTile and not kernel["UseSubtileImpl"]: # Added logic to check for Pointer Array case (ArgType==3) and not prepad the double pointer addresses Skip_Address_Prepad_For_Pointer_Array = Label(label="Skip_Address_Prepad_For_Pointer_Array", comment="Skip pre-padding of address for pointer array case") if kernel["ProblemType"]["SupportUserArgs"]: module.add(SCmpEQU32(src0=sgpr("ArgType"), src1=3, comment="ArgType == 3 for General Batched GEMM")) - module.add(SCBranchSCC1(labelName=Skip_Address_Prepad_For_Pointer_Array.getLabelName())) - if not kernel["enableTDMA"]: + module.add(SCBranchSCC1(labelName=Skip_Address_Prepad_For_Pointer_Array.getLabelName())) + if not kernel["enableTDMA"]: prePad = int(self.states.srdShiftLeft["A"] * tPA["bpeGR"]) # leave room in case we have to pointer shift module.add(SSubU32(dst=sgpr("AddressA+0"), src0=sgpr("AddressA+0"), src1=prePad, comment="pre-pad to make room for possible pointer shift")) module.add(SSubBU32(dst=sgpr("AddressA+1"), src0=sgpr("AddressA+1"), src1=0, comment="pre-pad to make room for possible pointer shift")) @@ -4140,13 
+4168,39 @@ def computeLoadSrd(self, kernel, tP, tc, indices, bpe): moduleLoadGeneralBatch = Module("computeLoadSrd-GeneralBatch") moduleLoadStridedBatch = Module("computeLoadSrd-StridedBatch") use64bShadowLimit = self.states.use64bShadowLimitMX if tc in ["MXSA", "MXSB"] else self.states.use64bShadowLimit - with self.allocTmpSgpr(2 + 2 + (0 if use64bShadowLimit else 2)) as tmpSgprInfo: + isgfx950 = kernel["ISA"][:2] == (9, 5) + isgfx950mx = isgfx950 and ("MXS" in tc) + # UseSubtileImpl uses a tile-boundary fixed Srd+2 for both MX scale and data A/B. + # This avoids 32-bit overflow when computing the full tensor2dSize (N*K or M*K > 2^32). + useSubtile = bool(kernel.get("UseSubtileImpl")) + useFixedSrd2 = useSubtile + isPreShuffledAB = tc in ("A", "B") and kernel["ProblemType"].get("SwizzleTensor%s" % tc, False) + isSwizzledSubtile = (isgfx950mx or isPreShuffledAB) and useSubtile + if isgfx950mx: + useFixedSrd2 = True + tcab = "A" if tc == "MXSA" else "B" + mxBlock = kernel["ProblemType"]["MXBlock%s"%tcab] + swizzleSize0 = 32 # M,N direction + swizzleSize1 = 256 # K direction + swizzleBlockSize = swizzleSize0 * swizzleSize1 // mxBlock + else: + if isSwizzledSubtile: + swizzleSize0 = 16 # M,N direction + swizzleSize1 = 32 # K direction for MXFP4 (TODO: use bpe to support different dataTypes) + else: + swizzleSize0 = 1 # M,N direction + swizzleSize1 = 1 # K direction + swizzleBlockSize = swizzleSize0 * swizzleSize1 + + allocateTensor2dSize = use64bShadowLimit and not useFixedSrd2 + numDim = len(indices) + with self.allocTmpSgpr(2 + 2 + (0 if allocateTensor2dSize else 2)) as tmpSgprInfo: stmp = tmpSgprInfo.idx tileStart = stmp+2 if use64bShadowLimit: tensor2dSize0 = "ShadowLimit%s+0"%tc tensor2dSize1 = "ShadowLimit%s+1"%tc - else: + elif not useFixedSrd2: tensor2dSize0 = stmp+4 tensor2dSize1 = stmp+5 wroteTileStart = False @@ -4162,7 +4216,12 @@ def computeLoadSrd(self, kernel, tP, tc, indices, bpe): #tP['ia'][1] # This is guaranteed to fit in 32-bit since the WG*MT is 
a number of elements in some unsigned direction: - module.addModuleAsFlatItems(self.s_mul_u64_u32(sgpr(tileStart+0), sgpr(tileStart+1), sgpr(tP["wg"]), kernel[tP["mt"]], comment="WorkGroup[01] * MT")) + if useFixedSrd2: + # UseSubtileImpl fixedSrd2 case (including swizzle and nonSwizzle): tile start uses roundup(MT/swizzleSize0) + mt = roundUp(kernel[tP["mt"]] / swizzleSize0) + module.addModuleAsFlatItems(self.s_mul_u64_u32(sgpr(tileStart+0), sgpr(tileStart+1), sgpr(tP["wg"]), mt, comment="WorkGroup[01] * roundup(MT/%u)"%swizzleSize0)) + else: + module.addModuleAsFlatItems(self.s_mul_u64_u32(sgpr(tileStart+0), sgpr(tileStart+1), sgpr(tP["wg"]), kernel[tP["mt"]], comment="WorkGroup[01] * MT")) # Interleave (restricted): for B (tlu==False), overwrite wg1*MT1 with baseCol: # baseCol = (wg1/G)*(MT1*G) + (wg1%G), G=min(lowbit(SizeJ/MT1), LVCB) @@ -4209,6 +4268,40 @@ def computeLoadSrd(self, kernel, tP, tc, indices, bpe): module.add(SMovB32(dst=sgpr(tileStart+1), src=0)) strideF = self.strideRef(tc, tP['tileIdx']) if not self.isConstUnitStride(strideF): + if useFixedSrd2: + # Tile-boundary SRD+2 for UseSubtileImpl (unified for MX scale and data A/B). + # Avoids 32-bit overflow from computing full tensor2dSize when N*K or M*K > 2^32. + # + # tileStart is in block units (roundUp(MT/swizzleSize0)). + # numLine = min(roundUp(size/swizzleSize0) - tileStart_blk, roundUp(MT/swizzleSize0)) - 1 + # Srd+2 = numLine * stride_bytes + swizzleBlockSize*(DepthU/swizzleSize1) + # + # Key: numLine/numElems <= MT (compile-time), so the multiply stays in 32 bits. 
+ mt_units = mt # roundUp(MT/swizzleSize0), compile-time + extra_bytes = swizzleBlockSize * (kernel["DepthU"] // swizzleSize1) + + for i in range(0, numDim): + idx = indices[i] + if idx == kernel["ProblemType"]["Index0"] or idx == kernel["ProblemType"]["Index1"]: + size = self.sizeRef(idx) + if isSwizzledSubtile: + # tileStart already in block units (WG * roundUp(MT/swizzleSize0)) + module.add(SAddU32(dst=sgpr(stmp+0), src0=size, src1=(swizzleSize0 - 1), comment="size + %u - 1"%swizzleSize0)) + module.add(SLShiftRightB32(dst=sgpr(stmp+0), src=sgpr(stmp+0), shiftHex=log2(swizzleSize0), comment="roundup(size/%u)"%swizzleSize0)) + module.add(SSubU32(dst=sgpr(stmp+0), src0=sgpr(stmp+0), src1=sgpr(tileStart+0), comment="numBlkToEnd = roundUp(size/%u) - tileStart_blk"%swizzleSize0)) + else: + # tileStart in element units (WG * MT); no block rounding needed + module.add(SSubU32(dst=sgpr(stmp+0), src0=size, src1=sgpr(tileStart+0), comment="numToEnd = size - WG*MT")) + module.add(SMinU32(dst=sgpr(stmp+0), src0=sgpr(stmp+0), src1=mt_units, comment="min (numBlkToEnd, roundup(MT/%u))"%swizzleSize0)) + module.add(SSubU32(dst=sgpr(stmp+0), src0=sgpr(stmp+0), src1=1, comment="numLine = min - 1 (0-based index)")) + module.addModuleAsFlatItems(self.s_mul_u64_u32(sgpr(stmp+0), sgpr(stmp+1), sgpr(stmp+0), \ + strideF, comment="numLine * stride")) + if isgfx950mx: + module.add(SAddU32(dst=sgpr("Srd%s+2"%tc), src0=sgpr(stmp+0), src1=extra_bytes, comment="buffer_load limit for %s"%tc)) + else: + # (numLine * stride + DepthU) * bpe -- mirrors scale path structure + module.add(SAddU32(dst=sgpr(stmp+0), src0=sgpr(stmp+0), src1=extra_bytes, comment="+ DepthU (one K step)")) + module.add(scalarMultiplyBpe("Srd%s+2"%tc, stmp+0, float(tP["bpeGR"]), comment="buffer_load limit for %s (tile-boundary, avoids 32-bit overflow)"%tc)) module.addModuleAsFlatItems(self.s_mul_u64_u32(sgpr(tileStart), sgpr(tileStart+1), sgpr(tileStart+0), \ strideF, comment="tlu=0, scaled tile-offset by stride")) @@ 
-4231,15 +4324,18 @@ def computeLoadSrd(self, kernel, tP, tc, indices, bpe): module.add(SMovB64(dst=sgpr(tileStart, 2), src=0, comment="set default tileStart")) #Calculate tensor 2d size - if use64bShadowLimit or ((not use64bShadowLimit) and tensor2dSize0 % 2 == 0): - module.add(SMovB64(dst=sgpr(tensor2dSize0, 2), src=0x1, comment="Init tensor size")) - else: - module.add(SMovB32(dst=sgpr(tensor2dSize0), src=0x1, comment="Init tensor size")) - module.add(SMovB32(dst=sgpr(tensor2dSize1), src=0x0, comment="init tensor size")) - + # For UseSubtileImpl kernels (MX and non-MX), useFixedSrd2=True so tensor2dSize is not needed. + if not useFixedSrd2: + if use64bShadowLimit or ((not use64bShadowLimit) and tensor2dSize0 % 2 == 0): + module.add(SMovB64(dst=sgpr(tensor2dSize0, 2), src=0x1, comment="Init tensor size")) + else: + module.add(SMovB32(dst=sgpr(tensor2dSize0), src=0x1, comment="Init tensor size")) + module.add(SMovB32(dst=sgpr(tensor2dSize1), src=0x0, comment="init tensor size")) - numDim = len(indices) for i in range(0, numDim): + if useFixedSrd2: + # fixed Srd2 case, skip tensor2dSize0/1 calculation + continue idx = indices[i] if idx == kernel["ProblemType"]["Index0"] \ or idx == kernel["ProblemType"]["Index1"] \ @@ -4255,18 +4351,14 @@ def computeLoadSrd(self, kernel, tP, tc, indices, bpe): divider = 8 if tP["isM"] else 2 module.add(SLShiftRightB32(dst=sgpr(stmp), src=size, shiftHex=hex(int(log(divider,2))), comment="(size/%u)"%divider)) module.add(SSubU32(dst=sgpr(stmp), src0=sgpr(stmp), src1=0x1, comment="(size/%u-1)"%divider)) - elif tc in ("MXSA", "MXSB"): - mxBlock = kernel["ProblemType"]["MXBlockA"] if tc == "MXSA" else kernel["ProblemType"]["MXBlockB"] - if mxBlock > 0: - if kernel["AssertSummationElementMultiple"] % mxBlock != 0: - module.add(SAddU32(dst=sgpr(stmp), src0=size, src1=(mxBlock-1), comment="(size/%d-1)" %mxBlock)) - src0 = sgpr(stmp) - else: - src0 = size - module.add(SLShiftRightB32(dst=sgpr(stmp), src=src0, shiftHex=log2(mxBlock), 
comment="(size/%d-1)" %mxBlock)) - module.add(SSubU32(dst=sgpr(stmp), src0=sgpr(stmp), src1=0x1, comment="(size/%d-1)" %mxBlock)) - else: - module.add(SSubU32(dst=sgpr(stmp), src0=size, src1=0x1, comment="(size-1)")) + elif tc == "MXSA": + mxBlock = kernel["ProblemType"]["MXBlockA"] + module.add(SLShiftRightB32(dst=sgpr(stmp), src=size, shiftHex=log2(mxBlock), comment="(size/%d-1)" %mxBlock)) + module.add(SSubU32(dst=sgpr(stmp), src0=sgpr(stmp), src1=0x1, comment="(size/%d-1)" %mxBlock)) + elif tc == "MXSB": + mxBlock = kernel["ProblemType"]["MXBlockB"] + module.add(SLShiftRightB32(dst=sgpr(stmp), src=size, shiftHex=log2(mxBlock), comment="(size/%d-1)" %mxBlock)) + module.add(SSubU32(dst=sgpr(stmp), src0=sgpr(stmp), src1=0x1, comment="(size/%d-1)" %mxBlock)) elif tP["isSwizzled"]: module.addModuleAsFlatItems(self.alignTo(stmp, "SizeL", tP["swizzleK"])) module.add(SSubU32(dst=sgpr(stmp), src0=sgpr(stmp), src1=1, comment="SWZ-%s align: (sizeL-1)"%tc)) @@ -4289,38 +4381,41 @@ def computeLoadSrd(self, kernel, tP, tc, indices, bpe): module.add(SAddU32(dst=sgpr(tensor2dSize0), src0=sgpr(tensor2dSize0), src1=sgpr(stmp+0), comment="sum tensor size")) module.add(SAddCU32(dst=sgpr(tensor2dSize1), src0=sgpr(tensor2dSize1), src1=sgpr(stmp+1), comment="sum tensor size")) - if use64bShadowLimit: - limitTmp0 = "ShadowLimit%s+0"%tc - limitTmp1 = "ShadowLimit%s+1"%tc - else: - limitTmp0 = stmp+0 - limitTmp1 = stmp+1 - - module.add(SSubU32(dst=sgpr(limitTmp0), src0=sgpr(tensor2dSize0), src1=sgpr(tileStart+0), comment="sub tileStart")) - module.add(SSubBU32(dst=sgpr(limitTmp1), src0=sgpr(tensor2dSize1), src1=sgpr(tileStart+1), comment="sub tileStart")) - - if use64bShadowLimit: - # Set initial buffer limit - # if the limit is >64bit, incrementSrd decrements the shadow as the SRD increments, - # and when we get within 32-bit we start to step down the SRD - # if the limit is <32bits, set it accurately here: - # Note lshl_b64 the higher-numbered SGPR has the upper 32-bits - 
module.add(scalarMultiply64Bpe("ShadowLimit%s"%tc, "ShadowLimit%s"%tc, tP["bpeGR"], stmp, "Set limit to use bytes")) - if prePad: - module.add(SAddU32(dst=sgpr("ShadowLimit%s+0"%tc), src0=sgpr("ShadowLimit%s+0"%tc), src1=prePad, comment="extend limit for pre-pad")) - module.add(SAddCU32(dst=sgpr("ShadowLimit%s+1"%tc), src0=sgpr("ShadowLimit%s+1"%tc), src1=0, comment="extend limit for pre-pad")) - - if kernel["DirectToLds%s"%tc] and kernel["UseInstOffsetForGRO"]: - module.add(SAddU32(dst=sgpr("ShadowLimit%s+0"%tc), src0=sgpr("ShadowLimit%s+0"%tc), src1=self.buff_load_inst_offset_max, comment="extend limit for directToLDS instruction offset")) - module.add(SAddCU32(dst=sgpr("ShadowLimit%s+1"%tc), src0=sgpr("ShadowLimit%s+1"%tc), src1=0, comment="extend limit for directToLDS instruction offset")) - - module.add(SCmpEQU32(src0=sgpr("ShadowLimit%s+1"%tc), src1=0, comment="are we within 2^32?")) - module.add(SCSelectB32(dst=sgpr("Srd%s+2"%tc), src0=sgpr("ShadowLimit%s+0"%tc), src1="BufferLimit", comment="Move shadow to real if we are within 2^32")) - module.add(self.shiftSrd(tc)) - else: - # put limit directly into SRD: - module.add(scalarMultiplyBpe("Srd%s+2"%tc, stmp, float(tP["bpeGR"]), comment="Set limit to use bytes")) - module.add(SAddU32(dst=sgpr("Srd%s+2"%tc), src0=sgpr("Srd%s+2"%tc), src1=prePad, comment="extend limit for pre-pad")) + # skip ShadowLimit and Srd+2 calculation here in useFixedSrd2 case + if not useFixedSrd2: + if use64bShadowLimit: + limitTmp0 = "ShadowLimit%s+0"%tc + limitTmp1 = "ShadowLimit%s+1"%tc + else: + limitTmp0 = stmp+0 + limitTmp1 = stmp+1 + + module.add(SSubU32(dst=sgpr(limitTmp0), src0=sgpr(tensor2dSize0), src1=sgpr(tileStart+0), comment="sub tileStart")) + module.add(SSubBU32(dst=sgpr(limitTmp1), src0=sgpr(tensor2dSize1), src1=sgpr(tileStart+1), comment="sub tileStart")) + + if use64bShadowLimit: + # Set initial buffer limit + # if the limit is >64bit, incrementSrd decrements the shadow as the SRD increments, + # and when we get 
within 32-bit we start to step down the SRD + # if the limit is <32bits, set it accurately here: + # Note lshl_b64 the higher-numbered SGPR has the upper 32-bits + module.add(scalarMultiply64Bpe("ShadowLimit%s"%tc, "ShadowLimit%s"%tc, tP["bpeGR"], stmp, "Set limit to use bytes")) + if prePad: + module.add(SAddU32(dst=sgpr("ShadowLimit%s+0"%tc), src0=sgpr("ShadowLimit%s+0"%tc), src1=prePad, comment="extend limit for pre-pad")) + module.add(SAddCU32(dst=sgpr("ShadowLimit%s+1"%tc), src0=sgpr("ShadowLimit%s+1"%tc), src1=0, comment="extend limit for pre-pad")) + + if kernel["DirectToLds%s"%tc] and kernel["UseInstOffsetForGRO"]: + module.add(SAddU32(dst=sgpr("ShadowLimit%s+0"%tc), src0=sgpr("ShadowLimit%s+0"%tc), src1=self.buff_load_inst_offset_max, comment="extend limit for directToLDS instruction offset")) + module.add(SAddCU32(dst=sgpr("ShadowLimit%s+1"%tc), src0=sgpr("ShadowLimit%s+1"%tc), src1=0, comment="extend limit for directToLDS instruction offset")) + + module.add(SCmpEQU32(src0=sgpr("ShadowLimit%s+1"%tc), src1=0, comment="are we within 2^32?")) + module.add(SCSelectB32(dst=sgpr("Srd%s+2"%tc), src0=sgpr("ShadowLimit%s+0"%tc), src1="BufferLimit", comment="Move shadow to real if we are within 2^32")) + module.add(self.shiftSrd(tc)) + else: + # put limit directly into SRD: + module.add(scalarMultiplyBpe("Srd%s+2"%tc, stmp, float(tP["bpeGR"]), comment="Set limit to use bytes")) + if prePad: + module.add(SAddU32(dst=sgpr("Srd%s+2"%tc), src0=sgpr("Srd%s+2"%tc), src1=prePad, comment="extend limit for pre-pad")) # Apply any high-order address components to the tileStart and eventually the SRD - batch idx for batched gemm wg=2 # TODO - refactor since only WG2 is supported and this is always batch @@ -7436,10 +7531,25 @@ def fixPreloadOffset(offset, sgpxIdxVec, numStoreSgprToLoad): #instCycles = kernel["MatrixInstM"] // 2 # 32x32 is 64 cycles, 16x16 is 32 cycles, 4x4 is 8 cycles #module.add(SNop(waitState=instCycles)) module.addComment1("Mapping of Acc register -> C 
Vgpr register") - self.codes.accVgprRead = mapAcctoArchRegs(kernel, self.states.maxLimitAgprs, write=False) + # For subtile kernels with mixed agpr/vgpr accumulators the spilled + # D-tile values live in arch vgprs allocated from the pool (not at + # ValuC+N). Determine their base vgpr so mapAcctoArchRegs can address + # them correctly. + spilledVgprBase = None + if kernel.get("UseSubtileImpl"): + # For subtile kernels, D-tile accumulators that overflow the accvgpr + # pool are placed in arch vgprs allocated from the vgpr pool. + # mapAcctoArchRegs needs to know the base address of those vgprs so it + # can emit correct moves instead of referencing "ValuC+N" (which points + # to the wrong location in the subtile allocation scheme). + for vtile in self.states.d.tileInfo.vgprTiles: + if vtile.regList.regPool == self.vgprPool: + spilledVgprBase = vtile.regList.regValues[0] + break + self.codes.accVgprRead = mapAcctoArchRegs(kernel, self.states.maxLimitAgprs, write=False, spilledVgprBase=spilledVgprBase) if (kernel["StreamK"] > 0 and kernel["StreamKAtomic"] == 0) or \ ((kernel["GlobalSplitU"] == -1 or kernel["GlobalSplitU"] > 0) and (kernel["GlobalSplitUAlgorithm"] == "MultipleBufferSingleKernel" or kernel["AdaptiveGemmGSUA"] == 1)): - self.codes.accVgprWrite = mapAcctoArchRegs(kernel, self.states.maxLimitAgprs, write=True) + self.codes.accVgprWrite = mapAcctoArchRegs(kernel, self.states.maxLimitAgprs, write=True, spilledVgprBase=spilledVgprBase) # same spilledVgprBase if kernel["MIArchVgpr"]: module.addComment1("Multiply MI out register with Alpha -> C Vgpr register") self.codes.mulAlphaMultipleBuffer = moveMIoutToArch(kernel, self.states.startVgprAlphaTmp) @@ -8953,7 +9063,8 @@ def closeSumAtLeastUnroll(self, kernel, tPA, tPB, prefetch, isOptNLL, isNGLL, is (fullVws, elements, fullVws_1, elements_1) = self.notLocalFullTileElements(kernel) alpha = False beta = False - module.add(self.globalWriteElements(kernel, tPA, tPB, [fullVws[0]], [fullVws_1[0]], [elements[0]], 
[elements_1[0]], True, applyAlpha=alpha, betas=[beta], edge=False)) + storeModule, _ = self.globalWriteElements(kernel, tPA, tPB, [fullVws[0]], [fullVws_1[0]], [elements[0]], [elements_1[0]], True, applyAlpha=alpha, betas=[beta], edge=False) + module.add(storeModule) self.cleanupGlobalWrite(kernel) module.addSpaceLine() @@ -9136,22 +9247,6 @@ def globalReadIncrement(self, kernel, imod, loopIdx, tP, prefetchIndex): comment="incUpper <- ?")) imod.addModuleAsFlatItems(self.incrementSrd(tP, sgpr(incLower), sgpr(incUpper))) - if "MX" in tP: - # TODO: DirectToVgpr - tc = tP["MX"]["tensorChar"] - imod.addComment1("global read inc %s loop%s"%(tc, loopChar)) - if prefetchIndex: - imod.add(SAddU32(dst=sgpr(tmpS), src0=self.loopCounter(kernel, self.states.unrollIdx), src1=prefetchIndex, comment="remove pf(%u)"%prefetchIndex)) - imod.add(SCmpEQU32(src0=sgpr(suStr), src1=sgpr(tmpS), comment="Is this wrapIter? (pf)")) - else: - imod.add(SCmpEQU32(src0=self.loopCounter(kernel, self.states.unrollIdx), \ - src1=sgpr(suStr), comment="Is this the wrapIter?")) - imod.add(SCSelectB32(dst=sgpr(incLower), src0=sgpr("WrapU%s+0"%tc), src1=sgpr("GlobalReadIncs%s+%u"%(tc,self.states.unrollIdx)), \ - comment="incLower <- ?")) - imod.add(SCSelectB32(dst=sgpr(incUpper), src0=sgpr("WrapU%s+1"%tc), src1=0, - comment="incUpper <- ?")) - imod.addModuleAsFlatItems(self.incrementSrd(tP["MX"], sgpr(incLower), sgpr(incUpper))) - if kernel["ProblemType"]["Sparse"]: if (kernel["ProblemType"]["Sparse"] == 2 and tP["isB"]) or (kernel["ProblemType"]["Sparse"] == 1 and tP["isA"]) : tc = "Metadata" @@ -9190,23 +9285,6 @@ def globalReadIncrement(self, kernel, imod, loopIdx, tP, prefetchIndex): srcGRInc = "GlobalReadIncs%s"%tc imod.addModuleAsFlatItems(self.incrementSrd(tP, srcGRInc, hex(incUpper))) - if "MX" in tP: - tc = tP["MX"]["tensorChar"] - imod.addComment1("global read inc %s loop%s"%(tc, loopChar)) - if loopIdx != self.states.unrollIdx or (tc in ('MXSA', 'MXSB') and 
kernel["ProblemType"]["IndicesSummation"][self.states.unrollIdx] in kernel["ProblemType"]["MirrorDims%s"%tc]): - with self.allocTmpSgpr(1) as tmpSgprInfo: - incUpper = tmpSgprInfo.idx - # GRO may be negative for other summation if stride-other < stride-unroll or if mirror dim. - imod.add(SAShiftRightI32(dst=sgpr(incUpper), shiftHex=31, src=sgpr("GlobalReadIncs%s+%u"%(tc,loopIdx)), comment="sign-extend")) - imod.addModuleAsFlatItems(self.incrementSrd(tP["MX"], sgpr("GlobalReadIncs%s+%u"%(tc,loopIdx)), sgpr(incUpper))) - else: - incUpper = 0 # GRO is positive for loop unroll - srcGRInc = sgpr("GlobalReadIncs%s+%u"%(tc,loopIdx)) - useConstSgprGlobalReadIncs = self.states.mxsa.useConstSgprGlobalReadIncs if tc == 'MXSA' else self.states.mxsb.useConstSgprGlobalReadIncs - if useConstSgprGlobalReadIncs: - srcGRInc = "GlobalReadIncs%s"%tc - imod.addModuleAsFlatItems(self.incrementSrd(tP["MX"], srcGRInc, hex(incUpper))) - if kernel["ProblemType"]["Sparse"]: if (kernel["ProblemType"]["Sparse"] == 2 and tP["isB"]) or (kernel["ProblemType"]["Sparse"] == 1 and tP["isA"]) : tc = "Metadata" @@ -9291,12 +9369,16 @@ def globalReadIncrementAB(self, kernel, tPA, tPB, loopIdx, prefetchIndex): self.globalReadIncrement(kernel, incCodeA, loopIdx, tPA, prefetchIndex) else: incCodeA.add(self.tdmIncrementAB(kernel, tPA)) + if "MX" in tPA and not tdmA: + self.globalReadIncrement(kernel, incCodeA, loopIdx, tPA["MX"], prefetchIndex) incCodeB = imod.add(Module("globalReadIncrementB")) if tPB != None: if not tdmB: self.globalReadIncrement(kernel, incCodeB, loopIdx, tPB, prefetchIndex) else: incCodeB.add(self.tdmIncrementAB(kernel, tPB)) + if "MX" in tPB and not tdmB: + self.globalReadIncrement(kernel, incCodeB, loopIdx, tPB["MX"], prefetchIndex) return imod ############################################################################## @@ -12187,6 +12269,19 @@ def computeStoreSrdStart(self, kernel, srdTcList: list, sgprBpeList = [], useSiz else: useSize = [False for _ in srdTcList] + # For subtile 
StreamK kernels (StreamK==3, no atomic), the SGPR pool is exhausted + # after endSummation. Temporarily expose SrdWS (s60-s63) as Available scratch so + # that allocTmpSgpr calls within this function (and the SK component call below) + # can borrow those slots. Restore SrdWS as InUse at the end. + srdWsAvailableCtx = ( + kernel.get("StreamK", 0) == 3 + and kernel.get("StreamKAtomic", 1) == 0 + and "SrdWS" in self.sgprs + and "SrdWS" not in self.states.freeSgprVarPool + ) + if srdWsAvailableCtx: + self.addSgprVarToPool("SrdWS") + # Keep tmp SGPR usage lean for the common path (same as develop). # BAddrInterleave needs additional temporaries for baseCol computation; allocate # those *only when enabled* so marginal kernels don't overflow MaxSgpr. @@ -12371,6 +12466,8 @@ def computeStoreSrdStart(self, kernel, srdTcList: list, sgprBpeList = [], useSiz addrSrcSgpr = "Srd" # update src Sgpr for the second or later iterations if noMultipleBuffer: + if srdWsAvailableCtx: + self.removeSgprVarFromPool("SrdWS") return module gsuComponent = Component.GSU.find(self) @@ -12391,6 +12488,9 @@ def computeStoreSrdStart(self, kernel, srdTcList: list, sgprBpeList = [], useSiz module.add(SMulI32(dst=sgpr(packedSizes), src0=sgpr(packedSizes), \ src1=self.sizeRef(idx), comment="first packed size")) + if srdWsAvailableCtx: + self.removeSgprVarFromPool("SrdWS") + return module ############################################################################## @@ -13225,17 +13325,18 @@ def storeRemapComputeStoreVgprs(self, kernel): # then calls globalWriteElements to generate the code for the new tiles. 
############################################################################## def notLocalSplitUGlobalWrite(self, kernel, tPA, tPB): - if not self.do["PostLoop"]: return "" + if not self.do["PostLoop"]: return Module("notLocalSplitUGlobalWrite"), None (fullVws, elements, fullVws_1, elements_1) = self.notLocalFullTileElements(kernel) # print("len(elements)= ", len(elements_1)) noGSUBranch = (kernel["GlobalSplitU"] == 0 and kernel["StreamK"] != 3) module = Module("notLocalSplitUGlobalWrite") - module.add(self.globalWriteElements(kernel, tPA, tPB, fullVws, fullVws_1, elements, elements_1, noGSUBranch=noGSUBranch)) + storeModule, deferredGSU0 = self.globalWriteElements(kernel, tPA, tPB, fullVws, fullVws_1, elements, elements_1, noGSUBranch=noGSUBranch) + module.add(storeModule) self.cleanupGlobalWrite(kernel) - return module + return module, deferredGSU0 ############################################################################## # LocalSplitU: Global Write @@ -13277,7 +13378,8 @@ def localSplitUGlobalWrite(self, kernel, tPA, tPB): noGSUBranch = (kernel["GlobalSplitU"] == 0 and kernel["StreamK"] != 3) module = Module("localSplitUGlobalWrite") - module.add(self.globalWriteElements(kernel, tPA, tPB, vectorWidths, vectorWidths_1, elements_f0, elements_f1, noGSUBranch=noGSUBranch)) + storeModule, _ = self.globalWriteElements(kernel, tPA, tPB, vectorWidths, vectorWidths_1, elements_f0, elements_f1, noGSUBranch=noGSUBranch) + module.add(storeModule) self.cleanupGlobalWrite(kernel) self.vgprPool.checkIn(self.accVgprLdsReduction) return module @@ -13329,6 +13431,11 @@ def generateBetaModules(self, kernel, tPA, tPB, activation, applyAlpha, betas, e isInsertActFunctionCallAddrCalc, toActModuleList, writeLabels, endLabel, vectorDataTypes, factorDims, globalWriteMode, hasMultipleGlobalWriteModes): betaModules = Module("Betas") + # Base deferral condition — per-factorDim bias check is applied below. + # ScaleAlphaVec has similar LDS barriers that block deferral unconditionally. 
+ allowDeferBase = ( + kernel.get("UseSubtileImpl") + ) currentInstLength = 0 for betaIdx in reversed(range(len(betas))): beta = betas[betaIdx] @@ -13347,15 +13454,58 @@ def generateBetaModules(self, kernel, tPA, tPB, activation, applyAlpha, betas, e else: continue # edge module + # B0 FD0 edge paths are barrier-free — safe to defer. + # FD0 edge paths are safe to defer (edge check is workgroup-uniform, + # all waves take the same path so all waves hit the same barrier). + isMultipleBuffer = kernel["_GlobalAccumulation"] in ("MultipleBufferSingleKernel", "MultipleBuffer") + deferEdge = ( + edge + and allowDeferBase + and ( + factorDim == 0 # FD0: safe to defer (workgroup-uniform edge check) + or (not isMultipleBuffer and self.states.useBias == DataDirection.NONE) # no bias: safe + ) + ) edgeModule = Module("Edge_B%u_FD%u_VW%u" % (beta, factorDim, vectorWidth)) - currentInstLength, activationTypeStr = \ - self.globalWriteElementBatch(kernel, tPA, tPB, activation, - applyAlpha, beta, edge, atomic, - vectorWidth, element, activationLabelList, - tmpVgpr, cvtVgprStruct, activationSetPCStruct, activationEnumStrList, - actPCMaxTempSgpr, isInsertActFunctionCallAddrCalc, toActModuleList, - edgeModule, writeLabels[beta][factorDim][vectorWidth][globalWriteMode]["Then"], endLabel, - currentInstLength, betaIdx, fdIdx, vectorDataTypes, factorDims, hasMultipleGlobalWriteModes) + if deferEdge: + # Generate Edge store into a deferred module + edgeDeferredModule = Module("Edge_B%u_FD%u_VW%u_DeferredBlock" % (beta, factorDim, vectorWidth)) + # Use ThenDeferredReturn as endLabel so the batch jumps back to inline directly + currentInstLength, activationTypeStr = \ + self.globalWriteElementBatch(kernel, tPA, tPB, activation, + applyAlpha, beta, edge, atomic, + vectorWidth, element, activationLabelList, + tmpVgpr, cvtVgprStruct, activationSetPCStruct, activationEnumStrList, + actPCMaxTempSgpr, isInsertActFunctionCallAddrCalc, toActModuleList, + edgeDeferredModule, 
writeLabels[beta][factorDim][vectorWidth]["ThenDeferred"], + writeLabels[beta][factorDim][vectorWidth]["ThenDeferredReturn"], + currentInstLength, betaIdx, fdIdx, vectorDataTypes, factorDims) + if not hasattr(self.states, 'deferredEdgeModules'): + self.states.deferredEdgeModules = [] + self.states.deferredEdgeModules.append(edgeDeferredModule) + # Inline stub: keep "Then" label, jump to deferred, return + jump to GW_End + edgeModule.add(writeLabels[beta][factorDim][vectorWidth][globalWriteMode]["Then"]) + with self.allocTmpSgpr(3) as tmpSgprInfo: + posLabel = self.labels.getNameInc("ThenDeferredDir") + edgeModule.add(SLongBranch(writeLabels[beta][factorDim][vectorWidth]["ThenDeferred"], tmpSgprInfo, posLabel, comment="edge store (deferred)")) + edgeModule.addComment0("=" * 60) + edgeModule.addComment0(" Edge store B%u FD%u VW%u deferred to after persistent loop" % (beta, factorDim, vectorWidth)) + edgeModule.addComment0(" (would have been inline here in non-deferred version)") + edgeModule.addComment0("=" * 60) + edgeModule.add(writeLabels[beta][factorDim][vectorWidth]["ThenDeferredReturn"]) + with self.allocTmpSgpr(2, alignment=2) as tmpPair: + with self.allocTmpSgpr(1) as tmpOff: + posLabel = self.labels.getNameInc("ThenDeferredReturnDir") + edgeModule.add(SLongBranch(endLabel, tmpPair, tmpOff, posLabel, comment="jump to end")) + else: + currentInstLength, activationTypeStr = \ + self.globalWriteElementBatch(kernel, tPA, tPB, activation, + applyAlpha, beta, edge, atomic, + vectorWidth, element, activationLabelList, + tmpVgpr, cvtVgprStruct, activationSetPCStruct, activationEnumStrList, + actPCMaxTempSgpr, isInsertActFunctionCallAddrCalc, toActModuleList, + edgeModule, writeLabels[beta][factorDim][vectorWidth][globalWriteMode]["Then"], endLabel, + currentInstLength, betaIdx, fdIdx, vectorDataTypes, factorDims) # Edge conditions and branches if edge == True: # Else label @@ -13376,32 +13526,87 @@ def generateBetaModules(self, kernel, tPA, tPB, activation, 
applyAlpha, betas, e edgeModule.add(writeLabels[beta][factorDim][vectorWidth][globalWriteMode]["NonEdgeEnd"], pos=0) currentInstLength += 1 # Non edge module - nonEdgeModule = Module("Non_Edge_B%u_FD%u_VW%u" % (beta, factorDim, vectorWidth)) - currentInstLength, activationTypeStr = \ - self.globalWriteElementBatch(kernel, tPA, tPB, activation, - applyAlpha, beta, False, atomic, - vectorWidth, element, activationLabelList, - tmpVgpr, cvtVgprStruct, activationSetPCStruct, activationEnumStrList, - actPCMaxTempSgpr, isInsertActFunctionCallAddrCalc, toActModuleList, - nonEdgeModule, writeLabels[beta][factorDim][vectorWidth][globalWriteMode]["NonEdge"], endLabel, - currentInstLength, betaIdx, fdIdx, vectorDataTypes, factorDims, hasMultipleGlobalWriteModes) + # Keep B0 FD0 NonEdge inline (optimized store path with permute). + # Defer other NonEdge paths when no bias and no MultipleBuffer. + deferNonEdge = ( + allowDeferBase + and not (not beta and factorDim == 0) # B0 FD0: keep inline + and not isMultipleBuffer + and self.states.useBias == DataDirection.NONE + ) + if deferNonEdge: + nonEdgeDeferredModule = Module("NonEdge_B%u_FD%u_VW%u_DeferredBlock" % (beta, factorDim, vectorWidth)) + currentInstLength, activationTypeStr = \ + self.globalWriteElementBatch(kernel, tPA, tPB, activation, + applyAlpha, beta, False, atomic, + vectorWidth, element, activationLabelList, + tmpVgpr, cvtVgprStruct, activationSetPCStruct, activationEnumStrList, + actPCMaxTempSgpr, isInsertActFunctionCallAddrCalc, toActModuleList, + nonEdgeDeferredModule, writeLabels[beta][factorDim][vectorWidth]["NonEdgeDeferred"], + writeLabels[beta][factorDim][vectorWidth]["NonEdgeDeferredReturn"], + currentInstLength, betaIdx, fdIdx, vectorDataTypes, factorDims) + if not hasattr(self.states, 'deferredEdgeModules'): + self.states.deferredEdgeModules = [] + self.states.deferredEdgeModules.append(nonEdgeDeferredModule) + nonEdgeModule = Module("Non_Edge_B%u_FD%u_VW%u" % (beta, factorDim, vectorWidth)) + 
nonEdgeModule.add(writeLabels[beta][factorDim][vectorWidth][globalWriteMode]["NonEdge"]) + with self.allocTmpSgpr(3) as tmpSgprInfo: + posLabel = self.labels.getNameInc("NonEdgeDeferredDir") + nonEdgeModule.add(SLongBranch(writeLabels[beta][factorDim][vectorWidth]["NonEdgeDeferred"], tmpSgprInfo, posLabel, comment="beta NonEdge store (deferred)")) + nonEdgeModule.addComment0("=" * 60) + nonEdgeModule.addComment0(" NonEdge store B%u FD%u VW%u deferred to after persistent loop" % (beta, factorDim, vectorWidth)) + nonEdgeModule.addComment0(" (would have been inline here in non-deferred version)") + nonEdgeModule.addComment0("=" * 60) + nonEdgeModule.add(writeLabels[beta][factorDim][vectorWidth]["NonEdgeDeferredReturn"]) + with self.allocTmpSgpr(2, alignment=2) as tmpPair: + with self.allocTmpSgpr(1) as tmpOff: + posLabel = self.labels.getNameInc("NonEdgeDeferredReturnDir") + nonEdgeModule.add(SLongBranch(endLabel, tmpPair, tmpOff, posLabel, comment="jump to end")) + else: + nonEdgeModule = Module("Non_Edge_B%u_FD%u_VW%u" % (beta, factorDim, vectorWidth)) + currentInstLength, activationTypeStr = \ + self.globalWriteElementBatch(kernel, tPA, tPB, activation, + applyAlpha, beta, False, atomic, + vectorWidth, element, activationLabelList, + tmpVgpr, cvtVgprStruct, activationSetPCStruct, activationEnumStrList, + actPCMaxTempSgpr, isInsertActFunctionCallAddrCalc, toActModuleList, + nonEdgeModule, writeLabels[beta][factorDim][vectorWidth][globalWriteMode]["NonEdge"], endLabel, + currentInstLength, betaIdx, fdIdx, vectorDataTypes, factorDims) edgeModule.add(nonEdgeModule, pos=0) # NOTE: isEdgeTarget of normal and adaptive kernels are different # Normal kernel: to Then/Else label, followed by edge store # Adaptive kernel: to NonEdgeEnd label, followed by Size0 % vectorWidth check isEdgeTarget = writeLabels[beta][factorDim][vectorWidth][globalWriteMode] - # If module, checking Size1 % MT1 > 0 - isLongBranch = True if currentInstLength >= 
self.states.asmCaps["ShortBranchMaxLength"] else False + # When UseSubtileImpl is active (and not a multi-buffer GSU accumulation), + # use subtile-aligned edge check: remainder must be a multiple of the + # subtile block size (32 for M, 16 for N) rather than requiring a full tile. + useSubtileEdgeCheck = ( + kernel.get("UseSubtileImpl") + and kernel["_GlobalAccumulation"] not in ("MultipleBufferSingleKernel", "MultipleBuffer") + ) + # If module, checking Size1 % MT1 > 0 (or subtile alignment for N) + # Force long branch when Edge code is deferred + isLongBranch = True if currentInstLength >= 16384 else False with self.allocTmpSgpr(4) as tmpSgprInfo: - checkIsEdge = edgeModule.add(self.checkIsEdge(kernel, tmpSgprInfo, \ - isEdgeTarget["Then"] if kernel["AdaptiveGemm"] == 0 else isEdgeTarget["NonEdgeEnd"], \ + if useSubtileEdgeCheck: + checkIsEdge = edgeModule.add(self.checkIsEdgeSubtile(kernel, tmpSgprInfo, \ + isEdgeTarget["Then"] if kernel["AdaptiveGemm"] == 0 else isEdgeTarget["NonEdgeEnd"], \ + isSize1=True, isLongBranch=isLongBranch), pos=0) + else: + checkIsEdge = edgeModule.add(self.checkIsEdge(kernel, tmpSgprInfo, \ + isEdgeTarget["Then"] if kernel["AdaptiveGemm"] == 0 else isEdgeTarget["NonEdgeEnd"], \ kernel["MacroTile1"], isSize1=True, isLongBranch=isLongBranch), pos=0) currentInstLength += countInstruction(checkIsEdge) - # If module, checking Size0 % MT0 > 0 - isLongBranch = True if currentInstLength >= self.states.asmCaps["ShortBranchMaxLength"] else False + # If module, checking Size0 % MT0 > 0 (or subtile alignment for M) + isLongBranch = True if currentInstLength >= 16384 else False with self.allocTmpSgpr(4) as tmpSgprInfo: - checkIsEdge = edgeModule.add(self.checkIsEdge(kernel, tmpSgprInfo, \ - isEdgeTarget["Else"] if kernel["AdaptiveGemm"] == 0 else isEdgeTarget["NonEdgeEnd"], \ + if useSubtileEdgeCheck: + checkIsEdge = edgeModule.add(self.checkIsEdgeSubtile(kernel, tmpSgprInfo, \ + isEdgeTarget["Else"] if kernel["AdaptiveGemm"] == 0 else 
isEdgeTarget["NonEdgeEnd"], \ + isSize1=False, isLongBranch=isLongBranch), pos=0) + else: + checkIsEdge = edgeModule.add(self.checkIsEdge(kernel, tmpSgprInfo, \ + isEdgeTarget["Else"] if kernel["AdaptiveGemm"] == 0 else isEdgeTarget["NonEdgeEnd"], \ kernel["MacroTile0"], isLongBranch=isLongBranch), pos=0) currentInstLength += countInstruction(checkIsEdge) betaModule.add(edgeModule, pos=0) @@ -13477,6 +13682,224 @@ def checkIsBetaZero(self, kernel, tmpSgprInfo, betaLabel, isLongBranch=False, pl module.addSpaceLine() return module + ############################################################################## + # checkIsEdgeSubtile + # Used when UseSubtileImpl is active. Checks whether the wave's M/N rows + # are subtile-aligned so the NonEdge paired-store path can be used. + # + # Non-last workgroups always take the NonEdge path (their tile is full). + # For the last workgroup in each dimension, we check that the partial + # remainder is subtile-aligned: + # isSize1=False: (SizeI % MT0) % blockSizeM == 0 → NonEdge (else → edge) + # blockSizeM = 16 for fp32 dest, 32 for 16-bit dest + # isSize1=True : (SizeJ % MT1) % 16 == 0 → NonEdge (else → edge) + # + # tmpSgpr must have at least 4 free SGPRs (same as checkIsEdge). + # isEdgeTarget is the label to branch to when the tile IS an edge. 
+ ############################################################################## + def checkIsEdgeSubtile(self, kernel, tmpSgprInfo, isEdgeTarget, isSize1=False, isLongBranch=False): + assert(isinstance(isEdgeTarget, Label)) + isEdgeTargetLabel = isEdgeTarget.getLabelName() + module = Module("checkIsEdgeSubtile") + dim = "N (isSize1)" if isSize1 else "M" + module.addComment1("Edge/NonEdge store path check (%s): subtile-aligned remainder -> NonEdge paired store; unaligned -> Edge scalar store" % dim) + tmpS0 = tmpSgprInfo.idx + tmpS1 = tmpS0 + 1 + tmpS23 = tmpS1 + 1 + + sizeBoundary = [0, 0] + sizeBoundary[0] = \ + sgpr("PackedSize0") if len(kernel["PackedC0IndicesX"]) > 1 \ + else self.sizeRef(kernel["ProblemType"]["Index0"]) + sizeBoundary[1] = \ + sgpr("PackedSize1") if len(kernel["PackedC1IndicesX"]) > 1 \ + else self.sizeRef(kernel["ProblemType"]["Index1"]) + + if not isSize1: + divisor = kernel["MacroTile0"] + # The M-alignment granularity must be waveGroupM so that every wave in the tile + # has either 0 or a full waveGroupM valid rows. Using only mBlockSize (32 for bf16) + # is insufficient when waveGroupM is not a multiple of mBlockSize (e.g., MIWT3 → 48). 
+ waveGroupM = kernel["MIWaveTile"][0] * kernel["MatrixInstM"] + alignSize = waveGroupM + wgSgpr = "WorkGroup0" + nwgSgpr = "NumWorkGroups0" + # tmpS0 = SizeI % MT0 (the trailing-row count for the last WG) + module.add(scalarStaticDivideAndRemainder(tmpS1, tmpS0, sizeBoundary[0], divisor, + ContinuousRegister(tmpS23, 2), 2)) + # tmpS1 = nwg0 - 1 + module.add(SAddU32(dst=sgpr(tmpS1), src0=hex(-1), src1=sgpr(nwgSgpr))) + # SCC = 1 if this is the last WG in dim 0 + module.add(SCmpGeU32(src0=sgpr(wgSgpr), src1=sgpr(tmpS1), comment="wg0 >= nwg0-1 ?")) + else: + divisor = kernel["MacroTile1"] + # N-dimension: use 16-row alignment (one MIWaveTile row = 16 cols for bf16) + alignSize = 16 + wgSgpr = "WorkGroup1" + nwgSgpr = "NumWorkGroups1" + # tmpS0 = SizeJ % MT1 + module.add(scalarStaticDivideAndRemainder(tmpS1, tmpS0, sizeBoundary[1], divisor, + ContinuousRegister(tmpS23, 2), 2)) + # tmpS1 = nwg1 - 1 + module.add(SAddU32(dst=sgpr(tmpS1), src0=hex(-1), src1=sgpr(nwgSgpr))) + # SCC = 1 if this is the last WG in dim 1 + module.add(SCmpGeU32(src0=sgpr(wgSgpr), src1=sgpr(tmpS1), comment="wg1 >= nwg1-1 ?")) + + # myRem = last WG ? (SizeX % divisor) : 0 + # Non-last WGs always take NonEdge (full tile), so myRem = 0 keeps them out of the edge branch. + module.add(SCSelectB32(dst=sgpr(tmpS0), src0=sgpr(tmpS0), src1=0, + comment="myRem = last WG ? rem : 0")) + + # Check alignment: myRem % alignSize != 0 → edge. + # alignSize is a compile-time value. For power-of-2 use AND; for non-power-of-2 + # (e.g., waveGroupM=48 from MIWT3), enumerate the valid multiples and branch-chain. 
+ if alignSize & (alignSize - 1) == 0: + # Power of 2: use AND for fast modulo + module.add(SAndB32(dst=sgpr(tmpS0), src0=sgpr(tmpS0), src1=alignSize - 1, + comment="myRem %% %d (subtile alignment check)" % alignSize)) + module.add(self.getSCMPKInstruction("GTU32", tmpS0, 0, + comment="not subtile-aligned → edge")) + if isLongBranch: + module.add(self.longBranchScc1(isEdgeTarget, posNeg=1, tmpSgprInfo=tmpSgprInfo, + comment="jump to edge if not subtile-aligned")) + else: + module.add(SCBranchSCC1(labelName=isEdgeTargetLabel, + comment="jump to edge if not subtile-aligned")) + else: + # Non-power-of-2: enumerate valid aligned multiples {0, alignSize, 2*alignSize, ...}. + # myRem is in [0, divisor-1]; divisor = alignSize * numWaves. At most numWaves values to check. + # Strategy: branch to Edge if NOT any valid multiple. + # We emit: for each valid k: if myRem==k*alignSize goto NonEdge + # fall-through → Edge + if not isSize1: + numWaves = kernel["MIWaveGroup"][0] + else: + numWaves = kernel["MIWaveGroup"][1] + nonEdgeLabel = Label(self.labels.getNameInc("subtile_nonedge_aligned"), + "myRem is a valid multiple of alignSize=%d" % alignSize) + for k in range(numWaves): + multiple = k * alignSize + module.add(self.getSCMPKInstruction("EQU32", tmpS0, multiple, + comment="myRem == %d (aligned multiple k=%d)?" % (multiple, k))) + module.add(SCBranchSCC1(labelName=nonEdgeLabel.getLabelName(), + comment="aligned → NonEdge")) + # Not any valid multiple → Edge + if isLongBranch: + module.add(self.longBranchScc0(isEdgeTarget, posNeg=1, tmpSgprInfo=tmpSgprInfo, + comment="not subtile-aligned (alignSize=%d) → edge" % alignSize)) + else: + module.add(SBranch(labelName=isEdgeTargetLabel, + comment="not subtile-aligned (alignSize=%d) → edge" % alignSize)) + module.add(nonEdgeLabel) + return module + + ############################################################################## + # _emitSubtileGuards + # Compute both M and N OOB guard SGPRs for the UseSubtileImpl NonEdge path. 
+ # Results are stored in self.states.subtileM32ValidBlocksSgpr and + # self.states.subtileN16ValidBlocksSgpr for use by _emitSubtileOobGuard. + # + # waveId (serial >> 6) is read once and used for both dimensions: + # waveIdM = waveId & (numWavesM - 1) lower bits (M is innermost) + # waveIdN = waveId >> log2(numWavesM) upper bits (N is outermost) + # + # M algorithm: + # validM = SizeI - WG0 * MT0 + # waveBase = waveIdM * waveGroupM + # remainder = max(validM - waveBase, 0) + # numValidMBlocks = min(ceil(remainder / mBlockSize), MIWaveTile[0]) + # mBlockSize = 32 for 16-bit dest, 16 for fp32 dest. + # + # N algorithm: + # validN = SizeJ - WG1 * MT1 + # waveBaseN = waveIdN * waveGroupN (skipped if numWavesN == 1) + # validN_wave = max(validN - waveBaseN, 0) + # clamped = min(validN_wave, waveGroupN) + # numValid16NBlocks = clamped >> 4 + ############################################################################## + def _emitSubtileGuards(self, kernel, edgeModule): + numWavesM = kernel["MIWaveGroup"][0] + numWavesN = kernel["MIWaveGroup"][1] + log2numWavesM = int(log(numWavesM, 2)) + mBlockSize = 16 if kernel["ProblemType"]["DestDataType"].isSingle() else 32 + mBlockShift = int(log(mBlockSize, 2)) + waveGroupM = kernel["MIWaveTile"][0] * kernel["MatrixInstM"] + waveGroupN = kernel["MIWaveTile"][1] * kernel["MatrixInstN"] + mt0, mt1 = kernel["MacroTile0"], kernel["MacroTile1"] + + # Use pre-allocated permanent guard SGPRs (allocated at start of post-loop). + assert self.states.subtileM32ValidBlocksSgpr is not None, \ + "SubtileMGuard must be pre-allocated before _emitSubtileGuards" + tmpM = self.sgprPool.checkOut(1, "subtileWaveIdM") + tmpN = self.sgprPool.checkOut(1, "subtileWaveIdN") + + edgeModule.addComment1("UseSubtileImpl NonEdge guards: numValidD1Steps (MatrixInstM=%d) and numValid16NBlocks" % kernel["MatrixInstM"]) + + # Read waveId once; extract M (lower bits) and N (upper bits) before AND destroys waveId. 
+ edgeModule.add(VReadfirstlaneB32(dst=sgpr(tmpM), src=vgpr("Serial"), + comment="lane 0 serial of this wave")) + edgeModule.add(SLShiftRightB32(dst=sgpr(tmpM), src=sgpr(tmpM), + shiftHex=6, comment="waveId = serial >> 6")) + if numWavesN > 1: + edgeModule.add(SLShiftRightB32(dst=sgpr(tmpN), src=sgpr(tmpM), + shiftHex=log2numWavesM, + comment="waveIdN = waveId >> log2(numWavesM=%d)" % numWavesM)) + # MIWaveGroup[0] is always a power of 2, so AND is correct for modulo. + edgeModule.add(SAndB32(dst=sgpr(tmpM), src0=sgpr(tmpM), src1=numWavesM - 1, + comment="waveIdM = waveId & (numWavesM-1=%d)" % (numWavesM - 1))) + + # --- M guard --- + # Each d1 step in the C-load batch corresponds to MatrixInstM rows. + # We compute numValidD1Steps = min(ceil(max(validM-waveBase,0)/MatrixInstM), MIWaveTile[0]). + # The guard check is (numValidD1Steps > d1): true iff this wave's d1-th block has valid rows. + miM = kernel["MatrixInstM"] + miMShift = int(log(miM, 2)) + edgeModule.addComment0("M-guard: numValidD1Steps = min(ceil(max(validM-waveBase,0)/%d), MIWaveTile[0]=%d)" % (miM, kernel["MIWaveTile"][0])) + edgeModule.add(SMulI32(dst=sgpr("SubtileMGuard"), src0=sgpr("WorkGroup0"), src1=mt0, + comment="WG0 * MT0")) + edgeModule.add(SSubU32(dst=sgpr("SubtileMGuard"), src0=sgpr("SizeI"), src1=sgpr("SubtileMGuard"), + comment="validM = SizeI - WG0*MT0")) + edgeModule.add(SMulI32(dst=sgpr(tmpM), src0=sgpr(tmpM), src1=waveGroupM, + comment="waveBase = waveIdM * waveGroupM(%d)" % waveGroupM)) + edgeModule.add(SSubU32(dst=sgpr("SubtileMGuard"), src0=sgpr("SubtileMGuard"), src1=sgpr(tmpM), + comment="validM - waveBase; SCC=1 if OOB")) + edgeModule.add(SCSelectB32(dst=sgpr("SubtileMGuard"), src0=0, src1=sgpr("SubtileMGuard"), + comment="remainder = 0 if OOB")) + edgeModule.add(SAddU32(dst=sgpr("SubtileMGuard"), src0=sgpr("SubtileMGuard"), src1=miM - 1, + comment="ceil: remainder + (%d-1)" % miM)) + edgeModule.add(SLShiftRightB32(dst=sgpr("SubtileMGuard"), src=sgpr("SubtileMGuard"), 
shiftHex=miMShift, + comment="numValidD1Steps = ceil(remainder / %d)" % miM)) + # Clamp: guard comparison is (numValidD1Steps > d1); d1 < MIWaveTile[0] always. + edgeModule.add(SMinU32(dst=sgpr("SubtileMGuard"), src0=sgpr("SubtileMGuard"), src1=kernel["MIWaveTile"][0], + comment="clamp to MIWaveTile[0]=%d" % kernel["MIWaveTile"][0])) + self.sgprPool.checkIn(tmpM) + + # --- N guard --- + edgeModule.addComment0("N-guard: numValid16NBlocks = min(max(validN-waveBaseN,0), waveGroupN=%d) >> 4" % waveGroupN) + edgeModule.add(SMulI32(dst=sgpr("SubtileNGuard"), src0=sgpr("WorkGroup1"), src1=mt1, + comment="WG1 * MT1")) + edgeModule.add(SSubU32(dst=sgpr("SubtileNGuard"), + src0=self.sizeRef(kernel["ProblemType"]["Index1"]), + src1=sgpr("SubtileNGuard"), + comment="validN = SizeJ - WG1*MT1")) + if numWavesN > 1: + edgeModule.add(SMulI32(dst=sgpr(tmpN), src0=sgpr(tmpN), src1=waveGroupN, + comment="waveBaseN = waveIdN * waveGroupN(%d)" % waveGroupN)) + edgeModule.add(SSubU32(dst=sgpr("SubtileNGuard"), src0=sgpr("SubtileNGuard"), src1=sgpr(tmpN), + comment="validN - waveBaseN; SCC=1 if OOB")) + edgeModule.add(SCSelectB32(dst=sgpr("SubtileNGuard"), src0=0, src1=sgpr("SubtileNGuard"), + comment="validN_wave = 0 if OOB")) + # clamped = min(validN_wave, waveGroupN); SCC=1 on borrow → keep validN_wave, else waveGroupN. 
+ edgeModule.add(SSubU32(dst=sgpr(tmpN), src0=sgpr("SubtileNGuard"), src1=waveGroupN, + comment="validN_wave - waveGroupN; SCC=1 if validN_wave < waveGroupN")) + edgeModule.add(SCSelectB32(dst=sgpr("SubtileNGuard"), src0=sgpr("SubtileNGuard"), src1=waveGroupN, + comment="min(validN_wave, waveGroupN)")) + edgeModule.add(SLShiftRightB32(dst=sgpr("SubtileNGuard"), src=sgpr("SubtileNGuard"), shiftHex=4, + comment="numValid16NBlocks = clamped >> 4")) + self.sgprPool.checkIn(tmpN) + + self.states.subtileMBlockSize = mBlockSize + ############################################################################## # checkIsEdge # tmpSgpr must have at least 4 free SGPR @@ -13486,6 +13909,8 @@ def checkIsEdge(self, kernel, tmpSgprInfo, isEdgeTarget, divisor, isSize1=False, assert(isinstance(isEdgeTarget, Label)) isEdgeTargetLabel = isEdgeTarget.getLabelName() module = Module("checkIsEdge") + dim = "N (isSize1)" if isSize1 else "M" + module.addComment1("Edge/NonEdge store path check (%s): Size %% %d > 0 -> Edge store; else -> NonEdge store" % (dim, divisor)) tmpS0 = tmpSgprInfo.idx tmpS1 = tmpS0 + 1 tmpS23 = tmpS1 + 1 @@ -13569,10 +13994,14 @@ def checkIsFactorDimZero(self, kernel, tmpSgprInfo, factorDimLabel, isLongBranch # Global Write Elements ############################################################################## class BF16CVTVgprStruct(NamedTuple): # class for bf16 vgprs - vgprBf16Temp: int = -1 - vgprBf16Mask: int = -1 - vgprFp32Nan: int = -1 - vgprBf16Inc: int = -1 + vgprBf16Temp: int = -1 # rounding bias constant (0x7fff) for standard bf16 pack path + vgprBf16Mask: int = -1 # mask constant (0xffff0000) for extracting bf16 bits from f32 + vgprFp32Nan: int = -1 # NaN sentinel used during bf16 saturation/rounding + vgprBf16Inc: int = -1 # increment constant for bf16 rounding (standard pack path) + # UseSubtileImpl paired dwordx4 store extras (+0..+3 above are reused as pack/perm staging): + vgprPermAddr: int = -1 # per-lane ds_bpermute byte address (partner_lane*4); 
constant for the whole batch + vgprLaneGroupDelta: int = -1 # per-lane lane_group*8: M-row byte offset added to addrDVgpr for the dwordx4 store + vgprAddrScratch: int = -1 # per-store scratch: holds (addrDVgpr scaled + lane_group*8) without modifying addrDVgpr class FP8CVTVgprStruct(NamedTuple): vgprFp8NanInf: int = -1 @@ -13603,8 +14032,9 @@ def globalWriteElements(self, kernel, tPA, tPB, vectorWidths_2, vectorWidths_1, betas=None, # if left unspecified, then let global parameter decide edge=True # defaults to using edge write ): - if not self.do["PostLoop"]: return Module("GlobalWriteElements (Empty)") + if not self.do["PostLoop"]: return Module("GlobalWriteElements (Empty)"), Module("DeferredGSU0 (Empty)") module = Module("GlobalWriteElements") + deferredGSU0 = Module("DeferredGSU0") module.addComment2("Global Write Elements") if kernel["ProblemType"]["OutputAmaxD"]: @@ -13636,10 +14066,22 @@ def globalWriteElements(self, kernel, tPA, tPB, vectorWidths_2, vectorWidths_1, vectorWidths_1 = [vectorWidths_1[0], vectorWidths_1[-1]] elements_1 = [elements_1[0], elements_1[-1]] + # GSU0 always sets useBias=NONE (no bias in workspace writes), so no barrier issue. 
+ deferGSU0 = ( + kernel.get("UseSubtileImpl") + and kernel.get("StreamK", 0) > 0 and not (noGSUBranch or self.debugConfig.splitGSU) + ) # defer only when the GSU branch is emitted (gsuLimit > 1): the deferred labels and the module restore exist only on that path + gsu0DeferredLabel = None + gsu0ReturnLabel = None + gsuLimit = 1 if noGSUBranch or self.debugConfig.splitGSU else 2 if gsuLimit > 1: gsuLabel = Label(label=self.labels.getNameInc("GSU"), comment="") if kernel["StreamK"]: + if deferGSU0: + gsu0DeferredLabel = Label(label=self.labels.getNameInc("GW_B0_Deferred"), comment="") + gsu0ReturnLabel = Label(label=self.labels.getNameInc("GW_B0_Deferred_Return"), comment="") + # Keep original GSU check unchanged — falls through to GSU0, branches to gsuLabel for GSU1 module.add(SCmpEQU64(src0=sgpr("AddressFlags", 2), src1=hex(0), comment="Check for synchronizer")) module.add(SCBranchSCC0(labelName=gsuLabel.getLabelName(), comment="Branch to stream-k store code")) sSkt = self.acquireStreamKConstSgpr(kernel, "skTiles") @@ -13660,6 +14102,23 @@ def globalWriteElements(self, kernel, tPA, tPB, vectorWidths_2, vectorWidths_1, gsuLimitRange = range(0, gsuLimit) # generate GSU1 and GSUM label for gsuLimitIdx in gsuLimitRange: + # Redirect GSU0 output to deferred module, keeping original label as stub + if deferGSU0 and gsuLimitIdx == 0: + # Keep original GW_B0 label inline as a stub with jump to deferred + gsu0InlineLabel = Label(label=self.labels.getNameInc("GW_B0"), comment="") + module.add(gsu0InlineLabel) + with self.allocTmpSgpr(2, alignment=2) as tmpPair: + with self.allocTmpSgpr(1) as tmpOff: + module.add(SLongBranchPositive(gsu0DeferredLabel, tmpPair, tmpOff, comment="GSU0 reduction (deferred)")) + module.addComment0("=" * 60) + module.addComment0(" GSU0 reduction block deferred to after persistent loop") + module.addComment0(" (would have been inline here in non-deferred version)") + module.addComment0("=" * 60) + module.add(gsu0ReturnLabel) + # Redirect code generation to deferred module + savedModule = module + module = Module("GSU0_DeferredBlock") + module.add(gsu0DeferredLabel) if gsuLimit > 1: betas = betasBackup if gsuLimitIdx == 0:
@@ -14069,6 +14528,12 @@ def globalWriteElements(self, kernel, tPA, tPB, vectorWidths_2, vectorWidths_1, writeLabels[beta][factorDim][vectorWidth]["NonEdgeEnd"] = Label(self.labels.getNameInc("GW_B%u_FD%u_VW%u_NonEdgeEnd" % (beta, factorDim, vectorWidth) ), "") writeLabels[beta][factorDim][vectorWidth]["Then"] = Label(self.labels.getNameInc("GW_B%u_FD%u_VW%u_Then" % (beta, factorDim, vectorWidth) ), "") writeLabels[beta][factorDim][vectorWidth]["Else"] = Label(self.labels.getNameInc("GW_B%u_FD%u_VW%u_Else" % (beta, factorDim, vectorWidth) ), "") + writeLabels[beta][factorDim][vectorWidth]["ThenDeferred"] = Label(self.labels.getNameInc("GW_B%u_FD%u_VW%u_Then_Deferred" % (beta, factorDim, vectorWidth) ), "") + writeLabels[beta][factorDim][vectorWidth]["ThenDeferredReturn"] = Label(self.labels.getNameInc("GW_B%u_FD%u_VW%u_Then_Deferred_Return" % (beta, factorDim, vectorWidth) ), "") + writeLabels[beta][factorDim][vectorWidth]["ElseDeferred"] = Label(self.labels.getNameInc("GW_B%u_FD%u_VW%u_Else_Deferred" % (beta, factorDim, vectorWidth) ), "") + writeLabels[beta][factorDim][vectorWidth]["ElseDeferredReturn"] = Label(self.labels.getNameInc("GW_B%u_FD%u_VW%u_Else_Deferred_Return" % (beta, factorDim, vectorWidth) ), "") + writeLabels[beta][factorDim][vectorWidth]["NonEdgeDeferred"] = Label(self.labels.getNameInc("GW_B%u_FD%u_VW%u_NonEdge_Deferred" % (beta, factorDim, vectorWidth) ), "") + writeLabels[beta][factorDim][vectorWidth]["NonEdgeDeferredReturn"] = Label(self.labels.getNameInc("GW_B%u_FD%u_VW%u_NonEdge_Deferred_Return" % (beta, factorDim, vectorWidth) ), "") endLabel = Label(self.labels.getNameInc("GW_End"), "") # Layout @@ -14136,10 +14601,31 @@ def globalWriteElements(self, kernel, tPA, tPB, vectorWidths_2, vectorWidths_1, cvtVgprStruct = None cvtVgpr = None - if kernel["ProblemType"]["DestDataType"].isBFloat16() and kernel["ProblemType"]["HighPrecisionAccumulate"]: - cvtVgpr = self.vgprPool.checkOut(4) + is16bitHPA = 
(kernel["ProblemType"]["DestDataType"].isBFloat16() or + kernel["ProblemType"]["DestDataType"].isHalf()) and \ + kernel["ProblemType"]["HighPrecisionAccumulate"] + if is16bitHPA: + # For UseSubtileImpl, allocate 7 vgprs with 2-alignment (64-bit aligned) so + # that the first 4 (reused as pack scratch for the paired 16bit store) satisfy + # the buffer_store_dwordx4 alignment requirement. Any pool vgpr skipped for + # alignment becomes a hole that subsequent element-address checkouts fill, so + # pool.size() (= startVgprValu) stays within budget for large macro-tiles. + # If a very large tile causes accvgpr staging to exceed 256 vgprs despite the + # alignment overhead, reduce the batch via NumElementsPerBatchStore. + # +0..+3: scratch for pack output + ds_bpermute + v_permlane32_swap + # (reuses vgprBf16Temp/Mask/Nan/Inc slots; constants written at batch + # start are overwritten by the packed 16bit values before the store) + # +4: vgprPermAddr — ds_permute partner-lane byte address + # +5: vgprLaneGroupDelta — lane_group*8, pre-computed once per batch + # +6: vgprAddrScratch — per-store adjusted D address; avoids modifying addrDVgpr + numCvtVgprs = 7 if kernel.get("UseSubtileImpl") else 4 + cvtAlign = 2 if kernel.get("UseSubtileImpl") else 1 + cvtVgpr = self.vgprPool.checkOutAligned(numCvtVgprs, cvtAlign) cvtVgprStruct = self.BF16CVTVgprStruct(vgprBf16Temp=cvtVgpr, vgprBf16Mask=(cvtVgpr+1), \ - vgprFp32Nan=(cvtVgpr+2), vgprBf16Inc=(cvtVgpr+3)) + vgprFp32Nan=(cvtVgpr+2), vgprBf16Inc=(cvtVgpr+3), \ + vgprPermAddr=(cvtVgpr+4) if kernel.get("UseSubtileImpl") else -1, \ + vgprLaneGroupDelta=(cvtVgpr+5) if kernel.get("UseSubtileImpl") else -1, \ + vgprAddrScratch=(cvtVgpr+6) if kernel.get("UseSubtileImpl") else -1) elif kernel["ProblemType"]["DestDataType"].isAnyFloat8() and kernel["ProblemType"]["HighPrecisionAccumulate"]: cvtVgpr = self.vgprPool.checkOut(4) cvtVgprStruct = self.FP8CVTVgprStruct(vgprFp8Temp=cvtVgpr, vgprFp8NanInf=(cvtVgpr+1), \ @@ -14315,7 +14801,12 @@ 
def globalWriteElements(self, kernel, tPA, tPB, vectorWidths_2, vectorWidths_1, activationModules = self.generateActivationModules( kernel, activation, activationLabelList, activationEnumStrList, activationSetPCStruct, tmpVgpr, actPCGwvwVgpr, actTempSgpr) - module.appendModule(activationModules) + # Defer activation blocks to end of kernel when other blocks are deferred + # (called via s_setpc/s_swappc, position-independent). + if kernel.get("UseSubtileImpl"): + self.states.deferredActivationModules = activationModules + else: + module.appendModule(activationModules) self.sgprPool.checkIn(activationSetPCStruct.sgprOffsetActivation) self.sgprPool.checkIn(activationSetPCStruct.sgprOffsetBack) @@ -14337,13 +14828,24 @@ def globalWriteElements(self, kernel, tPA, tPB, vectorWidths_2, vectorWidths_1, if cvtVgpr is not None: self.vgprPool.checkIn(cvtVgpr) if gsuLimit > 1 and gsuLimitIdx == 0: - with self.allocTmpSgpr(3) as tmpSgprInfo: - module.add(SLongBranchPositive(Label("KernelEnd", ""), tmpSgprInfo)) + if deferGSU0: + # GSU0 store code is done. Append it to deferredGSU0 (placed after persistent loop), + # then restore `module` to savedModule (the inline stub region) so subsequent code + # (e.g. the SLongBranchPositive to KernelEnd) lands inline, not in the deferred block. + # The deferred block falls through to GW_End -> KernelEnd -> s_endpgm directly, + # so no explicit return branch back to inline is needed. 
+ deferredGSU0.appendModule(module) + module = savedModule + with self.allocTmpSgpr(3) as tmpSgprInfo: + module.add(SLongBranchPositive(Label("KernelEnd", ""), tmpSgprInfo, comment="GSU0 done, skip to end")) + else: + with self.allocTmpSgpr(3) as tmpSgprInfo: + module.add(SLongBranchPositive(Label("KernelEnd", ""), tmpSgprInfo)) kernel["GlobalSplitU"] = gsuBackup kernel["_GlobalAccumulation"] = gsuAccumBackup self.states.bpeCexternal = bpeCexternalBackup - return module + return module, deferredGSU0 def getMBSKGSUTotal(self, kernel): if kernel["MbskPrefetchMethod"]: @@ -14399,7 +14901,24 @@ def refineOccupancy(self, kernel, atomic, element, actPCMaxTempSgpr, \ if ss.numVgprsPerElement: numElementsPerBatch = numVgprAvailable // ss.numVgprsPerElement else: - numElementsPerBatch = len(element) # max, do 'em all + # numVgprsPerElement==0: accvgprs are pre-staged (e.g. UseSubtileImpl) so no pool + # vgprs are needed per element. Default to the full element list; use + # NumElementsPerBatchStore to cap the batch size (e.g. for very large macro-tiles + # where a smaller batch reduces register pressure or improves store pipelining). + numElementsPerBatch = len(element) + + # Cap batch size to align on MIWaveTile[0] (M-tile) boundaries. + # The acc-to-VGPR mapping interleaves M and N tiles, so a batch that + # partially covers an N-column still touches the full acc range of that + # column. Aligning to MIWaveTile[0] ensures batches break on N-column + # boundaries, avoiding accesses beyond the ValuC range. 
+ if kernel.get("UseSubtileImpl") and kernel.get("EnableMatrixInstruction"): + miwt0 = kernel["MIWaveTile"][0] + totalElems = kernel["MIWaveTile"][0] * kernel["MIWaveTile"][1] + if numElementsPerBatch >= totalElems: + numElementsPerBatch = totalElems + elif miwt0 > 1 and numElementsPerBatch >= miwt0: + numElementsPerBatch = (numElementsPerBatch // miwt0) * miwt0 assert(self.states.c.numVgprValu % gwvw == 0) # sanity check @@ -14411,10 +14930,20 @@ def refineOccupancy(self, kernel, atomic, element, actPCMaxTempSgpr, \ numElementsPerBatch = ss.cfg.numElementsPerBatchLimitedBySgprs # TODO: Which of DataType or DestDataType is in a better sense? 0114: Check Using DestDataType + HSS - if (kernel["ProblemType"]["DataType"].isHalf() or kernel["ProblemType"]["DataType"].isBFloat16()): + destType = kernel["ProblemType"]["DestDataType"] + srcType = kernel["ProblemType"]["DataType"] + subtileImplDest16b = kernel.get("UseSubtileImpl") and (destType.isHalf() or destType.isBFloat16()) + if (srcType.isHalf() or srcType.isBFloat16() or subtileImplDest16b): # only do an even number of halves - since these share hi/lo pieces of some registers? if numElementsPerBatch > 1: numElementsPerBatch = int(numElementsPerBatch/2)*2 + # UseSubtileImpl paired-store: batch must be aligned to MIWaveTile[0] + # (the number of M-tiles per N-column) so that batch boundaries don't + # split sba=0/sba=1 pairs within an N-column. 
+ if kernel.get("UseSubtileImpl") and kernel["MIWaveTile"][0] > 1: + miwt0 = kernel["MIWaveTile"][0] + if numElementsPerBatch >= miwt0: + numElementsPerBatch = (numElementsPerBatch // miwt0) * miwt0 # dot2: no this constraint elif not kernel["EnableMatrixInstruction"] and not kernel["UseDotInstruction"]: # The globalWriteBatch routine below can't handle odd elements per batch @@ -14502,7 +15031,7 @@ def globalWriteElementBatch(self, kernel, tPA, tPB, activation, \ actPCMaxTempSgpr, isInsertActFunctionCallAddrCalc, toActModuleList, \ edgeModule, writeLabel, endLabel, \ currentInstLength, \ - betaIdx, fdIdx, vectorDataTypes, factorDims, hasMultipleGlobalWriteModes): + betaIdx, fdIdx, vectorDataTypes, factorDims, hasMultipleGlobalWriteModes=False): factorDim = factorDims[fdIdx] edgeModule.add(writeLabel) @@ -14546,6 +15075,7 @@ def globalWriteElementBatch(self, kernel, tPA, tPB, activation, \ ss = StoreState(self, kernel, gwvw, edge, beta, atomic, element, vectorDataTypes, dim=factorDim) + actPCMaxTempSgpr_ = None if activationLabelList and isInsertActFunctionCallAddrCalc: assert activationSetPCStruct, activationEnumStrList and activationLabelList and toActModuleList @@ -14571,6 +15101,24 @@ def globalWriteElementBatch(self, kernel, tPA, tPB, activation, \ #edgeModule.addComment("storeStats, %d, %d, %d"% (edge, numSgprs, numElementsPerBatch)) # so if we don't have *GPR resources to handle a larger batch then need # to mark overflowedResources rather than generate a kernel that won't work. + + # UseSubtileImpl NonEdge guard: compute numValidMBlocks / numValidNBlocks so + # stores can skip OOB wave groups. Active for any NonEdge UseSubtileImpl path + # that is not multi-buffer GSU accumulation. 
+ isSubtileNonEdge = ( + not edge + and kernel.get("UseSubtileImpl") + and kernel["_GlobalAccumulation"] not in ("MultipleBufferSingleKernel", "MultipleBuffer") + ) + if isSubtileNonEdge: + self._emitSubtileGuards(kernel, edgeModule) + else: + # Don't clear permanent guard SGPRs — they persist across batches + if "SubtileMGuard" not in self.sgprs: + self.states.subtileM32ValidBlocksSgpr = None + self.states.subtileN16ValidBlocksSgpr = None + self.states.subtileMBlockSize = 0 + # Activation actLoopEndLabel, actLoopLabelModules, actLoopEnumStrList = self.initActivationLoop(kernel, beta) actLoopModuleList = [] @@ -14631,7 +15179,15 @@ def globalWriteElementBatch(self, kernel, tPA, tPB, activation, \ actLoopModuleCodeLength.append(countInstruction(actLoopModule)) ################# - # Free after final vgpr vcalculation + # Free after final vgpr calculation + # Only free locally-allocated guard SGPRs, not permanent ones (SubtileMGuard). + if self.states.subtileM32ValidBlocksSgpr is not None and "SubtileMGuard" not in self.sgprs: + self.sgprPool.checkIn(self.states.subtileM32ValidBlocksSgpr) + self.sgprPool.checkIn(self.states.subtileN16ValidBlocksSgpr) + self.states.subtileM32ValidBlocksSgpr = None + self.states.subtileN16ValidBlocksSgpr = None + self.states.subtileMBlockSize = 0 + if tmpVgprDynamic: self.vgprPool.checkIn(tmpVgprDynamic.idx) @@ -14665,10 +15221,12 @@ def globalWriteElementBatch(self, kernel, tPA, tPB, activation, \ if len(actLoopLabelModules) > 1: edgeModule.add(actLoopEndLabel) - if len(factorDims) == 1 and not hasMultipleGlobalWriteModes: - if currentInstLength >= self.states.asmCaps["ShortBranchMaxLength"]: + if len(factorDims) == 1: + isDeferredReturn = "Deferred" in endLabel.getLabelName() + if currentInstLength >= self.states.asmCaps["ShortBranchMaxLength"] or isDeferredReturn: + posLabel = self.labels.getNameInc("DeferredReturnDir") with self.allocTmpSgpr(3) as tmpSgprInfo: - edgeModule.add(SLongBranchPositive(endLabel, tmpSgprInfo, comment="jump 
to end")) + edgeModule.add(SLongBranch(endLabel, tmpSgprInfo, posLabel, comment="jump to end")) else: edgeModule.add(SBranch(labelName=endLabel.getLabelName(), comment="jump to end")) else: diff --git a/projects/hipblaslt/tensilelite/Tensile/KernelWriterModules.py b/projects/hipblaslt/tensilelite/Tensile/KernelWriterModules.py index c9c9e78ff66..c1383351107 100644 --- a/projects/hipblaslt/tensilelite/Tensile/KernelWriterModules.py +++ b/projects/hipblaslt/tensilelite/Tensile/KernelWriterModules.py @@ -212,7 +212,7 @@ def accVgprImagNumOffset(kernel): # MapAcctoArch # function to map MFMA Acc Registers to Arch VGPR register ############################################################################## -def mapAcctoArchRegs(kernel, maxAgpr=256, write=False): +def mapAcctoArchRegs(kernel, maxAgpr=256, write=False, spilledVgprBase=None): acc2arch, _ = accToArchMapper(kernel) complexMultiplier = 2 if kernel["ProblemType"]["DataType"].isComplex() else 1 @@ -231,13 +231,23 @@ def gprfunc(idx): return accvgpr(idx) accStr = gprfunc(srcIdx) if srcIdx >= maxAgpr: + # Spilled accumulator: lives in an arch vgpr, not an accvgpr. + # For subtile kernels the spilled D-tile vgprs are allocated from + # the pool at spilledVgprBase (not at ValuC+N), so reference them + # directly. For non-subtile kernels spilledVgprBase is None and + # the legacy "ValuC+N" addressing is used (vgprValuC == 0 there). 
+ spill_offset = srcIdx - maxAgpr + if spilledVgprBase is not None: + spilledVgpr = vgpr(spilledVgprBase + spill_offset) + else: + spilledVgpr = vgpr("ValuC+%u" % spill_offset) if write: - itemList[destIdx] = VMovB32(dst=vgpr("ValuC+%u"%(srcIdx-maxAgpr)), + itemList[destIdx] = VMovB32(dst=spilledVgpr, src=vgpr(Holder(name="ValuC")), comment="copy vreg[%u] to MI out reg" % destIdx) else: itemList[destIdx] = VMovB32(dst=vgpr(Holder(name="ValuC")), - src=vgpr("ValuC+%u"%(srcIdx-maxAgpr)), + src=spilledVgpr, comment="copy MI out reg to vreg[%u]" % destIdx) else: if write: diff --git a/projects/hipblaslt/tensilelite/Tensile/SolutionStructs/Problem.py b/projects/hipblaslt/tensilelite/Tensile/SolutionStructs/Problem.py index f266140cefc..9fac0e62ebe 100644 --- a/projects/hipblaslt/tensilelite/Tensile/SolutionStructs/Problem.py +++ b/projects/hipblaslt/tensilelite/Tensile/SolutionStructs/Problem.py @@ -512,119 +512,86 @@ def getRealDataTypeB(dataType): # Cinternal: basically should == ComputeDataType # This is used in _checkIfSupportedGEMMType() _validGEMMTypes = [ - ("H", "H", "H"), - ("S", "S", "S"), - ("D", "D", "D"), - ("C", "C", "C"), - ("Z", "Z", "Z"), - ("H", "H", "S"), - ("H", "S", "S"), - ("B", "B", "S"), - ("B", "S", "S"), - ("B", "H", "S"), - ("I8", "I", "I"), - ("4xi8", "I", "I"), - ("I8", "I8", "I"), - ("I8", "I", "S"), - ("I8", "I8", "S"), - ("I8", "H", "S"), - ("I8", "B", "S"), - ("F8", "S", "S"), - ("B8", "S", "S"), - ("F8B8", "S", "S"), - ("B8F8", "S", "S"), - ("F8", "H", "S"), - ("B8", "H", "S"), - ("F8B8", "H", "S"), - ("B8F8", "H", "S"), - ("B8", "B", "S"), - ("H", "F8", "S"), - ("F8", "B", "S"), - ("F8B8", "B", "S"), - ("B8F8", "B", "S"), # in/out are both R8 - ("F8", "F8", "S"), - ("B8", "B8", "S"), - ("F8B8", "B8", "S"), - ("B8F8", "B8", "S"), - ("F8", "B8", "S"), - ("B8", "F8", "S"), - ("F8B8", "F8", "S"), - ("B8F8", "F8", "S"), # F8 NANOO - ("F8N", "S", "S"), - ("B8N", "S", "S"), - ("F8B8N", "S", "S"), - ("B8F8N", "S", "S"), - ("F8N", "H", "S"), - 
("B8N", "H", "S"), - ("F8B8N", "H", "S"), - ("B8F8N", "H", "S"), - ("B8N", "B", "S"), - ("H", "F8N", "S"), - ("F8N", "B", "S"), - ("F8B8N", "B", "S"), - ("B8F8N", "B", "S"), # in/out are both R8 - ("F8N", "F8N", "S"), - ("B8N", "B8N", "S"), - ("F8B8N", "B8N", "S"), - ("B8F8N", "B8N", "S"), - ("F8N", "B8N", "S"), - ("B8N", "F8N", "S"), - ("F8B8N", "F8N", "S"), - ("B8F8N", "F8N", "S"), - ("F4", "S", "S"), - ("F6", "S", "S"), - ("B6", "S", "S"), -] - -_validMXGEMMTypes = [ - ("F8", "S", "S"), - ("F8B8", "S", "S"), - ("B8", "S", "S"), - ("B8F8", "S", "S"), - ("F6", "S", "S"), - ("F6B6", "S", "S"), - ("B6", "S", "S"), - ("B6F6", "S", "S"), - ("F4", "S", "S"), - ("F8", "H", "S"), - ("F8B8", "H", "S"), - ("B8", "H", "S"), - ("B8F8", "H", "S"), - ("F6", "H", "S"), - ("F6B6", "H", "S"), - ("B6", "H", "S"), - ("B6F6", "H", "S"), - ("F4", "H", "S"), - ("F8", "B", "S"), - ("F8B8", "B", "S"), - ("B8", "B", "S"), - ("B8F8", "B", "S"), - ("F6", "B", "S"), - ("F6B6", "B", "S"), - ("B6", "B", "S"), - ("B6F6", "B", "S"), - ("F4", "B", "S"), - ("F8", "F8", "S"), - ("F8B8", "F8", "S"), - ("B8", "F8", "S"), - ("B8F8", "F8", "S"), - ("F6", "F8", "S"), - ("F6B6", "F8", "S"), - ("B6", "F8", "S"), - ("B6F6", "F8", "S"), - ("F4", "F8", "S"), - ("F8", "B8", "S"), - ("F8B8", "B8", "S"), - ("B8", "B8", "S"), - ("B8F8", "B8", "S"), - ("F6", "B8", "S"), - ("F6B6", "B8", "S"), - ("B6", "B8", "S"), - ("B6F6", "B8", "S") -] - -_validMXGEMMBlock = [ - 16, 32 + ("H", "H", "H", "H"), + ("S", "S", "S", "S"), + ("D", "D", "D", "D"), + ("C", "C", "C", "C"), + ("Z", "Z", "Z", "Z"), + ("H", "H", "H", "S"), + ("H", "H", "S", "S"), + ("B", "B", "B", "S"), + ("B", "B", "S", "S"), + ("B", "B", "H", "S"), + ("I8", "I8", "I", "I"), + ("4xi8", "4xi8", "I", "I"), + ("I8", "I8", "I8", "I"), + ("I8", "I8", "I", "S"), + ("I8", "I8", "I8", "S"), + ("I8", "I8", "H", "S"), + ("I8", "I8", "B", "S"), + ("F8", "F8", "S", "S"), + ("B8", "B8", "S", "S"), + ("F8", "B8", "S", "S"), + ("B8", "F8", "S", "S"), + ("F8", "F8", "H", 
"S"), + ("B8", "B8", "H", "S"), + ("F8", "B8", "H", "S"), + ("B8", "F8", "H", "S"), + ("B8", "B8", "B", "S"), + ("H", "H", "F8", "S"), + ("F8", "F8", "B", "S"), + ("F8", "B8", "B", "S"), + ("B8", "F8", "B", "S"), # in/out are both R8 + ("F8", "F8", "F8", "S"), + ("B8", "B8", "B8", "S"), + ("F8", "B8", "B8", "S"), + ("B8", "F8", "B8", "S"), + ("F8", "F8", "B8", "S"), + ("B8", "B8", "F8", "S"), + ("F8", "B8", "F8", "S"), + ("B8", "F8", "F8", "S"), # F8 NANOO + ("F8N", "F8N", "S", "S"), + ("B8N", "B8N", "S", "S"), + ("F8N", "B8N", "S", "S"), + ("B8N", "F8N", "S", "S"), + ("F8N", "F8N", "H", "S"), + ("B8N", "B8N", "H", "S"), + ("F8N", "B8N", "H", "S"), + ("B8N", "F8N", "H", "S"), + ("B8N", "B8N", "B", "S"), + ("H", "H", "F8N", "S"), + ("F8N", "F8N", "B", "S"), + ("F8N", "B8N", "B", "S"), + ("B8N", "F8N", "B", "S"), # in/out are both R8 + ("F8N", "F8N", "F8N", "S"), + ("B8N", "B8N", "B8N", "S"), + ("F8N", "B8N", "B8N", "S"), + ("B8N", "F8N", "B8N", "S"), + ("F8N", "F8N", "B8N", "S"), + ("B8N", "B8N", "F8N", "S"), + ("F8N", "B8N", "F8N", "S"), + ("B8N", "F8N", "F8N", "S"), + ("F6", "F6", "S", "S"), + ("B6", "B6", "S", "S"), + ("F6", "B6", "S", "S"), + ("B6", "F6", "S", "S"), + ("F8", "F6", "S", "S"), + ("F6", "F8", "S", "S"), + ("F8", "F4", "S", "S"), + ("F4", "F8", "S", "S"), + ("F6", "F4", "S", "S"), + ("F4", "F6", "S", "S"), + ("B6", "F4", "S", "S"), + ("F4", "B6", "S", "S"), + ("F8", "B6", "S", "S"), + ("B6", "F8", "S", "S"), + ("F6", "B8", "S", "S"), + ("B8", "F6", "S", "S"), + ("F4", "B8", "S", "S"), + ("B8", "F4", "S", "S"), + ("F4", "F4", "S", "S"), + ("F4", "F4", "H", "S"), + ("F4", "F4", "B", "S"), ] @@ -632,58 +599,71 @@ def getRealDataTypeB(dataType): # *_TiToTc_BH*.yaml where Ti, To, and Tc are the data types of A/B, C/D, and computation, respectively. # The name of the library logic files for non-HPA (HPA=F) types is: *_TiB*.yaml. 
_HPATypes = [ - ("H", "S", "S"), - ("H", "H", "S"), - ("B", "B", "S"), - ("B", "S", "S"), - ("B", "H", "S"), - ("I8", "I", "I"), - ("4xi8", "I", "I"), - ("I8", "I", "S"), - ("I8", "I8", "S"), - ("I8", "H", "S"), - ("I8", "B", "S"), - ("F8", "S", "S"), - ("B8", "S", "S"), - ("F8B8", "S", "S"), - ("B8F8", "S", "S"), - ("F8", "H", "S"), - ("B8", "H", "S"), - ("F8B8", "H", "S"), - ("B8F8", "H", "S"), - ("H", "F8", "S"), - ("F8", "B", "S"), - ("F8B8", "B", "S"), # in/out are both R8 - ("F8", "F8", "S"), - ("B8", "B8", "S"), - ("F8B8", "B8", "S"), - ("B8F8", "B8", "S"), - ("F8", "B8", "S"), - ("B8", "F8", "S"), - ("F8B8", "F8", "S"), - ("B8F8", "F8", "S"), - ("F8N", "S", "S"), - ("B8N", "S", "S"), - ("F8B8N", "S", "S"), - ("B8F8N", "S", "S"), - ("F8N", "H", "S"), - ("B8N", "H", "S"), - ("F8B8N", "H", "S"), - ("B8F8N", "H", "S"), - ("H", "F8N", "S"), - ("F8N", "B", "S"), - ("F8B8N", "B", "S"), # in/out are both R8 - ("F8N", "F8N", "S"), - ("B8N", "B8N", "S"), - ("F8B8N", "B8N", "S"), - ("B8F8N", "B8N", "S"), - ("F8N", "B8N", "S"), - ("B8N", "F8N", "S"), - ("F8B8N", "F8N", "S"), - ("B8F8N", "F8N", "S"), - ("F4", "S", "S"), - ("F6", "S", "S"), - ("B6", "S", "S"), + ("H", "H", "S", "S"), + ("H", "H", "H", "S"), + ("B", "B", "B", "S"), + ("B", "B", "S", "S"), + ("B", "B", "H", "S"), + ("I8", "I8", "I", "I"), + ("4xi8", "4xi8", "I", "I"), + ("I8", "I8", "I", "S"), + ("I8", "I8", "I8", "S"), + ("I8", "I8", "H", "S"), + ("I8", "I8", "B", "S"), + ("F8", "F8", "S", "S"), + ("B8", "B8", "S", "S"), + ("F8", "B8", "S", "S"), + ("B8", "F8", "S", "S"), + ("F8", "F8", "H", "S"), + ("B8", "B8", "H", "S"), + ("F8", "B8", "H", "S"), + ("B8", "F8", "H", "S"), + ("H", "H", "F8", "S"), + ("F8", "F8", "B", "S"), + ("F8", "B8", "B", "S"), # in/out are both R8 + ("F8", "F8", "F8", "S"), + ("B8", "B8", "B8", "S"), + ("F8", "B8", "B8", "S"), + ("B8", "F8", "B8", "S"), + ("F8", "F8", "B8", "S"), + ("B8", "B8", "F8", "S"), + ("F8", "B8", "F8", "S"), + ("B8", "F8", "F8", "S"), + ("F8N", "F8N", "S", 
"S"), + ("B8N", "B8N", "S", "S"), + ("F8N", "B8N", "S", "S"), + ("B8N", "F8N", "S", "S"), + ("F8N", "F8N", "H", "S"), + ("B8N", "B8N", "H", "S"), + ("F8N", "B8N", "H", "S"), + ("B8N", "F8N", "H", "S"), + ("H", "H", "F8N", "S"), + ("F8N", "F8N", "B", "S"), + + ("F8N", "B8N", "B", "S"), # in/out are both R8 + ("F8N", "F8N", "F8N", "S"), + ("B8N", "B8N", "B8N", "S"), + ("F8N", "B8N", "B8N", "S"), + ("B8N", "F8N", "B8N", "S"), + ("F8N", "F8N", "B8N", "S"), + ("B8N", "B8N", "F8N", "S"), + ("F8N", "B8N", "F8N", "S"), + ("B8N", "F8N", "F8N", "S"), + ("F6", "F6", "S", "S"), + ("B6", "B6", "S", "S"), + ("F6", "B6", "S", "S"), + ("B6", "F6", "S", "S"), + ("F8", "F6", "S", "S"), + ("F6", "F8", "S", "S"), + ("F8", "F4", "S", "S"), + ("F4", "F8", "S", "S"), + ("F6", "F4", "S", "S"), + ("F4", "F6", "S", "S"), + ("B6", "F4", "S", "S"), + ("F4", "B6", "S", "S"), + ("F4", "F4", "S", "S"), + ("F4", "F4", "H", "S"), + ("F4", "F4", "B", "S"), ] def problemTypeToEnum(problemType): @@ -955,27 +935,13 @@ def __init__(self, config, printIndexAssignmentInfo: bool): # See the discussion in ValidParameters.py for validGEMMTypes ################################################################################ def _checkIfSupportedGEMMType(self): - # Here we use "DataType" instead of "MacDataTypeA(B)" for validation. It is totally fine cause we passed "MacDataTypeA(B)" into Client side. - # Ex: MacDataTypeA: b6, MacDataTypeB: f4 -> we can either choose "DataType: f4" or "DataType: b6" - inType = self["DataType"] + inTypeA = self["MacDataTypeA"] + inTypeB = self["MacDataTypeB"] outType = self["DestDataType"] computeType = self["ComputeDataType"] - gemmType = ( inType.toChar(), outType.toChar(), computeType.toChar() ) - if self["MXBlockA"] or self["MXBlockB"]: - if gemmType not in _validMXGEMMTypes: - raise Exception("This typed-MX-GEMM (Ti, To, Tc) = (%s, %s, %s) is not supported yet." 
% (gemmType[0], gemmType[1], gemmType[2])) - if self["MXBlockA"] == 0: - if self["MXBlockB"] not in _validMXGEMMBlock: - raise Exception("MXShape is not supported") - elif self["MXBlockB"] == 0: - if self["MXBlockA"] not in _validMXGEMMBlock: - raise Exception("MXShape is not supported") - elif (self["MXBlockA"] != self["MXBlockB"]): - raise Exception("MXShape is not supported") - elif (self["MXBlockA"] not in _validMXGEMMBlock): - raise Exception("MXShape is not supported") - elif gemmType not in _validGEMMTypes: + gemmType = ( inTypeA.toChar(), inTypeB.toChar(), outType.toChar(), computeType.toChar() ) + if gemmType not in _validGEMMTypes: - raise Exception("This typed-GEMM (Ti, To, Tc) = (%s, %s, %s) is not supported yet."%(gemmType[0], gemmType[1], gemmType[2])) + raise Exception("This typed-GEMM (TiA, TiB, To, Tc) = (%s, %s, %s, %s) is not supported yet." % gemmType) # gemmType is the 4-tuple (TiA, TiB, To, Tc) ######################################## @@ -1192,7 +1158,8 @@ def __str__(self): # Special condition for some newly supported kernels: # HHS, HSS, BSS and I8II kernels, use a clearer naming _TiToTc_ # TODO: Distinguish all kernels by _TiToTc_ to be more consistent with rocblas - gemmType = (self["DataType"].toChar(),self["DestDataType"].toChar(),self["ComputeDataType"].toChar() ) + gemmType = (self["MacDataTypeA"].toChar(), self["MacDataTypeB"].toChar(), + self["DestDataType"].toChar(), self["ComputeDataType"].toChar()) if gemmType in _HPATypes: name[-1] += "".join([self["DestDataType"].toChar(), self["ComputeDataType"].toChar()]) diff --git a/projects/hipblaslt/tensilelite/Tensile/SolutionStructs/Solution.py b/projects/hipblaslt/tensilelite/Tensile/SolutionStructs/Solution.py index 0788faf7af5..06b821bad60 100644 --- a/projects/hipblaslt/tensilelite/Tensile/SolutionStructs/Solution.py +++ b/projects/hipblaslt/tensilelite/Tensile/SolutionStructs/Solution.py @@ -554,7 +554,8 @@ def assignProblemIndependentDerivedParameters(state, printRejectionReason: bool, # set ASEM=minASEMforMX for not TLUA or not TLUB # so far, kernel code can support 16, but host code cannot hanlde it # TODO: enable 16 (or less) -
minASEMforMX = 32 + # TODO: enable less than 256 for Subtile + minASEMforMX = 32 if not state["UseSubtileImpl"] else 256 if (state["ProblemType"]["MXBlockA"] or state["ProblemType"]["MXBlockB"]) and \ ((not state["ProblemType"]["TLUA"]) or (not state["ProblemType"]["TLUB"])): if state["AssertSummationElementMultiple"] % minASEMforMX != 0: @@ -664,6 +665,55 @@ def assignProblemIndependentDerivedParameters(state, printRejectionReason: bool, state["UseMFMAF32XEmulation"] = True # MFMA version for gfx950 etc. state["MfmaInitCVgprs"] = False + # Only enable UseSubtileImpl on gfx950; ignore user request on other ISAs. + isgfx950 = state["ISA"] == IsaVersion(9,5,0) + state["UseSubtileImpl"] = state["UseSubtileImpl"] and isgfx950 + + if isgfx950 and (state["ProblemType"]["MXBlockA"] or state["ProblemType"]["MXBlockB"]) and not state["UseSubtileImpl"]: + reject(state, printRejectionReason, "gfx950 MX requires UseSubtileImpl") + + if state["UseSubtileImpl"]: + state["VectorWidthA"] = 1 + state["VectorWidthB"] = 1 + state["SourceSwap"] = False + # Force BufferStore=1: UseSubtileImpl optimized storeD path is only implemented + # for buffer stores for now. + state["BufferStore"] = 1 + # Not currently implemented in subtile implementation + state["Use64bShadowLimit"] = False + state["Use64bShadowLimitMX"] = False + + # DepthU should be multiple of 2 * MIK. 
DepthU=-1 case, set DepthU=2*MIK*LSU + duUnit = 2 * state["MatrixInstK"] * state["LocalSplitU"] + if state["DepthU"] == -1: + state["DepthU"] = duUnit + if state["DepthU"] % duUnit != 0: + reject(state, printRejectionReason, "UseSubtileImpl=1 support only DepthU multiple of 2 * MatrixInstK * LocalSplitU") + + bytesLoaded = state["NumThreads"] * 16 + if state["ProblemType"]["MXBlockA"]: + numBytesMXSA = (state["DepthU"] // state["ProblemType"]["MXBlockA"]) * state["MacroTile0"] + if bytesLoaded < numBytesMXSA: + reject(state, printRejectionReason, "Unable to load MXSA scales using one load per wave") + if state["ProblemType"]["MXBlockB"]: + numBytesMXSB = (state["DepthU"] // state["ProblemType"]["MXBlockB"]) * state["MacroTile1"] + if bytesLoaded < numBytesMXSB: + reject(state, printRejectionReason, "Unable to load MXSB scales using one load per wave") + + for tc in ("MXSA", "MXSB"): + if state["ProblemType"]["MXBlock" + tc[-1]]: + for field in ("LSC", "LSP", "NumLoadsCoalesced", "NumLoadsPerpendicular", "NumTotalPackedLoads"): + state["%s%s" % (field, tc)] = 0 + + if state["PrefetchGlobalRead"] not in [0, 2]: + reject(state, printRejectionReason, + "UseSubtileImpl=1 requires PrefetchGlobalRead 0 or 2, got %d" % state["PrefetchGlobalRead"]) + if not (state["MatrixInstM"] == 16 and state["MatrixInstN"] == 16): + reject(state, printRejectionReason, "UseSubtileImpl=1 requires MatrixInst 16x16") + if state["ScheduleIterAlg"] == 1 or state["ScheduleIterAlg"] == 2: + reject(state, printRejectionReason, "UseSubtileImpl=1 does not support ScheduleIterAlg") + if state["StreamK"] == 0: + reject(state, printRejectionReason, "UseSubtileImpl=1 supports StreamK only (no support for GSU)") # done state["AssignedProblemIndependentDerivedParameters"] = True @@ -679,7 +729,8 @@ def assignProblemIndependentDerivedParameters(state, printRejectionReason: bool, @staticmethod def setGlobalReadVectorWidth(state, tc, totalVectors, grvw, printRejectionReason: bool): validDepthU = True - if 
grvw not in [1,2,4,8,16,32]: + # Skip GRVW range check for subtile impl: scale uses serial*loadWidth DTL addressing, not standard GRVW chunks + if grvw not in [1,2,4,8,16,32] and not state["UseSubtileImpl"]: validDepthU = False if totalVectors % state["NumThreads"] != 0: reject(None, printRejectionReason, "totalVectors%s %u %% NumThreads %u != 0" \ @@ -1032,6 +1083,9 @@ def isDirectToVgprDoable(state, tc, printRejectionReason: bool, isaInfoMap: Dict def isDirectToLdsDoable(state, tc, isaInfoMap, printRejectionReason: bool): isa = state["ISA"] + if state["UseSubtileImpl"]: + return True + # x4 support for directToLds canDTLx4 = isaInfoMap[isa].asmCaps["HasDirectToLdsx4"] @@ -1367,6 +1421,12 @@ def assignDerivedParameters( if not state["MIWaveTile"] or len(state["MIWaveTile"]) != 2: reject(state, printRejectionReason, "invalid MIWaveTile") return + if state["UseSubtileImpl"] and (state["ProblemType"]["MXBlockA"] or state["ProblemType"]["MXBlockB"]): + if state["MIWaveTile"][0] % 2 != 0 or state["MIWaveTile"][1] % 2 != 0: + reject(state, printRejectionReason, + "UseSubtileImpl=1 with MX datatype requires even MIWaveTile, got [%d, %d]" + % (state["MIWaveTile"][0], state["MIWaveTile"][1])) + return if isaInfoMap[isa].asmCaps["HasMFMA"]: if not state["ProblemType"]["HighPrecisionAccumulate"] \ and state["ProblemType"]["DataType"].numRegisters() < 1 \ @@ -1954,6 +2014,8 @@ def _applySubIterSetting(enable): for key, value in state.items(): if isinstance(value, int) and value < 0: backupValues.append([key, value]) + # Skip this check for subtile impl? + # TODO: Add this check back while True: for backup in backupValues: state[backup[0]] = backup[1] @@ -2002,6 +2064,9 @@ def _applySubIterSetting(enable): if state["TailloopInNll"] and state["UseCustomMainLoopSchedule"] == 1: reject(state, printRejectionReason, "UseCustomMainLoopSchedule=1 is incompatible with TailloopInNll=True") return + # UseSubtileImpl has its own main loop scheduler; CMS is not compatible. 
+ if state["UseSubtileImpl"] and state["UseCustomMainLoopSchedule"] == 1: + reject(state, printRejectionReason, "UseCustomMainLoopSchedule=1 is incompatible with UseSubtileImpl") # additional setting for non CMS if state["UseCustomMainLoopSchedule"] == 0: @@ -2243,7 +2308,10 @@ def calcLdsPad(isaInfoMap: Dict[str, IsaInfo]) -> int: readRegsA //= 2 if (not isaInfoMap[isa].asmCaps['HasWMMA']) and (readRegsA > 6 or readRegsB > 6): reject(state, "LocalReadVectorWidth results in attemping to read LDS larger than b192, reject") - return ldsPadA, ldsPadB, ldsPadM + return ldsPadA, ldsPadB, ldsPadM, 0, 0 + # SubtileImpl does not need LDS padding. + if state["UseSubtileImpl"]: + return 0, 0, 0, 0, 0 if state["EnableMatrixInstruction"]: # for readRegs = 1 or 4, we need to double pad for MI16x16xNx1 to avoid bank conflict. if state["MatrixInstB"] == 1 and state["MatrixInstM"] == 16: @@ -2252,7 +2320,7 @@ def calcLdsPad(isaInfoMap: Dict[str, IsaInfo]) -> int: if readRegsB == 4 or readRegsB == 1: optPadB *= 2 if ldsPadA == -1: - if isMX and state["ProblemType"]["DataTypeA"].is6bitFloat(): + if isMX and (state["ProblemType"]["DataTypeA"].is6bitFloat() or state["ProblemType"]["DataTypeA"].isFloat4()): ldsPadA = 0 else: if not state["UnrollMajorLDSA"]: @@ -2285,7 +2353,7 @@ def calcLdsPad(isaInfoMap: Dict[str, IsaInfo]) -> int: assert(ldsPadA >= 0) if ldsPadB == -1: - if isMX and state["ProblemType"]["DataTypeB"].is6bitFloat(): + if isMX and (state["ProblemType"]["DataTypeB"].is6bitFloat() or state["ProblemType"]["DataTypeB"].isFloat4()): ldsPadB = 0 else: if not state["UnrollMajorLDSB"]: @@ -2961,7 +3029,7 @@ def calSwizzlePackK(state, tc): totalVectorsCoalescedA = totalElementsCoalescedA // GlobalReadVectorWidthA # handle global read vector width MXSA - if state["ProblemType"]["MXBlockA"]: + if state["ProblemType"]["MXBlockA"] and not state["UseSubtileImpl"]: if state["ProblemType"]["TLUMXSA"]: # NT/NN totalElementsCoalescedMXSA = state["MacroTileMXSA"] totalElementsPerpMXSA = 
state["_DepthUMXSA"] @@ -3037,7 +3105,7 @@ def calSwizzlePackK(state, tc): totalVectorsCoalescedB = totalElementsCoalescedB // GlobalReadVectorWidthB # handle global read vector width MXSB - if state["ProblemType"]["MXBlockB"]: + if state["ProblemType"]["MXBlockB"] and not state["UseSubtileImpl"]: if state["ProblemType"]["TLUMXSB"]: # NT/NN totalElementsCoalescedMXSB = state["MacroTileMXSB"] totalElementsPerpMXSB = state["_DepthUMXSB"] @@ -3191,7 +3259,7 @@ def calSwizzlePackK(state, tc): else: state["StoreVectorWidth"] = state["VectorWidthA"] - if state["EnableMatrixInstruction"]: + if state["EnableMatrixInstruction"] and not state["UseSubtileImpl"]: if state["SourceSwap"]: if ((state["VectorWidthA"] % state["StoreVectorWidth"]) != 0): reject(state, printRejectionReason, "MFMA SourceSwap mode doesn't support vwA(%u) with svw(%u)" % (state["VectorWidthA"], state["StoreVectorWidth"])) @@ -3304,19 +3372,14 @@ def calSwizzlePackK(state, tc): if not Solution.setGlobalLoadTileDimClassic(state, "A", state["NumLoadsA"], \ totalVectorsCoalescedA, totalElementsPerpA, state["_DepthUA"], printRejectionReason): return - if state["ProblemType"]["MXBlockA"]: + if state["ProblemType"]["MXBlockA"] and not state["UseSubtileImpl"]: if not Solution.setGlobalLoadTileDimClassic(state, "MXSA", state["NumLoadsMXSA"], \ totalVectorsCoalescedMXSA, totalElementsPerpMXSA, state["_DepthUMXSA"], printRejectionReason): return if not Solution.setGlobalLoadTileDimClassic(state, "B", state["NumLoadsB"], \ totalVectorsCoalescedB, totalElementsPerpB, state["_DepthUB"], printRejectionReason): return - if state["ProblemType"]["MXBlockB"]: - if not Solution.setGlobalLoadTileDimClassic(state, "MXSB", state["NumLoadsMXSB"], \ - totalVectorsCoalescedMXSB, totalElementsPerpMXSB, state["_DepthUMXSB"], printRejectionReason): - return - - if state["ProblemType"]["MXBlockB"]: + if state["ProblemType"]["MXBlockB"] and not state["UseSubtileImpl"]: if not Solution.setGlobalLoadTileDimClassic(state, "MXSB", 
state["NumLoadsMXSB"], \ totalVectorsCoalescedMXSB, totalElementsPerpMXSB, state["_DepthUMXSB"], printRejectionReason): return @@ -3517,6 +3580,8 @@ def calSwizzlePackK(state, tc): # No longer support loadX2/loadx4 . for tc in ['A', 'B']: tcmx = "MXS%s"%tc + if state["UseSubtileImpl"] and state["ProblemType"]["MXBlock%s"%tc]: + state["DirectToLds%s"%tcmx] = False if state["DirectToLds%s"%tc]: isDtlDoable = Solution.isDirectToLdsDoable(state, tc, isaInfoMap, printRejectionReason) if (not state["DirectToVgpr%s"%tc]) and isDtlDoable: @@ -3549,7 +3614,7 @@ def calSwizzlePackK(state, tc): #1LDS buffer must be 0 for DirectToLdsA state["1LDSBuffer"] = 0 # MX case - if state["ProblemType"]["MXBlockA"] or state["ProblemType"]["MXBlockB"]: + if (state["ProblemType"]["MXBlockA"] or state["ProblemType"]["MXBlockB"]): if state["DirectToLdsA"] != state["DirectToLdsMXSA"] or state["DirectToLdsB"] != state["DirectToLdsMXSB"]: reject(state, printRejectionReason, "DirectToLdsA/B and DirectToLdsMXSA/B should match") if state["DirectToLdsA"] != state["DirectToLdsB"]: diff --git a/projects/hipblaslt/tensilelite/Tensile/Tests/common/gemm/gfx950/fp8_mxfp4_bf16_tn_act.yaml b/projects/hipblaslt/tensilelite/Tensile/Tests/common/gemm/gfx950/fp8_mxfp4_bf16_tn_act.yaml index 211830a63f4..868231ec600 100644 --- a/projects/hipblaslt/tensilelite/Tensile/Tests/common/gemm/gfx950/fp8_mxfp4_bf16_tn_act.yaml +++ b/projects/hipblaslt/tensilelite/Tensile/Tests/common/gemm/gfx950/fp8_mxfp4_bf16_tn_act.yaml @@ -1,5 +1,6 @@ TestParameters: - marks: [xfail-gfx950, skip-gfx900, skip-gfx906, skip-gfx908, skip-gfx90a, skip-gfx942, skip-gfx1010, skip-gfx1011, skip-gfx1012, skip-gfx1030, skip-gfx1100, skip-gfx1101, skip-gfx1102, skip-gfx1200, skip-gfx1201, skip-gfx1250, skip-gfx940, skip-gfx941] # Only for gfx950 + # need to skip for gfx950: TODO: re-enable this with subtile + MXScaleFormat=1 + marks: [skip-gfx950, skip-gfx900, skip-gfx906, skip-gfx908, skip-gfx90a, skip-gfx942, skip-gfx1010, skip-gfx1011, 
skip-gfx1012, skip-gfx1030, skip-gfx1100, skip-gfx1101, skip-gfx1102, skip-gfx1200, skip-gfx1201, skip-gfx1250, skip-gfx940, skip-gfx941] # Only for gfx950 GlobalParameters: NumElementsToValidate: -1 @@ -16,6 +17,7 @@ GlobalParameters: DataInitTypeAlpha: 1 DataInitTypeBeta: 1 BoundsCheck: 2 + MXScaleFormat: 1 BenchmarkProblems: ######################################## @@ -52,7 +54,8 @@ BenchmarkProblems: - [16, 16, 128, 1, 1, 8, 8, 2, 2] - [32, 32, 64, 1, 1, 4, 4, 2, 2] - [32, 32, 64, 1, 1, 8, 8, 2, 2] - - DepthU: [32, 64, 128] + #- DepthU: [32, 64, 128] + - DepthU: [256, 512] - AssertFree0ElementMultiple: [1] - AssertFree1ElementMultiple: [1] - PrefetchGlobalRead: [2] @@ -75,6 +78,8 @@ BenchmarkProblems: - LdsPadA: [4] - LdsPadB: [4] - WorkGroupMapping: [64] + - StreamK: [0,3] + - UseSubtileImpl: [1] # TODO: enable subtile BenchmarkJoinParameters: BenchmarkFinalParameters: - ProblemSizes: diff --git a/projects/hipblaslt/tensilelite/Tensile/Tests/common/gemm/gfx950/fp8_mxfp4_bf16_tn_act_groupgemm.yaml b/projects/hipblaslt/tensilelite/Tensile/Tests/common/gemm/gfx950/fp8_mxfp4_bf16_tn_act_groupgemm.yaml index de344424ac4..6b824378fee 100644 --- a/projects/hipblaslt/tensilelite/Tensile/Tests/common/gemm/gfx950/fp8_mxfp4_bf16_tn_act_groupgemm.yaml +++ b/projects/hipblaslt/tensilelite/Tensile/Tests/common/gemm/gfx950/fp8_mxfp4_bf16_tn_act_groupgemm.yaml @@ -1,5 +1,6 @@ TestParameters: - marks: [xfail-gfx950, xfail-gfx1250, skip-gfx900, skip-gfx906, skip-gfx908, skip-gfx90a, skip-gfx942, skip-gfx1010, skip-gfx1011, skip-gfx1012, skip-gfx1030, skip-gfx1100, skip-gfx1101, skip-gfx1102, skip-gfx1200, skip-gfx1201, skip-gfx940, skip-gfx941] # Only for gfx950 + # need to skip for gfx950: TODO: re-enable this with subtile + MXScaleFormat=1 + marks: [skip-gfx950, skip-gfx1250, skip-gfx900, skip-gfx906, skip-gfx908, skip-gfx90a, skip-gfx942, skip-gfx1010, skip-gfx1011, skip-gfx1012, skip-gfx1030, skip-gfx1100, skip-gfx1101, skip-gfx1102, skip-gfx1200, skip-gfx1201, skip-gfx940, 
skip-gfx941] # Only for gfx950 GlobalParameters: NumElementsToValidate: -1 @@ -16,6 +17,7 @@ GlobalParameters: DataInitTypeAlpha: 1 DataInitTypeBeta: 1 BoundsCheck: 2 + MXScaleFormat: 1 BenchmarkProblems: ######################################## @@ -53,7 +55,8 @@ BenchmarkProblems: - [16, 16, 128, 1, 1, 8, 8, 2, 2] - [32, 32, 64, 1, 1, 4, 4, 2, 2] - [32, 32, 64, 1, 1, 8, 8, 2, 2] - - DepthU: [32, 64, 128] + #- DepthU: [32, 64, 128] + - DepthU: [256, 512] - AssertFree0ElementMultiple: [1] - AssertFree1ElementMultiple: [1] - PrefetchGlobalRead: [2] @@ -76,6 +79,8 @@ BenchmarkProblems: - LdsPadA: [4] - LdsPadB: [4] - WorkGroupMapping: [64] + - StreamK: [0,3] + - UseSubtileImpl: [1] # TODO: enable subtile BenchmarkJoinParameters: BenchmarkFinalParameters: - ProblemSizes: diff --git a/projects/hipblaslt/tensilelite/Tensile/Tests/common/gemm/gfx950/fp8_mxfp4_fp32_tn_act.yaml b/projects/hipblaslt/tensilelite/Tensile/Tests/common/gemm/gfx950/fp8_mxfp4_fp32_tn_act.yaml index 048e28e2cc5..5ee8ef3e310 100644 --- a/projects/hipblaslt/tensilelite/Tensile/Tests/common/gemm/gfx950/fp8_mxfp4_fp32_tn_act.yaml +++ b/projects/hipblaslt/tensilelite/Tensile/Tests/common/gemm/gfx950/fp8_mxfp4_fp32_tn_act.yaml @@ -1,5 +1,6 @@ TestParameters: - marks: [xfail-gfx950, skip-gfx900, skip-gfx906, skip-gfx908, skip-gfx90a, skip-gfx942, skip-gfx1010, skip-gfx1011, skip-gfx1012, skip-gfx1030, skip-gfx1100, skip-gfx1101, skip-gfx1102, skip-gfx1200, skip-gfx1201, skip-gfx1250, skip-gfx940, skip-gfx941] # Only for gfx950 + # need to skip for gfx950: TODO: re-enable this with subtile + MXScaleFormat=1 + marks: [skip-gfx950, skip-gfx900, skip-gfx906, skip-gfx908, skip-gfx90a, skip-gfx942, skip-gfx1010, skip-gfx1011, skip-gfx1012, skip-gfx1030, skip-gfx1100, skip-gfx1101, skip-gfx1102, skip-gfx1200, skip-gfx1201, skip-gfx1250, skip-gfx940, skip-gfx941] # Only for gfx950 GlobalParameters: NumElementsToValidate: -1 @@ -16,6 +17,7 @@ GlobalParameters: DataInitTypeAlpha: 1 DataInitTypeBeta: 1 BoundsCheck: 
2 + MXScaleFormat: 1 BenchmarkProblems: ######################################## @@ -52,7 +54,8 @@ BenchmarkProblems: - [16, 16, 128, 1, 1, 8, 8, 2, 2] - [32, 32, 64, 1, 1, 4, 4, 2, 2] - [32, 32, 64, 1, 1, 8, 8, 2, 2] - - DepthU: [32, 64, 128] + #- DepthU: [32, 64, 128] + - DepthU: [256, 512] - AssertFree0ElementMultiple: [1] - AssertFree1ElementMultiple: [1] - PrefetchGlobalRead: [2] @@ -75,6 +78,8 @@ BenchmarkProblems: - LdsPadA: [4] - LdsPadB: [4] - WorkGroupMapping: [64] + - StreamK: [0,3] + - UseSubtileImpl: [1] # TODO: enable subtile BenchmarkJoinParameters: BenchmarkFinalParameters: - ProblemSizes: diff --git a/projects/hipblaslt/tensilelite/Tensile/Tests/common/gemm/gfx950/fp8_mxfp4_fp32_tn_act_groupgemm.yaml b/projects/hipblaslt/tensilelite/Tensile/Tests/common/gemm/gfx950/fp8_mxfp4_fp32_tn_act_groupgemm.yaml index 7a6ca30a1b4..d067e1f6148 100644 --- a/projects/hipblaslt/tensilelite/Tensile/Tests/common/gemm/gfx950/fp8_mxfp4_fp32_tn_act_groupgemm.yaml +++ b/projects/hipblaslt/tensilelite/Tensile/Tests/common/gemm/gfx950/fp8_mxfp4_fp32_tn_act_groupgemm.yaml @@ -1,5 +1,6 @@ TestParameters: - marks: [xfail-gfx950, skip-gfx900, skip-gfx906, skip-gfx908, skip-gfx90a, skip-gfx942, skip-gfx1010, skip-gfx1011, skip-gfx1012, skip-gfx1030, skip-gfx1100, skip-gfx1101, skip-gfx1102, skip-gfx1200, skip-gfx1201, skip-gfx1250, skip-gfx940, skip-gfx941] # Only for gfx950 + # need to skip for gfx950: TODO: re-enable this with subtile + MXScaleFormat=1 + marks: [skip-gfx950, skip-gfx900, skip-gfx906, skip-gfx908, skip-gfx90a, skip-gfx942, skip-gfx1010, skip-gfx1011, skip-gfx1012, skip-gfx1030, skip-gfx1100, skip-gfx1101, skip-gfx1102, skip-gfx1200, skip-gfx1201, skip-gfx1250, skip-gfx940, skip-gfx941] # Only for gfx950 GlobalParameters: NumElementsToValidate: -1 @@ -16,6 +17,7 @@ GlobalParameters: DataInitTypeAlpha: 1 DataInitTypeBeta: 1 BoundsCheck: 2 + MXScaleFormat: 1 BenchmarkProblems: ######################################## @@ -53,7 +55,8 @@ BenchmarkProblems: - [16, 
16, 128, 1, 1, 8, 8, 2, 2] - [32, 32, 64, 1, 1, 4, 4, 2, 2] - [32, 32, 64, 1, 1, 8, 8, 2, 2] - - DepthU: [32, 64, 128] + #- DepthU: [32, 64, 128] + - DepthU: [256, 512] - AssertFree0ElementMultiple: [1] - AssertFree1ElementMultiple: [1] - PrefetchGlobalRead: [2] @@ -76,6 +79,8 @@ BenchmarkProblems: - LdsPadA: [4] - LdsPadB: [4] - WorkGroupMapping: [64] + - StreamK: [0,3] + - UseSubtileImpl: [1] # TODO: enable subtile BenchmarkJoinParameters: BenchmarkFinalParameters: - ProblemSizes: diff --git a/projects/hipblaslt/tensilelite/Tensile/Tests/common/gemm/gfx950/mx32f4_tn.yaml b/projects/hipblaslt/tensilelite/Tensile/Tests/common/gemm/gfx950/mx32f4_tn.yaml index a0dc563ef3a..72ad5a8fb2a 100644 --- a/projects/hipblaslt/tensilelite/Tensile/Tests/common/gemm/gfx950/mx32f4_tn.yaml +++ b/projects/hipblaslt/tensilelite/Tensile/Tests/common/gemm/gfx950/mx32f4_tn.yaml @@ -28,6 +28,7 @@ GlobalParameters: # PrintTensorC: True # PrintTensorD: True # PrintTensorRef: True + MXScaleFormat: 1 BenchmarkProblems: ######################################## @@ -103,6 +104,8 @@ BenchmarkProblems: # - NumElementsPerBatchStore: [0] # - StorePriorityOpt: [0] # - StaggerU: [0] + - StreamK: [0,3] + - UseSubtileImpl: [True] BenchmarkJoinParameters: BenchmarkFinalParameters: - ProblemSizes: @@ -165,6 +168,7 @@ BenchmarkProblems: - DirectToLds: [0,1,2,3] - Use64bShadowLimitMX: [0,1] - StreamK: [0,3] + - UseSubtileImpl: [True] BenchmarkJoinParameters: BenchmarkFinalParameters: - ProblemSizes: diff --git a/projects/hipblaslt/tensilelite/Tensile/Tests/common/gemm/gfx950/mx32f8_tn.yaml b/projects/hipblaslt/tensilelite/Tensile/Tests/common/gemm/gfx950/mx32f8_tn.yaml index d246f36cdb9..72c30e469c4 100644 --- a/projects/hipblaslt/tensilelite/Tensile/Tests/common/gemm/gfx950/mx32f8_tn.yaml +++ b/projects/hipblaslt/tensilelite/Tensile/Tests/common/gemm/gfx950/mx32f8_tn.yaml @@ -1,5 +1,6 @@ TestParameters: - marks: [skip-gfx1250, skip-gfx900, skip-gfx906, skip-gfx908, skip-gfx90a, skip-gfx940, skip-gfx941, 
skip-gfx942, skip-gfx1010, skip-gfx1011, skip-gfx1012, skip-gfx1030, skip-gfx1100, skip-gfx1101, skip-gfx1102, skip-gfx1200, skip-gfx1201] + # need to skip for gfx950: TODO: re-enable this with subtile + MXScaleFormat=1 + marks: [skip-gfx950, skip-gfx1250, skip-gfx900, skip-gfx906, skip-gfx908, skip-gfx90a, skip-gfx940, skip-gfx941, skip-gfx942, skip-gfx1010, skip-gfx1011, skip-gfx1012, skip-gfx1030, skip-gfx1100, skip-gfx1101, skip-gfx1102, skip-gfx1200, skip-gfx1201] GlobalParameters: NumElementsToValidate: -1 @@ -29,6 +30,7 @@ GlobalParameters: # PrintTensorC: True # PrintTensorD: True # PrintTensorRef: True + MXScaleFormat: 1 BenchmarkProblems: ######################################## @@ -102,6 +104,7 @@ BenchmarkProblems: - StoreVectorWidth: [-1] - SourceSwap: [1] - StreamK: [0,3] + - UseSubtileImpl: [1] # TODO: enable subtile # - NumElementsPerBatchStore: [0] # - StorePriorityOpt: [0] # - StaggerU: [0] diff --git a/projects/hipblaslt/tensilelite/Tensile/Tests/common/gemm/gfx950/mxfp4_fp8_bf16_tn_act.yaml b/projects/hipblaslt/tensilelite/Tensile/Tests/common/gemm/gfx950/mxfp4_fp8_bf16_tn_act.yaml index afe643a7f74..eb88fcf24ad 100644 --- a/projects/hipblaslt/tensilelite/Tensile/Tests/common/gemm/gfx950/mxfp4_fp8_bf16_tn_act.yaml +++ b/projects/hipblaslt/tensilelite/Tensile/Tests/common/gemm/gfx950/mxfp4_fp8_bf16_tn_act.yaml @@ -1,5 +1,6 @@ TestParameters: - marks: [xfail-gfx950, skip-gfx900, skip-gfx906, skip-gfx908, skip-gfx90a, skip-gfx942, skip-gfx1010, skip-gfx1011, skip-gfx1012, skip-gfx1030, skip-gfx1100, skip-gfx1101, skip-gfx1102, skip-gfx1200, skip-gfx1201, skip-gfx1250, skip-gfx940, skip-gfx941] # Only for gfx950 + # need to skip for gfx950: TODO: re-enable this with subtile + MXScaleFormat=1 + marks: [skip-gfx950, skip-gfx900, skip-gfx906, skip-gfx908, skip-gfx90a, skip-gfx942, skip-gfx1010, skip-gfx1011, skip-gfx1012, skip-gfx1030, skip-gfx1100, skip-gfx1101, skip-gfx1102, skip-gfx1200, skip-gfx1201, skip-gfx1250, skip-gfx940, skip-gfx941] # Only 
for gfx950 GlobalParameters: NumElementsToValidate: -1 @@ -16,6 +17,7 @@ GlobalParameters: DataInitTypeAlpha: 1 DataInitTypeBeta: 1 BoundsCheck: 2 + MXScaleFormat: 1 BenchmarkProblems: ######################################## @@ -52,7 +54,8 @@ BenchmarkProblems: - [16, 16, 128, 1, 1, 8, 8, 2, 2] - [32, 32, 64, 1, 1, 4, 4, 2, 2] - [32, 32, 64, 1, 1, 8, 8, 2, 2] - - DepthU: [32, 64, 128] + #- DepthU: [32, 64, 128] + - DepthU: [256, 512] - AssertFree0ElementMultiple: [1] - AssertFree1ElementMultiple: [1] - PrefetchGlobalRead: [2] @@ -75,6 +78,8 @@ BenchmarkProblems: - LdsPadA: [4] - LdsPadB: [4] - WorkGroupMapping: [64] + - StreamK: [0,3] + - UseSubtileImpl: [1] # TODO: enable subtile BenchmarkJoinParameters: BenchmarkFinalParameters: - ProblemSizes: diff --git a/projects/hipblaslt/tensilelite/Tensile/Tests/common/gemm/gfx950/mxfp4_fp8_bf16_tn_act_groupgemm.yaml b/projects/hipblaslt/tensilelite/Tensile/Tests/common/gemm/gfx950/mxfp4_fp8_bf16_tn_act_groupgemm.yaml index 158960fec55..a1d2e1d9c0a 100644 --- a/projects/hipblaslt/tensilelite/Tensile/Tests/common/gemm/gfx950/mxfp4_fp8_bf16_tn_act_groupgemm.yaml +++ b/projects/hipblaslt/tensilelite/Tensile/Tests/common/gemm/gfx950/mxfp4_fp8_bf16_tn_act_groupgemm.yaml @@ -1,5 +1,6 @@ TestParameters: - marks: [xfail-gfx950, skip-gfx900, skip-gfx906, skip-gfx908, skip-gfx90a, skip-gfx942, skip-gfx1010, skip-gfx1011, skip-gfx1012, skip-gfx1030, skip-gfx1100, skip-gfx1101, skip-gfx1102, skip-gfx1200, skip-gfx1201, skip-gfx1250,skip-gfx940, skip-gfx941] # Only for gfx950 + # need to skip for gfx950: TODO: re-enable this with subtile + MXScaleFormat=1 + marks: [skip-gfx950, skip-gfx900, skip-gfx906, skip-gfx908, skip-gfx90a, skip-gfx942, skip-gfx1010, skip-gfx1011, skip-gfx1012, skip-gfx1030, skip-gfx1100, skip-gfx1101, skip-gfx1102, skip-gfx1200, skip-gfx1201, skip-gfx1250,skip-gfx940, skip-gfx941] # Only for gfx950 GlobalParameters: NumElementsToValidate: -1 @@ -16,6 +17,7 @@ GlobalParameters: DataInitTypeAlpha: 1 
DataInitTypeBeta: 1 BoundsCheck: 2 + MXScaleFormat: 1 BenchmarkProblems: ######################################## @@ -53,7 +55,8 @@ BenchmarkProblems: - [16, 16, 128, 1, 1, 8, 8, 2, 2] - [32, 32, 64, 1, 1, 4, 4, 2, 2] - [32, 32, 64, 1, 1, 8, 8, 2, 2] - - DepthU: [32, 64, 128] + #- DepthU: [32, 64, 128] + - DepthU: [256, 512] - AssertFree0ElementMultiple: [1] - AssertFree1ElementMultiple: [1] - PrefetchGlobalRead: [2] @@ -76,6 +79,8 @@ BenchmarkProblems: - LdsPadA: [4] - LdsPadB: [4] - WorkGroupMapping: [64] + - StreamK: [0,3] + - UseSubtileImpl: [1] # TODO: enable subtile BenchmarkJoinParameters: BenchmarkFinalParameters: - ProblemSizes: diff --git a/projects/hipblaslt/tensilelite/Tensile/Tests/common/gemm/gfx950/mxfp4_fp8_fp32_tn_act.yaml b/projects/hipblaslt/tensilelite/Tensile/Tests/common/gemm/gfx950/mxfp4_fp8_fp32_tn_act.yaml index 93d6a3467a9..e9f448feb43 100644 --- a/projects/hipblaslt/tensilelite/Tensile/Tests/common/gemm/gfx950/mxfp4_fp8_fp32_tn_act.yaml +++ b/projects/hipblaslt/tensilelite/Tensile/Tests/common/gemm/gfx950/mxfp4_fp8_fp32_tn_act.yaml @@ -1,5 +1,6 @@ TestParameters: - marks: [xfail-gfx950, skip-gfx900, skip-gfx906, skip-gfx908, skip-gfx90a, skip-gfx942, skip-gfx1010, skip-gfx1011, skip-gfx1012, skip-gfx1030, skip-gfx1100, skip-gfx1101, skip-gfx1102, skip-gfx1200, skip-gfx1201, skip-gfx1250, skip-gfx940, skip-gfx941] # Only for gfx950 + # need to skip for gfx950: TODO: re-enable this with subtile + MXScaleFormat=1 + marks: [skip-gfx950, skip-gfx900, skip-gfx906, skip-gfx908, skip-gfx90a, skip-gfx942, skip-gfx1010, skip-gfx1011, skip-gfx1012, skip-gfx1030, skip-gfx1100, skip-gfx1101, skip-gfx1102, skip-gfx1200, skip-gfx1201, skip-gfx1250, skip-gfx940, skip-gfx941] # Only for gfx950 GlobalParameters: NumElementsToValidate: -1 @@ -16,6 +17,7 @@ GlobalParameters: DataInitTypeAlpha: 1 DataInitTypeBeta: 1 BoundsCheck: 2 + MXScaleFormat: 1 BenchmarkProblems: ######################################## @@ -52,7 +54,8 @@ BenchmarkProblems: - [16, 16, 
128, 1, 1, 8, 8, 2, 2] - [32, 32, 64, 1, 1, 4, 4, 2, 2] - [32, 32, 64, 1, 1, 8, 8, 2, 2] - - DepthU: [32, 64, 128] + #- DepthU: [32, 64, 128] + - DepthU: [256, 512] - AssertFree0ElementMultiple: [1] - AssertFree1ElementMultiple: [1] - PrefetchGlobalRead: [2] @@ -75,6 +78,8 @@ BenchmarkProblems: - LdsPadA: [4] - LdsPadB: [4] - WorkGroupMapping: [64] + - StreamK: [0,3] + - UseSubtileImpl: [1] # TODO: enable subtile BenchmarkJoinParameters: BenchmarkFinalParameters: - ProblemSizes: diff --git a/projects/hipblaslt/tensilelite/Tensile/Tests/common/gemm/gfx950/mxfp4_fp8_fp32_tn_act_groupgemm.yaml b/projects/hipblaslt/tensilelite/Tensile/Tests/common/gemm/gfx950/mxfp4_fp8_fp32_tn_act_groupgemm.yaml index d8dd486672e..2b997138e0e 100644 --- a/projects/hipblaslt/tensilelite/Tensile/Tests/common/gemm/gfx950/mxfp4_fp8_fp32_tn_act_groupgemm.yaml +++ b/projects/hipblaslt/tensilelite/Tensile/Tests/common/gemm/gfx950/mxfp4_fp8_fp32_tn_act_groupgemm.yaml @@ -1,5 +1,6 @@ TestParameters: - marks: [xfail-gfx950, skip-gfx900, skip-gfx906, skip-gfx908, skip-gfx90a, skip-gfx942, skip-gfx1010, skip-gfx1011, skip-gfx1012, skip-gfx1030, skip-gfx1100, skip-gfx1101, skip-gfx1102, skip-gfx1200, skip-gfx1201, skip-gfx1250, skip-gfx940, skip-gfx941] # Only for gfx950 + # need to skip for gfx950: TODO: re-enable this with subtile + MXScaleFormat=1 + marks: [skip-gfx950, skip-gfx900, skip-gfx906, skip-gfx908, skip-gfx90a, skip-gfx942, skip-gfx1010, skip-gfx1011, skip-gfx1012, skip-gfx1030, skip-gfx1100, skip-gfx1101, skip-gfx1102, skip-gfx1200, skip-gfx1201, skip-gfx1250, skip-gfx940, skip-gfx941] # Only for gfx950 GlobalParameters: NumElementsToValidate: -1 @@ -16,6 +17,7 @@ GlobalParameters: DataInitTypeAlpha: 1 DataInitTypeBeta: 1 BoundsCheck: 2 + MXScaleFormat: 1 BenchmarkProblems: ######################################## @@ -53,7 +55,8 @@ BenchmarkProblems: - [16, 16, 128, 1, 1, 8, 8, 2, 2] - [32, 32, 64, 1, 1, 4, 4, 2, 2] - [32, 32, 64, 1, 1, 8, 8, 2, 2] - - DepthU: [32, 64, 128] + #- 
DepthU: [32, 64, 128] + - DepthU: [256, 512] - AssertFree0ElementMultiple: [1] - AssertFree1ElementMultiple: [1] - PrefetchGlobalRead: [2] @@ -76,6 +79,8 @@ BenchmarkProblems: - LdsPadA: [4] - LdsPadB: [4] - WorkGroupMapping: [64] + - StreamK: [0,3] + - UseSubtileImpl: [1] # TODO: enable subtile BenchmarkJoinParameters: BenchmarkFinalParameters: - ProblemSizes: diff --git a/projects/hipblaslt/tensilelite/Tensile/Tests/common/gemm/gfx950/mxfp4_mxfp4_bf16_tn_act.yaml b/projects/hipblaslt/tensilelite/Tensile/Tests/common/gemm/gfx950/mxfp4_mxfp4_bf16_tn_act.yaml index db0024a6a77..b61027ad3cd 100644 --- a/projects/hipblaslt/tensilelite/Tensile/Tests/common/gemm/gfx950/mxfp4_mxfp4_bf16_tn_act.yaml +++ b/projects/hipblaslt/tensilelite/Tensile/Tests/common/gemm/gfx950/mxfp4_mxfp4_bf16_tn_act.yaml @@ -16,6 +16,7 @@ GlobalParameters: DataInitTypeAlpha: 1 DataInitTypeBeta: 1 BoundsCheck: 2 + MXScaleFormat: 1 BenchmarkProblems: ######################################## @@ -51,7 +52,8 @@ BenchmarkProblems: - [16, 16, 128, 1, 1, 8, 8, 2, 2] - [32, 32, 64, 1, 1, 4, 4, 2, 2] - [32, 32, 64, 1, 1, 8, 8, 2, 2] - - DepthU: [32, 64, 128] + #- DepthU: [32, 64, 128] + - DepthU: [256, 512] - AssertFree0ElementMultiple: [1] - AssertFree1ElementMultiple: [1] - PrefetchGlobalRead: [2] @@ -74,6 +76,8 @@ BenchmarkProblems: - LdsPadA: [4] - LdsPadB: [4] - WorkGroupMapping: [64] + - StreamK: [0,3] + - UseSubtileImpl: [1] # TODO: enable subtile BenchmarkJoinParameters: BenchmarkFinalParameters: - ProblemSizes: diff --git a/projects/hipblaslt/tensilelite/Tensile/Tests/common/gemm/gfx950/mxfp4_mxfp4_bf16_tn_act_groupgemm.yaml b/projects/hipblaslt/tensilelite/Tensile/Tests/common/gemm/gfx950/mxfp4_mxfp4_bf16_tn_act_groupgemm.yaml index 56e2900ca82..60e036539af 100644 --- a/projects/hipblaslt/tensilelite/Tensile/Tests/common/gemm/gfx950/mxfp4_mxfp4_bf16_tn_act_groupgemm.yaml +++ b/projects/hipblaslt/tensilelite/Tensile/Tests/common/gemm/gfx950/mxfp4_mxfp4_bf16_tn_act_groupgemm.yaml @@ -1,5 +1,6 @@ 
TestParameters: - marks: [xfail-gfx950, skip-gfx900, skip-gfx906, skip-gfx908, skip-gfx90a, skip-gfx942, skip-gfx1010, skip-gfx1011, skip-gfx1012, skip-gfx1030, skip-gfx1100, skip-gfx1101, skip-gfx1102, skip-gfx1200, skip-gfx1201, skip-gfx1250, skip-gfx940, skip-gfx941] # Only for gfx950 + # need to skip for gfx950: TODO: re-enable this with subtile + MXScaleFormat=1 + marks: [skip-gfx950, skip-gfx900, skip-gfx906, skip-gfx908, skip-gfx90a, skip-gfx942, skip-gfx1010, skip-gfx1011, skip-gfx1012, skip-gfx1030, skip-gfx1100, skip-gfx1101, skip-gfx1102, skip-gfx1200, skip-gfx1201, skip-gfx1250, skip-gfx940, skip-gfx941] # Only for gfx950 GlobalParameters: NumElementsToValidate: -1 @@ -16,6 +17,7 @@ GlobalParameters: DataInitTypeAlpha: 1 DataInitTypeBeta: 1 BoundsCheck: 2 + MXScaleFormat: 1 BenchmarkProblems: ######################################## @@ -52,7 +54,8 @@ BenchmarkProblems: - [16, 16, 128, 1, 1, 8, 8, 2, 2] - [32, 32, 64, 1, 1, 4, 4, 2, 2] - [32, 32, 64, 1, 1, 8, 8, 2, 2] - - DepthU: [32, 64, 128] + #- DepthU: [32, 64, 128] + - DepthU: [256, 512] - AssertFree0ElementMultiple: [1] - AssertFree1ElementMultiple: [1] - PrefetchGlobalRead: [2] @@ -75,6 +78,8 @@ BenchmarkProblems: - LdsPadA: [4] - LdsPadB: [4] - WorkGroupMapping: [64] + #- StreamK: [0,3] + - UseSubtileImpl: [1] # TODO: enable subtile BenchmarkJoinParameters: BenchmarkFinalParameters: - ProblemSizes: @@ -101,12 +106,6 @@ BenchmarkProblems: - Exact: [63, 63, 1, 64] - Exact: [255, 255, 1, 256] - ######################################## - # 5. Odd K - to test tail loop - ######################################## - - Exact: [64, 64, 1, 63] - - Exact: [256, 256, 1, 255] - ######################################## # 6. 
Small size with batch > 1 ######################################## @@ -117,7 +116,6 @@ BenchmarkProblems: ######################################## - Exact: [1000, 1000, 1, 256] - Exact: [1500, 1500, 1, 512] - - Exact: [1024, 1024, 1, 333] - BiasTypeArgs: ['s'] - ActivationArgs: - [Enum: none] diff --git a/projects/hipblaslt/tensilelite/Tensile/Tests/common/gemm/gfx950/mxfp4_mxfp4_fp32_tn_act.yaml b/projects/hipblaslt/tensilelite/Tensile/Tests/common/gemm/gfx950/mxfp4_mxfp4_fp32_tn_act.yaml index cde77b36a14..c3937e3003a 100644 --- a/projects/hipblaslt/tensilelite/Tensile/Tests/common/gemm/gfx950/mxfp4_mxfp4_fp32_tn_act.yaml +++ b/projects/hipblaslt/tensilelite/Tensile/Tests/common/gemm/gfx950/mxfp4_mxfp4_fp32_tn_act.yaml @@ -17,6 +17,7 @@ GlobalParameters: DataInitTypeAlpha: 1 DataInitTypeBeta: 1 BoundsCheck: 2 + MXScaleFormat: 1 BenchmarkProblems: ######################################## @@ -75,6 +76,8 @@ BenchmarkProblems: - LdsPadA: [4] - LdsPadB: [4] - WorkGroupMapping: [64] + - StreamK: [0,3] + - UseSubtileImpl: [1] # TODO: enable subtile BenchmarkJoinParameters: BenchmarkFinalParameters: - ProblemSizes: @@ -111,6 +114,7 @@ BenchmarkProblems: # 6. Small size with batch > 1 ######################################## - Exact: [32, 32, 8, 32] + - Exact: [32, 32, 8, 256] ######################################## # 7. 
Medium size that doesn't divide evenly on CUs (stream-k) diff --git a/projects/hipblaslt/tensilelite/Tensile/Tests/common/gemm/gfx950/mxfp4_mxfp4_fp32_tn_act_groupgemm.yaml b/projects/hipblaslt/tensilelite/Tensile/Tests/common/gemm/gfx950/mxfp4_mxfp4_fp32_tn_act_groupgemm.yaml index d638c6880c0..65b2e2cbbe9 100644 --- a/projects/hipblaslt/tensilelite/Tensile/Tests/common/gemm/gfx950/mxfp4_mxfp4_fp32_tn_act_groupgemm.yaml +++ b/projects/hipblaslt/tensilelite/Tensile/Tests/common/gemm/gfx950/mxfp4_mxfp4_fp32_tn_act_groupgemm.yaml @@ -1,5 +1,6 @@ TestParameters: - marks: [skip-gfx1250, skip-gfx900, skip-gfx906, skip-gfx908, skip-gfx90a, skip-gfx942, skip-gfx1010, skip-gfx1011, skip-gfx1012, skip-gfx1030, skip-gfx1100, skip-gfx1101, skip-gfx1102, skip-gfx1200, skip-gfx1201, skip-gfx940, skip-gfx941] # Only for gfx950 + # need to skip for gfx950: TODO: re-enable this with subtile + MXScaleFormat=1 + marks: [skip-gfx950, skip-gfx1250, skip-gfx900, skip-gfx906, skip-gfx908, skip-gfx90a, skip-gfx942, skip-gfx1010, skip-gfx1011, skip-gfx1012, skip-gfx1030, skip-gfx1100, skip-gfx1101, skip-gfx1102, skip-gfx1200, skip-gfx1201, skip-gfx940, skip-gfx941] # Only for gfx950 GlobalParameters: NumElementsToValidate: -1 @@ -16,6 +17,7 @@ GlobalParameters: DataInitTypeAlpha: 1 DataInitTypeBeta: 1 BoundsCheck: 2 + MXScaleFormat: 1 BenchmarkProblems: ######################################## @@ -52,7 +54,8 @@ BenchmarkProblems: - [16, 16, 128, 1, 1, 8, 8, 2, 2] - [32, 32, 64, 1, 1, 4, 4, 2, 2] - [32, 32, 64, 1, 1, 8, 8, 2, 2] - - DepthU: [32, 64, 128] + #- DepthU: [32, 64, 128] + - DepthU: [256, 512] - AssertFree0ElementMultiple: [1] - AssertFree1ElementMultiple: [1] - PrefetchGlobalRead: [2] @@ -75,6 +78,8 @@ BenchmarkProblems: - LdsPadA: [4] - LdsPadB: [4] - WorkGroupMapping: [64] + #- StreamK: [0,3] + - UseSubtileImpl: [1] # TODO: enable subtile BenchmarkJoinParameters: BenchmarkFinalParameters: - ProblemSizes: @@ -111,6 +116,7 @@ BenchmarkProblems: # 6. 
Small size with batch > 1 ######################################## - Exact: [32, 32, 8, 32] + - Exact: [32, 32, 8, 256] ######################################## # 7. Medium size that doesn't divide evenly on CUs (stream-k) diff --git a/projects/hipblaslt/tensilelite/Tensile/Tests/common/gemm/gfx950/subtile_bf16.yaml b/projects/hipblaslt/tensilelite/Tensile/Tests/common/gemm/gfx950/subtile_bf16.yaml new file mode 100644 index 00000000000..d87489af98b --- /dev/null +++ b/projects/hipblaslt/tensilelite/Tensile/Tests/common/gemm/gfx950/subtile_bf16.yaml @@ -0,0 +1,421 @@ +TestParameters: + marks: [skip-gfx900, skip-gfx906, skip-gfx908, skip-gfx90a, skip-gfx940, skip-gfx941, skip-gfx942, skip-gfx1010, skip-gfx1011, skip-gfx1012, skip-gfx1030, skip-gfx1100, skip-gfx1101, skip-gfx1102, skip-gfx1200, skip-gfx1201, skip-gfx1250] # not supported by arch + +GlobalParameters: + NumElementsToValidate: -1 + MinimumRequiredVersion: 5.0.0 + PrintLevel: 3 + Device: 0 + CMakeBuildType: Release + MergeFiles: False + KernelTime: True + MaxWorkspaceSize: 13421772800 + DataInitTypeA: 3 + DataInitTypeB: 3 + DataInitTypeC: 3 + DataInitTypeAlpha: 1 + DataInitTypeBeta: 2 + DataInitTypeBias: 1 + DataInitTypeMXSA: 3 + DataInitTypeMXSB: 3 + MXScaleFormat: 1 + BoundsCheck: 0 + KeepBuildTmp: True + DeviceLDS: 163840 + MaxLDS: 163840 + #PrintSolutionRejectionReason: True + #GenerateSourcesAndExit: False + CpuThreads: 1 + RotatingBufferSize: 0 + +BenchmarkProblems: + ######################################## + # BF16 TN — no bias, DU = 2*MI_K = 64 + # + # Full wave tile coverage: + # - Symmetric (2x2, 4x4, 8x8 wavetile) + # - Asymmetric (6x4, 2x6, 8x6 etc.) + # - Different wave groups (1x1, 2x2, 4x1, 1x4) + # - Odd wave tile sizes (3x4, 6x2, 3x2, 2x3, 2x9, 9x2) + # + # Problem sizes: K is always a multiple of DU=64. + # Covers full-tile (M,N mult of macro tile) and edge cases + # (M,N mult of 32 but NOT of macro tile). 
+ ######################################## + - + - # ProblemType + OperationType: GEMM + DataType: b + DestDataType: b + ComputeDataType: s + HighPrecisionAccumulate: True + TransposeA: 1 + TransposeB: 0 + UseBeta: True + Batched: True + ActivationFuncCall: True + + - InitialSolutionParameters: + BenchmarkCommonParameters: + - KernelLanguage: ["Assembly"] + ForkParameters: + - MatrixInstruction: + # 1x1 wave group + - [16, 16, 32, 1, 1, 2, 2, 1, 1] # MT 32x32 + - [16, 16, 32, 1, 1, 4, 2, 1, 1] # MT 64x32 + # 2x2 wave group — symmetric + - [16, 16, 32, 1, 1, 1, 1, 2, 2] # MT 32x32 + - [16, 16, 32, 1, 1, 4, 4, 2, 2] # MT 128x128 + - [16, 16, 32, 1, 1, 8, 8, 2, 2] # MT 256x256 + - [16, 16, 32, 1, 1, 12, 8, 2, 2] # MT 384x256 (large asymmetric) + - [16, 16, 32, 1, 1, 10, 10, 2, 2] # MT 320x320 (large symmetric) + - [16, 16, 32, 1, 1, 8, 12, 2, 2] # MT 256x384 (large asymmetric) + # 2x2 wave group — large tiles at/near LDS limit + - [16, 16, 32, 1, 1, 14, 6, 2, 2] # MT 448x192 (160KB) + - [16, 16, 32, 1, 1, 6, 14, 2, 2] # MT 192x448 (160KB) + - [16, 16, 32, 1, 1, 16, 4, 2, 2] # MT 512x128 (160KB) + - [16, 16, 32, 1, 1, 4, 16, 2, 2] # MT 128x512 (160KB) + # Bug: DataInitTypeBeta:2 produces incorrect results for these large tiles + #- [16, 16, 32, 1, 1, 9, 11, 2, 2] # MT 288x352 (160KB) + #- [16, 16, 32, 1, 1, 11, 9, 2, 2] # MT 352x288 (160KB) + - [16, 16, 32, 1, 1, 9, 9, 2, 2] # MT 288x288 (144KB, large symmetric) + - [16, 16, 32, 1, 1, 14, 4, 2, 2] # MT 448x128 (144KB) + - [16, 16, 32, 1, 1, 12, 6, 2, 2] # MT 384x192 (144KB) + # 2x2 wave group — asymmetric + - [16, 16, 32, 1, 1, 3, 4, 2, 2] # MT 96x128 (odd M wavetile) + - [16, 16, 32, 1, 1, 8, 6, 2, 2] # MT 256x192 (odd N wavetile) + - [16, 16, 32, 1, 1, 6, 2, 2, 2] # MT 192x64 (odd M wavetile) + - [16, 16, 32, 1, 1, 2, 6, 2, 2] # MT 64x192 (odd N wavetile) + - [16, 16, 32, 1, 1, 6, 4, 2, 2] # MT 192x128 (odd M wavetile) + # 4x1 wave group + - [16, 16, 32, 1, 1, 2, 2, 4, 1] # MT 128x32 + - [16, 16, 32, 1, 1, 2, 6, 4, 
1] # MT 128x96 (odd N wavetile) + - [16, 16, 32, 1, 1, 6, 4, 4, 1] # MT 384x64 (odd M wavetile) + # 4x1 wave group — large tiles at/near LDS limit + - [16, 16, 32, 1, 1, 8, 8, 4, 1] # MT 512x128 (160KB) + - [16, 16, 32, 1, 1, 7, 12, 4, 1] # MT 448x192 (160KB) + - [16, 16, 32, 1, 1, 6, 16, 4, 1] # MT 384x256 (160KB) + - [16, 16, 32, 1, 1, 9, 4, 4, 1] # MT 576x64 (160KB) + # 1x4 wave group + - [16, 16, 32, 1, 1, 2, 6, 1, 4] # MT 32x384 (odd N wavetile) + - [16, 16, 32, 1, 1, 6, 4, 1, 4] # MT 96x256 (odd M wavetile) + # 1x4 wave group — large tiles at/near LDS limit + - [16, 16, 32, 1, 1, 8, 8, 1, 4] # MT 128x512 (160KB) + - [16, 16, 32, 1, 1, 12, 7, 1, 4] # MT 192x448 (160KB) + - [16, 16, 32, 1, 1, 16, 6, 1, 4] # MT 256x384 (160KB) + - [16, 16, 32, 1, 1, 4, 9, 1, 4] # MT 64x576 (160KB) + # Odd wave tile combinations + - [16, 16, 32, 1, 1, 3, 2, 2, 2] # MT 96x64 (3x2 wavetile) + - [16, 16, 32, 1, 1, 2, 3, 2, 2] # MT 64x96 (2x3 wavetile) + - [16, 16, 32, 1, 1, 2, 9, 4, 1] # MT 128x144 (very asymmetric N) + - [16, 16, 32, 1, 1, 9, 2, 1, 4] # MT 144x128 (very asymmetric M) + - PrefetchGlobalRead: [0, 2] + - PrefetchLocalRead: [0] + - DepthU: [64] # 2*MI_K = 2*32 + - ScheduleIterAlg: [3] + - DirectToLds: [1] + - StreamK: [0, 3] + - StaggerU: [0] + - UseSubtileImpl: [1] + BenchmarkJoinParameters: + BenchmarkFinalParameters: + - ProblemSizes: + # Full-tile sizes (K = 1×DU .. 
4×DU) + - Exact: [128, 128, 1, 64] # K = 1*DU (minimum) + - Exact: [128, 128, 1, 128] # K = 2*DU + - Exact: [256, 256, 1, 192] # K = 3*DU + - Exact: [256, 256, 1, 256] # K = 4*DU + # Mult-of-32 edges: M,N not multiples of macro tile + - Exact: [96, 96, 1, 128] # 3*32, edge for MT >= 128 + - Exact: [160, 160, 1, 128] # 5*32, edge for MT >= 192 + - Exact: [224, 224, 1, 128] # 7*32, edge for MT >= 256 + - Exact: [288, 288, 1, 128] # 9*32, edge for MT >= 320 + - Exact: [352, 352, 1, 128] # 11*32, edge for MT >= 384 + - Exact: [416, 416, 1, 128] # 13*32, edge for MT >= 448 + - Exact: [480, 480, 1, 128] # 15*32, edge for MT >= 512 + - Exact: [544, 544, 1, 128] # 17*32, edge for MT >= 576 + # Full-tile sizes for large macro tiles + - Exact: [384, 256, 1, 128] # full tile for MT 384x256 + - Exact: [320, 320, 1, 128] # full tile for MT 320x320 + - Exact: [256, 384, 1, 128] # full tile for MT 256x384 + - Exact: [448, 192, 1, 128] # full tile for MT 448x192 + - Exact: [192, 448, 1, 128] # full tile for MT 192x448 + - Exact: [512, 128, 1, 128] # full tile for MT 512x128 + - Exact: [128, 512, 1, 128] # full tile for MT 128x512 + - Exact: [288, 352, 1, 128] # full tile for MT 288x352 + - Exact: [352, 288, 1, 128] # full tile for MT 352x288 + - Exact: [288, 288, 1, 128] # full tile for MT 288x288 + - Exact: [448, 128, 1, 128] # full tile for MT 448x128 + - Exact: [384, 192, 1, 128] # full tile for MT 384x192 + - Exact: [576, 64, 1, 128] # full tile for MT 576x64 + - Exact: [ 64, 576, 1, 128] # full tile for MT 64x576 + # Asymmetric M vs N + - Exact: [96, 256, 1, 128] # M edge, N full tile + - Exact: [256, 96, 1, 128] # M full tile, N edge + # Small targeted ranges: odd M,N not aligned to 32 (no BF16 alignment restriction) + # Range format: [start, step, stop]; values: start, start+step, ... 
+ - Range: [[17, 32, 113], [17, 32, 113], [1], [128]] # 17,49,81,113 — 1 above odd multiples of 16 (17 past a 32-boundary) + - Range: [[33, 64, 161], [33, 64, 161], [1], [128]] # 33,97,161 — 1 above odd multiples of 32 + + ######################################## + # BF16 TN — no bias, large DU (DU = 128, 192) + # + # Tests deeper unroll depths with a representative subset of wave tile configs. + # K is restricted to multiples of LCM(128, 192) = 384, ensuring it is a + # valid multiple of every DU value in the fork (no tail loop needed). + ######################################## + - + - # ProblemType + OperationType: GEMM + DataType: b + DestDataType: b + ComputeDataType: s + HighPrecisionAccumulate: True + TransposeA: 1 + TransposeB: 0 + UseBeta: True + Batched: True + ActivationFuncCall: True + + - InitialSolutionParameters: + BenchmarkCommonParameters: + - KernelLanguage: ["Assembly"] + ForkParameters: + - MatrixInstruction: + - [16, 16, 32, 1, 1, 4, 4, 2, 2] # MT 128x128 + - [16, 16, 32, 1, 1, 8, 8, 2, 2] # MT 256x256 + - [16, 16, 32, 1, 1, 6, 4, 2, 2] # MT 192x128 (asymmetric) + - [16, 16, 32, 1, 1, 2, 6, 4, 1] # MT 128x96 (4x1 wave group) + - PrefetchGlobalRead: [0, 2] + - PrefetchLocalRead: [0] + - DepthU: [128, 192] # > 2*MI_K; K must be mult of LCM(128,192)=384 + - ScheduleIterAlg: [3] + - DirectToLds: [1] + - StreamK: [0, 3] + - StaggerU: [0] + - UseSubtileImpl: [1] + BenchmarkJoinParameters: + BenchmarkFinalParameters: + - ProblemSizes: + # Full tiles, K = multiples of LCM(128, 192) = 384 + - Exact: [128, 128, 1, 384] + - Exact: [256, 256, 1, 384] + # Mult-of-32 edges: M,N not multiples of macro tile + - Exact: [96, 96, 1, 384] # 3*32, edge for MT >= 128 + - Exact: [160, 160, 1, 384] # 5*32, edge for MT >= 192 + - Exact: [224, 224, 1, 384] # 7*32, edge for MT >= 256 + # Targeted ranges for odd M,N (BF16: no alignment restriction) + - Range: [[17, 32, 113], [17, 32, 113], [1], [384]] # 17,49,81,113 + - Range: [[33, 64, 161], [33, 64, 161], [1], [384]] # 33,97,161 + + 
######################################## + # BSS TN — BF16 input, F32 output, DU = 2*MI_K = 64 + # + # Reduced wavetile coverage; same problem size coverage as BF16 main group. + ######################################## + - + - # ProblemType + OperationType: GEMM + DataType: b + DestDataType: s + ComputeDataType: s + HighPrecisionAccumulate: True + TransposeA: 1 + TransposeB: 0 + UseBeta: True + Batched: True + ActivationFuncCall: True + + - InitialSolutionParameters: + BenchmarkCommonParameters: + - KernelLanguage: ["Assembly"] + ForkParameters: + - MatrixInstruction: + - [16, 16, 32, 1, 1, 2, 2, 1, 1] # MT 32x32 (1x1 WG) + - [16, 16, 32, 1, 1, 4, 4, 2, 2] # MT 128x128 + - [16, 16, 32, 1, 1, 8, 8, 2, 2] # MT 256x256 + - [16, 16, 32, 1, 1, 3, 4, 2, 2] # MT 96x128 (odd M wavetile) + - [16, 16, 32, 1, 1, 6, 4, 2, 2] # MT 192x128 (asymmetric) + - [16, 16, 32, 1, 1, 2, 6, 4, 1] # MT 128x96 (4x1 WG) + - PrefetchGlobalRead: [0, 2] + - PrefetchLocalRead: [0] + - DepthU: [64] # 2*MI_K = 2*32 + - ScheduleIterAlg: [3] + - DirectToLds: [1] + - StreamK: [0, 3] + - StaggerU: [0] + - UseSubtileImpl: [1] + BenchmarkJoinParameters: + BenchmarkFinalParameters: + - ProblemSizes: + # Full-tile sizes + - Exact: [128, 128, 1, 128] + - Exact: [256, 256, 1, 256] + # Mult-of-32 edges + - Exact: [96, 96, 1, 128] + - Exact: [160, 160, 1, 128] + - Exact: [224, 224, 1, 128] + # Asymmetric + - Exact: [96, 256, 1, 128] + - Exact: [256, 96, 1, 128] + # Odd M,N (no alignment restriction for BF16/BSS) + - Range: [[17, 32, 113], [17, 32, 113], [1], [128]] + - Range: [[33, 64, 161], [33, 64, 161], [1], [128]] + + ######################################## + # BF16 TN — bias epilogue, BF16 output (BBS) + # + # Covers even and odd wavetile sizes, both StreamK=0 and SK=3. + # Tests f32 bias with none/relu/gelu activations. + # Full-tile and edge sizes. 
+ ######################################## + - + - # ProblemType + OperationType: GEMM + DataType: b + DestDataType: b + ComputeDataType: s + HighPrecisionAccumulate: True + TransposeA: 1 + TransposeB: 0 + UseBeta: True + Batched: True + Activation: True + ActivationType: hipblaslt_all + UseBias: 1 + BiasDataTypeList: [s] + + - InitialSolutionParameters: + BenchmarkCommonParameters: + - KernelLanguage: ["Assembly"] + ForkParameters: + - MatrixInstruction: + # Even wavetile sizes + - [16, 16, 32, 1, 1, 4, 4, 2, 2] # MT 128x128 + - [16, 16, 32, 1, 1, 8, 8, 2, 2] # MT 256x256 + - [16, 16, 32, 1, 1, 2, 6, 4, 1] # MT 128x96 (4x1 wave group) + # Odd M wavetile (exercises NonEdge guard ceiling division) + - [16, 16, 32, 1, 1, 3, 4, 2, 2] # MT 96x128 (waveGroupM=48) + - [16, 16, 32, 1, 1, 6, 4, 2, 2] # MT 192x128 (waveGroupM=96) + - PrefetchGlobalRead: [0,2] + - PrefetchLocalRead: [0] + - DepthU: [64] + - ScheduleIterAlg: [3] + - DirectToLds: [1] + - StreamK: [0, 3] + - StaggerU: [0] + - UseSubtileImpl: [1] + BenchmarkJoinParameters: + BenchmarkFinalParameters: + - ProblemSizes: + # Full-tile size + - Exact: [128, 128, 1, 128] + # Edge: M,N not multiple of macro tile + - Exact: [160, 160, 1, 128] + - BiasTypeArgs: ['s'] + - ActivationArgs: + - [Enum: none] + - [Enum: relu] + - [Enum: gelu] + + ######################################## + # BF16 TN — all hipblaslt activations (BBS) with ScaleAlphaVec + # + # Exercises every activation function supported by hipblaslt_all: + # none, relu, gelu, sigmoid, silu, clamp, dgelu, drelu. + # Small matrix selection and single problem size to keep runtime short. 
+ ######################################## + - + - # ProblemType + OperationType: GEMM + DataType: b + DestDataType: b + ComputeDataType: s + HighPrecisionAccumulate: True + TransposeA: 1 + TransposeB: 0 + UseBeta: True + Batched: True + Activation: True + ActivationType: hipblaslt_all + UseBias: 1 + BiasDataTypeList: [s] + UseScaleAlphaVec: 1 + + - InitialSolutionParameters: + BenchmarkCommonParameters: + - KernelLanguage: ["Assembly"] + ForkParameters: + - MatrixInstruction: + - [16, 16, 32, 1, 1, 4, 4, 2, 2] # MT 128x128 + - [16, 16, 32, 1, 1, 3, 4, 2, 2] # MT 96x128 (odd M wavetile) + - PrefetchGlobalRead: [0,2] + - PrefetchLocalRead: [0] + - DepthU: [64] + - ScheduleIterAlg: [3] + - DirectToLds: [1] + - StreamK: [3] + - StaggerU: [0] + - UseSubtileImpl: [1] + BenchmarkJoinParameters: + BenchmarkFinalParameters: + - ProblemSizes: + - Exact: [128, 128, 1, 128] + - BiasTypeArgs: ['s'] + - ActivationArgs: + - [Enum: none] + - [Enum: relu] + - [Enum: gelu] + - [Enum: sigmoid] + - [Enum: silu] + - [Enum: clamp] + - [Enum: dgelu] + - [Enum: drelu] + + ######################################## + # BF16 TN — bias epilogue, F32 output (BSS) + # + # mBlockSize=16 for F32 dest; covers even and odd wavetile. + # StreamK=0 and SK=3. Full-tile and edge sizes. 
+ ######################################## + - + - # ProblemType + OperationType: GEMM + DataType: b + DestDataType: s + ComputeDataType: s + HighPrecisionAccumulate: True + TransposeA: 1 + TransposeB: 0 + UseBeta: True + Batched: True + Activation: True + ActivationType: hipblaslt_all + UseBias: 1 + BiasDataTypeList: [s] + + - InitialSolutionParameters: + BenchmarkCommonParameters: + - KernelLanguage: ["Assembly"] + ForkParameters: + - MatrixInstruction: + - [16, 16, 32, 1, 1, 4, 4, 2, 2] # MT 128x128 + - [16, 16, 32, 1, 1, 8, 8, 2, 2] # MT 256x256 + - [16, 16, 32, 1, 1, 3, 4, 2, 2] # MT 96x128 (odd M wavetile) + - PrefetchGlobalRead: [0] + - PrefetchLocalRead: [0] + - DepthU: [64] + - ScheduleIterAlg: [3] + - DirectToLds: [1] + - StreamK: [0, 3] + - StaggerU: [0] + - UseSubtileImpl: [1] + BenchmarkJoinParameters: + BenchmarkFinalParameters: + - ProblemSizes: + - Exact: [128, 128, 1, 128] + - Exact: [160, 160, 1, 128] + - BiasTypeArgs: ['s'] + - ActivationArgs: + - [Enum: none] + - [Enum: relu] diff --git a/projects/hipblaslt/tensilelite/Tensile/Tests/common/gemm/gfx950/subtile_mxfp4.yaml b/projects/hipblaslt/tensilelite/Tensile/Tests/common/gemm/gfx950/subtile_mxfp4.yaml new file mode 100644 index 00000000000..be6b1e1d63d --- /dev/null +++ b/projects/hipblaslt/tensilelite/Tensile/Tests/common/gemm/gfx950/subtile_mxfp4.yaml @@ -0,0 +1,705 @@ +TestParameters: + marks: [skip-gfx900, skip-gfx906, skip-gfx908, skip-gfx90a, skip-gfx940, skip-gfx941, skip-gfx942, skip-gfx1010, skip-gfx1011, skip-gfx1012, skip-gfx1030, skip-gfx1100, skip-gfx1101, skip-gfx1102, skip-gfx1200, skip-gfx1201, skip-gfx1250] # not supported by arch + +GlobalParameters: + NumElementsToValidate: -1 + MinimumRequiredVersion: 5.0.0 + PrintLevel: 3 + Device: 0 + CMakeBuildType: Release + MergeFiles: False + KernelTime: True + MaxWorkspaceSize: 13421772800 + DataInitTypeA: 3 + DataInitTypeB: 3 + DataInitTypeC: 0 + DataInitTypeAlpha: 1 + DataInitTypeBeta: 0 + DataInitTypeBias: 1 + DataInitTypeMXSA: 3 
+ DataInitTypeMXSB: 3 + MXScaleFormat: 1 + BoundsCheck: 0 + KeepBuildTmp: True + DeviceLDS: 163840 + MaxLDS: 163840 + #PrintSolutionRejectionReason: True + #GenerateSourcesAndExit: False + CpuThreads: 1 + RotatingBufferSize: 0 + +BenchmarkProblems: + ######################################## + # FP4 TN — no bias, DU = 2*MI_K = 256 + # + # Same wave tile coverage as BF16 (minus BF16-specific odd additions). + # K is always a multiple of DU=256. + # M,N: multiples of 64 (full tiles/edges), multiples of 16, odd, and + # other non-aligned values to exercise scale padding across granularities. + ######################################## + - + - # ProblemType + OperationType: GEMM + DataType: F4 + DestDataType: b + ComputeDataType: s + HighPrecisionAccumulate: True + MXBlockA: 32 + MXBlockB: 32 + TransposeA: 1 + TransposeB: 0 + UseBeta: True + Batched: True + ActivationFuncCall: True + + - InitialSolutionParameters: + BenchmarkCommonParameters: + - KernelLanguage: ["Assembly"] + ForkParameters: + - MatrixInstruction: + # 1x1 wave group + - [16, 16, 128, 1, 1, 2, 2, 1, 1] # MT 32x32 + - [16, 16, 128, 1, 1, 4, 2, 1, 1] # MT 64x32 + # 2x2 wave group — symmetric + - [16, 16, 128, 1, 1, 2, 2, 2, 2] # MT 64x64 + - [16, 16, 128, 1, 1, 4, 4, 2, 2] # MT 128x128 + - [16, 16, 128, 1, 1, 8, 8, 2, 2] # MT 256x256 + # 2x2 wave group — asymmetric + - [16, 16, 128, 1, 1, 6, 8, 2, 2] # MT 192x256 + - [16, 16, 128, 1, 1, 8, 6, 2, 2] # MT 256x192 + - [16, 16, 128, 1, 1, 6, 2, 2, 2] # MT 192x64 + - [16, 16, 128, 1, 1, 2, 6, 2, 2] # MT 64x192 + - [16, 16, 128, 1, 1, 6, 4, 2, 2] # MT 192x128 + # 4x1 wave group + - [16, 16, 128, 1, 1, 2, 2, 4, 1] # MT 128x32 + - [16, 16, 128, 1, 1, 2, 6, 4, 1] # MT 128x96 + - [16, 16, 128, 1, 1, 6, 4, 4, 1] # MT 384x64 + # 1x4 wave group + - [16, 16, 128, 1, 1, 2, 6, 1, 4] # MT 32x384 + - [16, 16, 128, 1, 1, 6, 4, 1, 4] # MT 96x256 + - PrefetchGlobalRead: [0, 2] + - PrefetchLocalRead: [0] + - DepthU: [256] # 2*MI_K = 2*128 + - ScheduleIterAlg: [3] + - 
DirectToLds: [1] + - StreamK: [0, 3] + - StaggerU: [0] + - UseSubtileImpl: [1] + BenchmarkJoinParameters: + BenchmarkFinalParameters: + - ProblemSizes: + # Full-tile sizes (K = 1×DU, 2×DU) + - Exact: [128, 128, 1, 256] # K = 1*DU + - Exact: [256, 256, 1, 512] # K = 2*DU + # Edge cases: M,N = k*64 (FP4 granularity) but not multiples of macro tile + - Exact: [64, 64, 1, 256] # 1*64, edge for MT >= 96 + - Exact: [192, 192, 1, 256] # 3*64, edge for MT=128,256,384 + - Exact: [320, 320, 1, 256] # 5*64, edge for MT=128,192,256,384 + - Exact: [448, 448, 1, 256] # 7*64, edge for MT=128,192,256,384 + # Asymmetric edge cases + - Exact: [192, 256, 1, 256] # M edge, N full tile + - Exact: [256, 192, 1, 256] # M full tile, N edge + # Non-multiple-of-32 M,N (multiples of 16) + - Exact: [48, 48, 1, 256] # 3*16, both M,N non-mult-of-32 + - Exact: [112, 176, 1, 256] # 7*16, 11*16 + - Exact: [48, 128, 1, 256] # M non-mult-of-32, N full tile + - Exact: [128, 48, 1, 256] # M full tile, N non-mult-of-32 + - Exact: [240, 112, 1, 256] # 15*16, 7*16 + # Even non-multiple-of-16 M,N + - Exact: [10, 34, 1, 256] # both even, neither mult-of-16 + - Exact: [50, 100, 1, 256] # 50=2*25, 100=4*25 + - Exact: [66, 128, 1, 256] # M even non-mult-of-16, N full tile + - Exact: [128, 66, 1, 256] # M full tile, N even non-mult-of-16 + # Odd M,N + - Exact: [17, 33, 1, 256] # both odd + - Exact: [63, 63, 1, 256] # both odd, just below 64 + - Exact: [97, 129, 1, 256] # both odd, just above 96/128 + - Exact: [33, 128, 1, 256] # M odd, N full tile + - Exact: [128, 33, 1, 256] # M full tile, N odd + - Exact: [3, 5, 1, 256] # very small odd + # Batched (batch_count > 1) + - Exact: [128, 128, 2, 256] # batch=2, full tile + - Exact: [256, 256, 3, 256] # batch=3, full tile + - Exact: [192, 192, 2, 256] # batch=2, edge case + - Exact: [48, 48, 2, 256] # batch=2, non-mult-of-32 + - Exact: [33, 65, 2, 256] # batch=2, both odd + + ######################################## + # FP4 TN — no bias, large DU (DU = 512) + # + 
# Tests deeper unroll depth. K restricted to multiples of 512. + # M,N: multiples of 64 (full tiles/edges), multiples of 16, odd, and + # other non-aligned values to exercise scale padding across granularities. + ######################################## + - + - # ProblemType + OperationType: GEMM + DataType: F4 + DestDataType: b + ComputeDataType: s + HighPrecisionAccumulate: True + MXBlockA: 32 + MXBlockB: 32 + TransposeA: 1 + TransposeB: 0 + UseBeta: True + Batched: True + ActivationFuncCall: True + + - InitialSolutionParameters: + BenchmarkCommonParameters: + - KernelLanguage: ["Assembly"] + ForkParameters: + - MatrixInstruction: + - [16, 16, 128, 1, 1, 4, 4, 2, 2] # MT 128x128 + - [16, 16, 128, 1, 1, 8, 8, 2, 2] # MT 256x256 + - [16, 16, 128, 1, 1, 6, 4, 2, 2] # MT 192x128 (asymmetric) + - [16, 16, 128, 1, 1, 2, 6, 4, 1] # MT 128x96 (4x1 wave group) + - PrefetchGlobalRead: [0, 2] + - PrefetchLocalRead: [1] + - DepthU: [512] # 4*MI_K; K must be mult of 512 + - ScheduleIterAlg: [3] + - DirectToLds: [1] + - StreamK: [0, 3] + - StaggerU: [0] + - UseSubtileImpl: [1] + BenchmarkJoinParameters: + BenchmarkFinalParameters: + - ProblemSizes: + # Full tiles, K = multiples of DU=512 + - Exact: [128, 128, 1, 512] + - Exact: [256, 256, 1, 1024] + # Edge cases: M,N = k*64 (FP4 granularity) but not multiples of macro tile + - Exact: [64, 64, 1, 512] # 1*64, edge for MT >= 96 + - Exact: [192, 192, 1, 512] # 3*64, edge for MT=128,256,384 + - Exact: [320, 320, 1, 512] # 5*64, edge for MT=128,192,256,384 + # Asymmetric edge case + - Exact: [192, 256, 1, 512] # M edge, N full tile + # Non-multiple-of-32 M,N (multiples of 16) + - Exact: [48, 48, 1, 512] # 3*16, both M,N non-mult-of-32 + - Exact: [112, 176, 1, 512] # 7*16, 11*16 + # Even non-multiple-of-16 M,N + - Exact: [50, 100, 1, 512] # 50=2*25, 100=4*25 + - Exact: [66, 66, 1, 512] # both even, neither mult-of-16 + # Odd M,N + - Exact: [17, 33, 1, 512] # both odd + - Exact: [63, 63, 1, 512] # both odd, just below 64 + - Exact: 
[97, 129, 1, 512] # both odd, just above 96/128 + # Batched (batch_count > 1) + - Exact: [128, 128, 2, 512] # batch=2, full tile + - Exact: [63, 63, 2, 512] # batch=2, both odd + + ######################################## + # F4HS TN — MX FP4 input, F16 output, DU = 2*MI_K = 256 + # + # Reduced wavetile coverage; same problem size coverage as FP4 main group. + ######################################## + - + - # ProblemType + OperationType: GEMM + DataType: F4 + DestDataType: h + ComputeDataType: s + HighPrecisionAccumulate: True + MXBlockA: 32 + MXBlockB: 32 + TransposeA: 1 + TransposeB: 0 + UseBeta: True + Batched: True + ActivationFuncCall: True + + - InitialSolutionParameters: + BenchmarkCommonParameters: + - KernelLanguage: ["Assembly"] + ForkParameters: + - MatrixInstruction: + - [16, 16, 128, 1, 1, 2, 2, 1, 1] # MT 32x32 (1x1 WG) + - [16, 16, 128, 1, 1, 4, 4, 2, 2] # MT 128x128 + - [16, 16, 128, 1, 1, 8, 8, 2, 2] # MT 256x256 + - [16, 16, 128, 1, 1, 6, 4, 2, 2] # MT 192x128 (asymmetric) + - [16, 16, 128, 1, 1, 2, 6, 4, 1] # MT 128x96 (4x1 WG) + - PrefetchGlobalRead: [0, 2] + - PrefetchLocalRead: [0] + - DepthU: [256] # 2*MI_K = 2*128 + - ScheduleIterAlg: [3] + - DirectToLds: [1] + - StreamK: [0, 3] + - StaggerU: [0] + - UseSubtileImpl: [1] + BenchmarkJoinParameters: + BenchmarkFinalParameters: + - ProblemSizes: + # Full-tile sizes (K = 1×DU, 2×DU) + - Exact: [128, 128, 1, 256] + - Exact: [256, 256, 1, 512] + # Edge cases: M,N = k*64 but not multiples of macro tile + - Exact: [64, 64, 1, 256] + - Exact: [192, 192, 1, 256] + - Exact: [320, 320, 1, 256] + # Asymmetric + - Exact: [192, 256, 1, 256] + - Exact: [256, 192, 1, 256] + # Non-multiple-of-32 M,N (multiples of 16) + - Exact: [48, 48, 1, 256] # 3*16, both M,N non-mult-of-32 + - Exact: [112, 176, 1, 256] # 7*16, 11*16 + # Even non-multiple-of-16 M,N + - Exact: [50, 100, 1, 256] # 50=2*25, 100=4*25 + - Exact: [66, 66, 1, 256] # both even, neither mult-of-16 + # Odd M,N + - Exact: [17, 33, 1, 256] # both odd + 
- Exact: [63, 63, 1, 256] # both odd, just below 64 + # Batched (batch_count > 1) + - Exact: [128, 128, 2, 256] # batch=2, full tile + - Exact: [192, 192, 3, 256] # batch=3, edge case + - Exact: [50, 100, 2, 256] # batch=2, non-mult-of-32 + + ######################################## + # F4HS TN — bias epilogue, F16 output + # + # Covers even and odd wavetile sizes with f16 destination. + # StreamK=0 and SK=3. Full-tile and edge sizes. + ######################################## + - + - # ProblemType + OperationType: GEMM + DataType: F4 + DestDataType: h + ComputeDataType: s + HighPrecisionAccumulate: True + MXBlockA: 32 + MXBlockB: 32 + TransposeA: 1 + TransposeB: 0 + UseBeta: True + Batched: True + Activation: True + ActivationType: hipblaslt_all + UseBias: 1 + BiasDataTypeList: [s, h] + + - InitialSolutionParameters: + BenchmarkCommonParameters: + - KernelLanguage: ["Assembly"] + ForkParameters: + - MatrixInstruction: + - [16, 16, 128, 1, 1, 4, 4, 2, 2] # MT 128x128 + - [16, 16, 128, 1, 1, 8, 8, 2, 2] # MT 256x256 + - [16, 16, 128, 1, 1, 6, 4, 2, 2] # MT 192x128 (odd M wavetile) + - PrefetchGlobalRead: [0] + - PrefetchLocalRead: [0] + - DepthU: [256] + - ScheduleIterAlg: [3] + - DirectToLds: [1] + - StreamK: [0, 3] + - StaggerU: [0] + - UseSubtileImpl: [1] + BenchmarkJoinParameters: + BenchmarkFinalParameters: + - ProblemSizes: + - Exact: [128, 128, 1, 256] + - Exact: [128, 128, 2, 256] # batch=2, with bias + - Exact: [192, 192, 1, 256] + - BiasTypeArgs: ['s', 'h'] + - ActivationArgs: + - [Enum: none] + - [Enum: relu] + + ######################################## + # F4SS TN — MX FP4 input, F32 output, DU = 2*MI_K = 256 + # + # Reduced wavetile coverage; same problem size coverage as FP4 main group. 
+ ######################################## + - + - # ProblemType + OperationType: GEMM + DataType: F4 + DestDataType: s + ComputeDataType: s + HighPrecisionAccumulate: True + MXBlockA: 32 + MXBlockB: 32 + TransposeA: 1 + TransposeB: 0 + UseBeta: True + Batched: True + ActivationFuncCall: True + + - InitialSolutionParameters: + BenchmarkCommonParameters: + - KernelLanguage: ["Assembly"] + ForkParameters: + - MatrixInstruction: + - [16, 16, 128, 1, 1, 2, 2, 1, 1] # MT 32x32 (1x1 WG) + - [16, 16, 128, 1, 1, 4, 4, 2, 2] # MT 128x128 + - [16, 16, 128, 1, 1, 8, 8, 2, 2] # MT 256x256 + - [16, 16, 128, 1, 1, 6, 4, 2, 2] # MT 192x128 (asymmetric) + - [16, 16, 128, 1, 1, 2, 6, 4, 1] # MT 128x96 (4x1 WG) + - PrefetchGlobalRead: [0, 2] + - PrefetchLocalRead: [0] + - DepthU: [256] # 2*MI_K = 2*128 + - ScheduleIterAlg: [3] + - DirectToLds: [1] + - StreamK: [0, 3] + - StaggerU: [0] + - UseSubtileImpl: [1] + BenchmarkJoinParameters: + BenchmarkFinalParameters: + - ProblemSizes: + # Full-tile sizes (K = 1×DU, 2×DU) + - Exact: [128, 128, 1, 256] + - Exact: [256, 256, 1, 512] + # Edge cases: M,N = k*64 but not multiples of macro tile + - Exact: [64, 64, 1, 256] + - Exact: [192, 192, 1, 256] + - Exact: [320, 320, 1, 256] + # Asymmetric + - Exact: [192, 256, 1, 256] + - Exact: [256, 192, 1, 256] + # Non-multiple-of-32 M,N (multiples of 16) + - Exact: [48, 48, 1, 256] # 3*16, both M,N non-mult-of-32 + - Exact: [112, 176, 1, 256] # 7*16, 11*16 + # Even non-multiple-of-16 M,N + - Exact: [50, 100, 1, 256] # 50=2*25, 100=4*25 + - Exact: [66, 66, 1, 256] # both even, neither mult-of-16 + # Odd M,N + - Exact: [17, 33, 1, 256] # both odd + - Exact: [63, 63, 1, 256] # both odd, just below 64 + # Batched (batch_count > 1) + - Exact: [128, 128, 2, 256] # batch=2, full tile + - Exact: [192, 192, 3, 256] # batch=3, edge case + - Exact: [50, 100, 2, 256] # batch=2, non-mult-of-32 + + ######################################## + # FP4 TN — bias epilogue, BF16 output (F4BS) + # + # Covers even and odd 
wavetile sizes, both StreamK=0 and SK=3. + # Tests f32 bias with none/relu/gelu activations. + # Full-tile and edge sizes. + ######################################## + - + - # ProblemType + OperationType: GEMM + DataType: F4 + DestDataType: b + ComputeDataType: s + HighPrecisionAccumulate: True + MXBlockA: 32 + MXBlockB: 32 + TransposeA: 1 + TransposeB: 0 + UseBeta: True + Batched: True + Activation: True + ActivationType: hipblaslt_all + UseBias: 1 + BiasDataTypeList: [s] + + - InitialSolutionParameters: + BenchmarkCommonParameters: + - KernelLanguage: ["Assembly"] + ForkParameters: + - MatrixInstruction: + # Even wavetile sizes + - [16, 16, 128, 1, 1, 4, 4, 2, 2] # MT 128x128 + - [16, 16, 128, 1, 1, 8, 8, 2, 2] # MT 256x256 + - [16, 16, 128, 1, 1, 2, 6, 4, 1] # MT 128x96 (4x1 wave group) + # Odd M wavetile + - [16, 16, 128, 1, 1, 6, 4, 2, 2] # MT 192x128 + - [16, 16, 128, 1, 1, 6, 8, 2, 2] # MT 192x256 + - PrefetchGlobalRead: [0] + - PrefetchLocalRead: [0] + - DepthU: [256] + - ScheduleIterAlg: [3] + - DirectToLds: [1] + - StreamK: [0, 3] + - StaggerU: [0] + - UseSubtileImpl: [1] + BenchmarkJoinParameters: + BenchmarkFinalParameters: + - ProblemSizes: + # Full-tile size + - Exact: [128, 128, 1, 256] + # Edge: M,N not multiple of macro tile + - Exact: [192, 192, 1, 256] + - BiasTypeArgs: ['s'] + - ActivationArgs: + - [Enum: none] + - [Enum: relu] + - [Enum: gelu] + + ######################################## + # FP4 TN — bias epilogue, F32 output (F4SS) + # + # Covers even and odd wavetile sizes with f32 destination. + # StreamK=0 and SK=3. Full-tile and edge sizes. 
+ ######################################## + - + - # ProblemType + OperationType: GEMM + DataType: F4 + DestDataType: s + ComputeDataType: s + HighPrecisionAccumulate: True + MXBlockA: 32 + MXBlockB: 32 + TransposeA: 1 + TransposeB: 0 + UseBeta: True + Batched: True + Activation: True + ActivationType: hipblaslt_all + UseBias: 1 + BiasDataTypeList: [s] + + - InitialSolutionParameters: + BenchmarkCommonParameters: + - KernelLanguage: ["Assembly"] + ForkParameters: + - MatrixInstruction: + - [16, 16, 128, 1, 1, 4, 4, 2, 2] # MT 128x128 + - [16, 16, 128, 1, 1, 8, 8, 2, 2] # MT 256x256 + - [16, 16, 128, 1, 1, 6, 4, 2, 2] # MT 192x128 (odd M wavetile) + - PrefetchGlobalRead: [0] + - PrefetchLocalRead: [0] + - DepthU: [256] + - ScheduleIterAlg: [3] + - DirectToLds: [1] + - StreamK: [0, 3] + - StaggerU: [0] + - UseSubtileImpl: [1] + BenchmarkJoinParameters: + BenchmarkFinalParameters: + - ProblemSizes: + - Exact: [128, 128, 1, 256] + - Exact: [128, 128, 2, 256] # batch=2, with bias + - Exact: [192, 192, 1, 256] + - BiasTypeArgs: ['s'] + - ActivationArgs: + - [Enum: none] + - [Enum: relu] + + ######################################## + # FP4 TN — all hipblaslt activations with ScaleAlphaVec + # + # Exercises every activation function supported by hipblaslt_all: + # none, relu, gelu, sigmoid, silu, clamp, dgelu, drelu. + # Small matrix selection and single problem size to keep runtime short. 
+ ######################################## + - + - # ProblemType + OperationType: GEMM + DataType: F4 + DestDataType: b + ComputeDataType: s + HighPrecisionAccumulate: True + MXBlockA: 32 + MXBlockB: 32 + TransposeA: 1 + TransposeB: 0 + UseBeta: True + Batched: True + Activation: True + ActivationType: hipblaslt_all + UseBias: 1 + BiasDataTypeList: [s] + UseScaleAlphaVec: 1 + + - InitialSolutionParameters: + BenchmarkCommonParameters: + - KernelLanguage: ["Assembly"] + ForkParameters: + - MatrixInstruction: + - [16, 16, 128, 1, 1, 4, 4, 2, 2] # MT 128x128 + - [16, 16, 128, 1, 1, 6, 4, 2, 2] # MT 192x128 (asymmetric) + - PrefetchGlobalRead: [0,2] + - PrefetchLocalRead: [0] + - DepthU: [256] + - ScheduleIterAlg: [3] + - DirectToLds: [1] + - StreamK: [3] + - StaggerU: [0] + - UseSubtileImpl: [1] + BenchmarkJoinParameters: + BenchmarkFinalParameters: + - ProblemSizes: + - Exact: [128, 128, 1, 256] + - BiasTypeArgs: ['s'] + - ActivationArgs: + - [Enum: none] + - [Enum: relu] + - [Enum: gelu] + - [Enum: sigmoid] + - [Enum: silu] + - [Enum: clamp] + - [Enum: dgelu] + - [Enum: drelu] + + ######################################## + # FP4 TN — PGR=2, PLR=1 (scheduler path) + # + # The PGR=2 scheduler takes a different code path than PGR=0. + # Rectangular MIWTs exercise loadRatioGR != 1.0. + # K=768 tests K > DU (K-loop with SRD advance). 
+ ######################################## + - + - # ProblemType + OperationType: GEMM + DataType: F4 + DestDataType: b + ComputeDataType: s + HighPrecisionAccumulate: True + MXBlockA: 32 + MXBlockB: 32 + TransposeA: 1 + TransposeB: 0 + UseBeta: True + Batched: True + ActivationFuncCall: True + + - InitialSolutionParameters: + BenchmarkCommonParameters: + - KernelLanguage: ["Assembly"] + ForkParameters: + - MatrixInstruction: + # 2x2 wave group — symmetric + - [16, 16, 128, 1, 1, 2, 2, 2, 2] # MT 64x64 + - [16, 16, 128, 1, 1, 4, 4, 2, 2] # MT 128x128 + - [16, 16, 128, 1, 1, 8, 8, 2, 2] # MT 256x256 + # 2x2 wave group — asymmetric + - [16, 16, 128, 1, 1, 2, 4, 2, 2] # MT 64x128 + - [16, 16, 128, 1, 1, 4, 2, 2, 2] # MT 128x64 + - [16, 16, 128, 1, 1, 2, 6, 2, 2] # MT 64x192 + - [16, 16, 128, 1, 1, 6, 2, 2, 2] # MT 192x64 + - [16, 16, 128, 1, 1, 4, 8, 2, 2] # MT 128x256 + - [16, 16, 128, 1, 1, 8, 4, 2, 2] # MT 256x128 + - [16, 16, 128, 1, 1, 6, 4, 2, 2] # MT 192x128 + - [16, 16, 128, 1, 1, 4, 6, 2, 2] # MT 128x192 + - [16, 16, 128, 1, 1, 6, 8, 2, 2] # MT 192x256 + - [16, 16, 128, 1, 1, 8, 6, 2, 2] # MT 256x192 + - PrefetchGlobalRead: [0, 2] + - PrefetchLocalRead: [1] + - DepthU: [256] + - ScheduleIterAlg: [3] + - DirectToLds: [1] + - StreamK: [3] + - StaggerU: [0] + - UseSubtileImpl: [1] + BenchmarkJoinParameters: + BenchmarkFinalParameters: + - ProblemSizes: + - Exact: [128, 128, 1, 256] # K = 1*DU + - Exact: [256, 256, 1, 256] # K = 1*DU, larger tile + - Exact: [256, 256, 1, 768] # K = 3*DU (tests K-loop) + - Exact: [192, 192, 1, 256] # Edge for MT=128,256 + - Exact: [192, 256, 1, 768] # Asymmetric edge, K > DU + + ######################################## + # FP4 TN — expanded MIWT coverage (PGR=0) + # + # Additional rectangular MIWTs and large tiles. + # K=768 tests K > DU with PGR=0. 
+ ######################################## + - + - # ProblemType + OperationType: GEMM + DataType: F4 + DestDataType: b + ComputeDataType: s + HighPrecisionAccumulate: True + MXBlockA: 32 + MXBlockB: 32 + TransposeA: 1 + TransposeB: 0 + UseBeta: True + Batched: True + ActivationFuncCall: True + + - InitialSolutionParameters: + BenchmarkCommonParameters: + - KernelLanguage: ["Assembly"] + ForkParameters: + - MatrixInstruction: + # Rectangular MIWTs not in the main PGR=0 FP4 group + - [16, 16, 128, 1, 1, 2, 4, 2, 2] # MT 64x128 + - [16, 16, 128, 1, 1, 4, 2, 2, 2] # MT 128x64 + - [16, 16, 128, 1, 1, 2, 8, 2, 2] # MT 64x256 + - [16, 16, 128, 1, 1, 8, 2, 2, 2] # MT 256x64 + - [16, 16, 128, 1, 1, 4, 8, 2, 2] # MT 128x256 + - [16, 16, 128, 1, 1, 8, 4, 2, 2] # MT 256x128 + - [16, 16, 128, 1, 1, 4, 6, 2, 2] # MT 128x192 + - [16, 16, 128, 1, 1, 6, 6, 2, 2] # MT 192x192 + # Large tiles (> 256 in one dim) + - [16, 16, 128, 1, 1, 2, 10, 2, 2] # MT 64x320 + - [16, 16, 128, 1, 1, 10, 2, 2, 2] # MT 320x64 + - [16, 16, 128, 1, 1, 2, 12, 2, 2] # MT 64x384 + - [16, 16, 128, 1, 1, 12, 2, 2, 2] # MT 384x64 + - [16, 16, 128, 1, 1, 4, 10, 2, 2] # MT 128x320 + - [16, 16, 128, 1, 1, 10, 4, 2, 2] # MT 320x128 + - [16, 16, 128, 1, 1, 4, 12, 2, 2] # MT 128x384 + - [16, 16, 128, 1, 1, 12, 4, 2, 2] # MT 384x128 + # More WG 4x1 configs + - [16, 16, 128, 1, 1, 2, 4, 4, 1] # MT 128x64 + - [16, 16, 128, 1, 1, 2, 8, 4, 1] # MT 128x128 + - [16, 16, 128, 1, 1, 4, 4, 4, 1] # MT 256x64 + - [16, 16, 128, 1, 1, 4, 8, 4, 1] # MT 256x128 + # More WG 1x4 configs + - [16, 16, 128, 1, 1, 4, 2, 1, 4] # MT 64x128 + - [16, 16, 128, 1, 1, 8, 2, 1, 4] # MT 128x128 + - [16, 16, 128, 1, 1, 4, 4, 1, 4] # MT 64x256 + - [16, 16, 128, 1, 1, 8, 4, 1, 4] # MT 128x256 + - PrefetchGlobalRead: [0, 2] + - PrefetchLocalRead: [0] + - DepthU: [256] + - ScheduleIterAlg: [3] + - DirectToLds: [1] + - StreamK: [3] + - StaggerU: [0] + - UseSubtileImpl: [1] + BenchmarkJoinParameters: + BenchmarkFinalParameters: + - ProblemSizes: + - 
Exact: [128, 128, 1, 256] # K = 1*DU + - Exact: [256, 256, 1, 256] # K = 1*DU, larger + - Exact: [256, 256, 1, 768] # K = 3*DU (K-loop) + - Exact: [192, 192, 1, 256] # Edge + - Exact: [384, 384, 1, 256] # Large tile edge + + # PGR=2 WG 4x1/1x4, K > DU — re-enabled after scheduler fix. + - + - OperationType: GEMM + DataType: F4 + DestDataType: b + ComputeDataType: s + HighPrecisionAccumulate: True + MXBlockA: 32 + MXBlockB: 32 + TransposeA: 1 + TransposeB: 0 + UseBeta: True + Batched: True + ActivationFuncCall: True + + - InitialSolutionParameters: + BenchmarkCommonParameters: + - KernelLanguage: ["Assembly"] + ForkParameters: + - MatrixInstruction: + - [16, 16, 128, 1, 1, 2, 2, 4, 1] # MT 128x32 + - [16, 16, 128, 1, 1, 2, 4, 4, 1] # MT 128x64 + - [16, 16, 128, 1, 1, 4, 4, 4, 1] # MT 256x64 + - [16, 16, 128, 1, 1, 2, 2, 1, 4] # MT 32x128 + - [16, 16, 128, 1, 1, 4, 2, 1, 4] # MT 64x128 + - [16, 16, 128, 1, 1, 4, 4, 1, 4] # MT 64x256 + - PrefetchGlobalRead: [2] + - PrefetchLocalRead: [1] + - DepthU: [256] + - ScheduleIterAlg: [3] + - DirectToLds: [1] + - StreamK: [0, 3] + - StaggerU: [0] + - UseSubtileImpl: [1] + BenchmarkJoinParameters: + BenchmarkFinalParameters: + - ProblemSizes: + - Exact: [256, 256, 1, 768] # K = 3*DU (triggers failure) diff --git a/projects/hipblaslt/tensilelite/Tensile/Tests/common/streamk/sk_mx32f4_quick.yaml b/projects/hipblaslt/tensilelite/Tensile/Tests/common/streamk/sk_mx32f4_quick.yaml index 95efebb8370..d755e4fc97b 100644 --- a/projects/hipblaslt/tensilelite/Tensile/Tests/common/streamk/sk_mx32f4_quick.yaml +++ b/projects/hipblaslt/tensilelite/Tensile/Tests/common/streamk/sk_mx32f4_quick.yaml @@ -24,6 +24,7 @@ GlobalParameters: MaxFileName: 128 DeviceLDS: 163840 MaxLDS: 163840 + MXScaleFormat: 1 BenchmarkProblems: ######################################## @@ -68,11 +69,11 @@ BenchmarkProblems: # - UseSgprForGRO: [0,1] - UseSgprForGRO: [0] # - DepthU: [64, 128] - - DepthU: [128] + - DepthU: [256] - AssertFree0ElementMultiple: [1] - 
AssertFree1ElementMultiple: [1] - AssertSummationElementMultiple: [64] - - LocalReadVectorWidth: [16] + - LocalReadVectorWidth: [32] # - PrefetchGlobalRead: [0,1,2] - PrefetchGlobalRead: [2] # - PrefetchLocalRead: [0,1] @@ -96,6 +97,7 @@ BenchmarkProblems: - StoreVectorWidth: [-1] - SourceSwap: [1] - StreamK: [3] + - UseSubtileImpl: [1] # TODO: enable subtile BenchmarkJoinParameters: BenchmarkFinalParameters: - ProblemSizes: diff --git a/projects/hipblaslt/tensilelite/Tensile/Tests/common/streamk/sk_mx32f8_quick.yaml b/projects/hipblaslt/tensilelite/Tensile/Tests/common/streamk/sk_mx32f8_quick.yaml index b2965f8bc46..a5139ca97cb 100644 --- a/projects/hipblaslt/tensilelite/Tensile/Tests/common/streamk/sk_mx32f8_quick.yaml +++ b/projects/hipblaslt/tensilelite/Tensile/Tests/common/streamk/sk_mx32f8_quick.yaml @@ -1,5 +1,6 @@ TestParameters: - marks: [skip-gfx1250, skip-gfx900, skip-gfx906, skip-gfx908, skip-gfx90a, skip-gfx940, skip-gfx941, skip-gfx942, skip-gfx1010, skip-gfx1011, skip-gfx1012, skip-gfx1030, skip-gfx1100, skip-gfx1101, skip-gfx1102, skip-gfx1200, skip-gfx1201] + # need to skip for gfx950: TODO: re-enable this with subtile + MXScaleFormat=1 + marks: [skip-gfx1250, skip-gfx950, skip-gfx900, skip-gfx906, skip-gfx908, skip-gfx90a, skip-gfx940, skip-gfx941, skip-gfx942, skip-gfx1010, skip-gfx1011, skip-gfx1012, skip-gfx1030, skip-gfx1100, skip-gfx1101, skip-gfx1102, skip-gfx1200, skip-gfx1201] GlobalParameters: NumElementsToValidate: -1 @@ -24,6 +25,7 @@ GlobalParameters: MaxFileName: 128 DeviceLDS: 163840 MaxLDS: 163840 + MXScaleFormat: 1 BenchmarkProblems: ######################################## @@ -68,7 +70,7 @@ BenchmarkProblems: # - UseSgprForGRO: [0,1] - UseSgprForGRO: [0] # - DepthU: [64, 128] - - DepthU: [128] + - DepthU: [256] - AssertFree0ElementMultiple: [1] - AssertFree1ElementMultiple: [1] - AssertSummationElementMultiple: [64] diff --git a/projects/hipblaslt/tensilelite/Tensile/Tests/unit/conftest.py 
b/projects/hipblaslt/tensilelite/Tensile/Tests/unit/conftest.py new file mode 100644 index 00000000000..8615535288e --- /dev/null +++ b/projects/hipblaslt/tensilelite/Tensile/Tests/unit/conftest.py @@ -0,0 +1,62 @@ +"""pytest configuration and CLI options for store-D unit tests.""" + +import pytest + + +@pytest.fixture(autouse=True) +def skip_parametrized_if_cli(request): + """Skip all tests except test_storeD_cli when CLI options (--mn) are provided.""" + if (request.config.getoption("--mn", default=None) is not None + and request.function.__name__ != "test_storeD_cli"): + pytest.skip("--mn specified: only test_storeD_cli runs") + + +def pytest_addoption(parser): + parser.addoption( + "--mn", nargs="+", metavar="M,N", + help="List of M,N pairs to test, e.g. --mn 23,17 32,32 16,16", + ) + parser.addoption( + "--mt", nargs="+", metavar="MT0,MT1", + help="List of MacroTile pairs to test, e.g. --mt 16,16 32,32", + ) + parser.addoption( + "--wave-config", nargs="+", metavar="WG0,WG1", + help="List of MIWaveGroup pairs to test, e.g. --wave-config 1,1 2,2", + ) + parser.addoption( + "--dump-asm", action="store_true", default=False, + help="Dump generated assembly and store module text for each test case", + ) + parser.addoption( + "--dump-store-insts", action="store_true", default=False, + help="Print only the buffer_store_* instructions emitted by the store-D path, " + "one per line with the preceding comment for context", + ) + parser.addoption( + "--asm-output-dir", default=None, metavar="DIR", + help="Write the full assembled kernel source (.s file) for each CLI test case " + "to DIR/test_