Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
205 changes: 205 additions & 0 deletions ggml/src/ggml-vulkan/ggml-vulkan.cpp

Large diffs are not rendered by default.

6 changes: 5 additions & 1 deletion ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec.comp
Original file line number Diff line number Diff line change
Expand Up @@ -47,9 +47,13 @@ void iter(inout FLOAT_TYPE temp[NUM_COLS][NUM_ROWS], const uint first_row, const
b1 = FLOAT_TYPE(data_b[j*p.batch_stride_b + b_offset + iybs + iqs + y_offset]);
}
#endif
const bool transa_rt = (p.fusion_flags & MAT_VEC_FUSION_FLAGS_TRANSPOSE_A) != 0;
const uint a_kb = col / QUANT_K;
uint ibi = first_row*p.ncols;
[[unroll]] for (uint n = 0; n < num_rows; ++n) {
const uint ib = (ibi + col)/QUANT_K; // block index
const uint ib = transa_rt
? (a_kb * p.stride_d + (first_row + n))
: ((ibi + col)/QUANT_K);
ibi += p.ncols;

#if K_PER_ITER == 8
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
#define MAT_VEC_FUSION_FLAGS_BIAS1 0x2
#define MAT_VEC_FUSION_FLAGS_SCALE0 0x4
#define MAT_VEC_FUSION_FLAGS_SCALE1 0x8
#define MAT_VEC_FUSION_FLAGS_TRANSPOSE_A 0x10

layout (binding = 0) readonly buffer A {A_TYPE data_a[];};
#if defined(A_TYPEV4)
Expand Down
5 changes: 4 additions & 1 deletion ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_q4_k.comp
Original file line number Diff line number Diff line change
Expand Up @@ -12,8 +12,11 @@ void calc_superblock(const uint a_offset, const uint b_offset, const uint v_im,
const uint y1_idx = i * QUANT_K + y_offset;
const uint y2_idx = y1_idx + 128;

const bool transpose_a = (p.fusion_flags & MAT_VEC_FUSION_FLAGS_TRANSPOSE_A) != 0;

[[unroll]] for (uint n = 0; n < num_rows; ++n) {
const uint ib0 = a_offset + (first_row+n)*num_blocks_per_row;
const uint ib0 = transpose_a ? (a_offset + i * (p.stride_d - 1) + (first_row+n))
: (a_offset + (first_row+n)*num_blocks_per_row);
const FLOAT_TYPEV2 dm = FLOAT_TYPEV2(data_a[ib0 + i].dm);

const uint32_t scale0_u32 = data_a_packed16[ib0 + i].scales[v_im ];
Expand Down
5 changes: 4 additions & 1 deletion ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_q5_k.comp
Original file line number Diff line number Diff line change
Expand Up @@ -12,8 +12,11 @@ void calc_superblock(const uint a_offset, const uint b_offset, const uint v_im,
const uint y1_idx = i * QUANT_K + y_offset;
const uint y2_idx = y1_idx + 128;

const bool transpose_a = (p.fusion_flags & MAT_VEC_FUSION_FLAGS_TRANSPOSE_A) != 0;

[[unroll]] for (uint n = 0; n < num_rows; ++n) {
const uint ib0 = a_offset + (first_row+n)*num_blocks_per_row;
const uint ib0 = transpose_a ? (a_offset + i * (p.stride_d - 1) + (first_row+n))
: (a_offset + (first_row+n)*num_blocks_per_row);
const FLOAT_TYPEV2 dm = FLOAT_TYPEV2(data_a[ib0 + i].dm);

const uint32_t scale0_u32 = data_a_packed16[ib0 + i].scales[v_im ];
Expand Down
5 changes: 4 additions & 1 deletion ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_q6_k.comp
Original file line number Diff line number Diff line change
Expand Up @@ -14,8 +14,11 @@ uint csel = 0;
void calc_superblock(const uint a_offset, const uint b_offset, const uint itid, const uint ix, const uint ql_offset, const uint qh_offset, const uint s_offset, const uint y_offset, const uint i, const uint num_blocks_per_row, const uint first_row, const uint num_rows, const bool all_threads) {
const uint y_idx = i * QUANT_K + y_offset;

const bool transpose_a = (p.fusion_flags & MAT_VEC_FUSION_FLAGS_TRANSPOSE_A) != 0;

[[unroll]] for (uint n = 0; n < num_rows; ++n) {
const uint ib0 = a_offset + (first_row+n)*num_blocks_per_row;
const uint ib0 = transpose_a ? (a_offset + i * (p.stride_d - 1) + (first_row+n))
: (a_offset + (first_row+n)*num_blocks_per_row);
csel ^= 1;

if (!all_threads) { // when we don't have enough blocks to use all threads
Expand Down
9 changes: 8 additions & 1 deletion ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vecq.comp
Original file line number Diff line number Diff line change
Expand Up @@ -66,8 +66,15 @@ void iter(inout FLOAT_TYPE temp[NUM_COLS][NUM_ROWS], const uint first_row, const
#endif

uint ibi = first_row*p.ncols;
const bool transpose_a = (p.fusion_flags & MAT_VEC_FUSION_FLAGS_TRANSPOSE_A) != 0;
[[unroll]] for (uint n = 0; n < num_rows; ++n) {
const uint a_block_idx = (ibi + col)/QUANT_K_Q8_1 + a_offset;
#if defined(DATA_A_QUANT_K)
const uint a_block_idx = (transpose_a ? ((col / QUANT_K) * p.stride_d + (first_row + n)) * (QUANT_K / QUANT_K_Q8_1) + ((col / QUANT_K_Q8_1) & ((QUANT_K / QUANT_K_Q8_1) - 1))
: (ibi + col)/QUANT_K_Q8_1) + a_offset;
#else
const uint a_block_idx = (transpose_a ? (col / QUANT_K_Q8_1) * p.stride_d + first_row + n
: (ibi + col)/QUANT_K_Q8_1) + a_offset;
#endif
ibi += p.ncols;

temp[j][n] += mmvq_dot_product(a_block_idx, b_qs_idx);
Expand Down
6 changes: 6 additions & 0 deletions ggml/src/ggml-vulkan/vulkan-shaders/mul_mm.comp
Original file line number Diff line number Diff line change
Expand Up @@ -242,7 +242,11 @@ void main() {
#else
batch_idx_a * (p.batch_stride_a / LOAD_VEC_A) +
#endif
#ifdef TRANSPOSE_A
0;
#else
(ir * BM * p.stride_a + start_k) / LOAD_VEC_A;
#endif
#ifdef MUL_MAT_ID
uint pos_b = 0;
#else
Expand Down Expand Up @@ -286,7 +290,9 @@ void main() {

barrier();

#ifndef TRANSPOSE_A
pos_a += BK / LOAD_VEC_A;
#endif
pos_b += BK / LOAD_VEC_B;

#ifdef COOPMAT
Expand Down
56 changes: 33 additions & 23 deletions ggml/src/ggml-vulkan/vulkan-shaders/mul_mm_funcs.glsl
Original file line number Diff line number Diff line change
@@ -1,15 +1,25 @@
// QUANT_IDX_A expands, in the scope where it is written, to the declaration
// of `const uint idx`, which load_a_to_shmem() uses to address data_a
// (directly, or after dividing it down to a block index).
// It relies on pos_a, row, col, idx_m, block and the push-constant block `p`
// being visible at the expansion site. Because it declares names, it can be
// expanded only once per scope.
#ifdef TRANSPOSE_A
// Transposed-A variant. Besides `idx`, this also declares two helper
// constants in the caller's scope:
//   _qklva  = QUANT_K / LOAD_VEC_A
//   _k_elem = block + row * LOAD_VEC_A
// idx is pos_a plus three terms:
//   (_k_elem / QUANT_K) * p.M * _qklva
//   idx_m * _qklva
//   (_k_elem / LOAD_VEC_A) % _qklva
// `col` and p.stride_a are not used by this variant.
// NOTE(review): presumably this addresses an A matrix stored with p.M quant
// blocks per QUANT_K-sized step along K; confirm against the host-side layout.
// NOTE(review): QUANT_K must be defined wherever this variant is expanded;
// verify for the non-quantized (F32/F16/BF16) branches.
#define QUANT_IDX_A \
const uint _qklva = QUANT_K / LOAD_VEC_A; \
const uint _k_elem = block + row * LOAD_VEC_A; \
const uint idx = pos_a + (_k_elem / QUANT_K) * p.M * _qklva + idx_m * _qklva + ((_k_elem / LOAD_VEC_A) % _qklva);
#else
// Default variant: idx = pos_a + col * p.stride_a / LOAD_VEC_A + row.
// `idx_m` and `block` are not used by this variant.
#define QUANT_IDX_A \
const uint idx = pos_a + col * p.stride_a / LOAD_VEC_A + row;
#endif

void load_a_to_shmem(const uint pos_a, const uint row, const uint col, const uint idx_m, const uint block, const uint end_k) {
#if defined(DATA_A_F32) || defined(DATA_A_F16)
#if LOAD_VEC_A == 8
const uint idx = pos_a + col * p.stride_a / LOAD_VEC_A + row;
QUANT_IDX_A
const uint buf_idx = col * SHMEM_STRIDE + row * LOAD_VEC_A / 2;
FLOAT_TYPEV8 aa = FLOAT_TYPEV8(data_a[idx]);
buf_a[buf_idx ] = aa[0].xy;
buf_a[buf_idx + 1] = aa[0].zw;
buf_a[buf_idx + 2] = aa[1].xy;
buf_a[buf_idx + 3] = aa[1].zw;
#elif LOAD_VEC_A == 4
const uint idx = pos_a + col * p.stride_a / LOAD_VEC_A + row;
QUANT_IDX_A
const uint buf_idx = col * SHMEM_STRIDE + row * LOAD_VEC_A / 2;
FLOAT_TYPEV4 aa = FLOAT_TYPEV4(data_a[idx]);
buf_a[buf_idx ] = aa.xy;
Expand All @@ -28,7 +38,7 @@ void load_a_to_shmem(const uint pos_a, const uint row, const uint col, const uin
#endif
#elif defined(DATA_A_BF16)
#if LOAD_VEC_A == 4
const uint idx = pos_a + col * p.stride_a / LOAD_VEC_A + row;
QUANT_IDX_A
const uint buf_idx = col * SHMEM_STRIDE + row * LOAD_VEC_A / 2;
FLOAT_TYPEV4 aa = FLOAT_TYPEV4(TO_FLOAT_TYPE(data_a[idx]));
buf_a[buf_idx ] = aa.xy;
Expand All @@ -46,7 +56,7 @@ void load_a_to_shmem(const uint pos_a, const uint row, const uint col, const uin
}
#endif
#elif defined(DATA_A_Q4_0)
const uint idx = pos_a + col * p.stride_a / LOAD_VEC_A + row;
QUANT_IDX_A
const uint buf_idx = col * SHMEM_STRIDE + row * LOAD_VEC_A / 4;

const uint ib = idx / 4;
Expand All @@ -62,7 +72,7 @@ void load_a_to_shmem(const uint pos_a, const uint row, const uint col, const uin
buf_a[buf_idx + 8] = FLOAT_TYPEV2(v1.xy);
buf_a[buf_idx + 9] = FLOAT_TYPEV2(v1.zw);
#elif defined(DATA_A_Q4_1)
const uint idx = pos_a + col * p.stride_a / LOAD_VEC_A + row;
QUANT_IDX_A
const uint buf_idx = col * SHMEM_STRIDE + row * LOAD_VEC_A / 4;

const uint ib = idx / 4;
Expand All @@ -78,7 +88,7 @@ void load_a_to_shmem(const uint pos_a, const uint row, const uint col, const uin
buf_a[buf_idx + 8 ] = FLOAT_TYPEV2(v1.xy);
buf_a[buf_idx + 9 ] = FLOAT_TYPEV2(v1.zw);
#elif defined(DATA_A_Q5_0)
const uint idx = pos_a + col * p.stride_a / LOAD_VEC_A + row;
QUANT_IDX_A
const uint buf_idx = col * SHMEM_STRIDE + row * LOAD_VEC_A / 4;

const uint ib = idx / 8;
Expand All @@ -95,7 +105,7 @@ void load_a_to_shmem(const uint pos_a, const uint row, const uint col, const uin
buf_a[buf_idx ] = FLOAT_TYPEV2(v.xz);
buf_a[buf_idx + 8] = FLOAT_TYPEV2(v.yw);
#elif defined(DATA_A_Q5_1)
const uint idx = pos_a + col * p.stride_a / LOAD_VEC_A + row;
QUANT_IDX_A
const uint buf_idx = col * SHMEM_STRIDE + row * LOAD_VEC_A / 4;

const uint ib = idx / 4;
Expand All @@ -117,7 +127,7 @@ void load_a_to_shmem(const uint pos_a, const uint row, const uint col, const uin
buf_a[buf_idx + 8] = FLOAT_TYPEV2(v0.yw);
buf_a[buf_idx + 9] = FLOAT_TYPEV2(v1.yw);
#elif defined(DATA_A_Q8_0)
const uint idx = pos_a + col * p.stride_a / LOAD_VEC_A + row;
QUANT_IDX_A
const uint buf_idx = col * SHMEM_STRIDE + row * LOAD_VEC_A / 2;

const uint ib = idx / 8;
Expand Down Expand Up @@ -145,7 +155,7 @@ void load_a_to_shmem(const uint pos_a, const uint row, const uint col, const uin
buf_a[buf_idx + 2] = FLOAT_TYPEV2((bits & 0x10u) != 0u ? d : -d, (bits & 0x20u) != 0u ? d : -d);
buf_a[buf_idx + 3] = FLOAT_TYPEV2((bits & 0x40u) != 0u ? d : -d, (bits & 0x80u) != 0u ? d : -d);
#elif defined(DATA_A_Q2_K)
const uint idx = pos_a + col * p.stride_a / LOAD_VEC_A + row;
QUANT_IDX_A
const uint buf_idx = col * SHMEM_STRIDE + row * LOAD_VEC_A / 2;

const uint ib = idx / 64; // 4 values per idx
Expand All @@ -164,7 +174,7 @@ void load_a_to_shmem(const uint pos_a, const uint row, const uint col, const uin
buf_a[buf_idx ] = FLOAT_TYPEV2(v.xy);
buf_a[buf_idx + 1] = FLOAT_TYPEV2(v.zw);
#elif defined(DATA_A_Q3_K)
const uint idx = pos_a + col * p.stride_a / LOAD_VEC_A + row;
QUANT_IDX_A
const uint buf_idx = col * SHMEM_STRIDE + row * LOAD_VEC_A / 2;

const uint ib = idx / 128; // 2 values per idx
Expand All @@ -188,7 +198,7 @@ void load_a_to_shmem(const uint pos_a, const uint row, const uint col, const uin
buf_a[buf_idx] = FLOAT_TYPEV2(dl * (qs.x - hm.x),
dl * (qs.y - hm.y));
#elif defined(DATA_A_Q4_K)
const uint idx = pos_a + col * p.stride_a / LOAD_VEC_A + row;
QUANT_IDX_A
const uint buf_idx = col * SHMEM_STRIDE + row * LOAD_VEC_A / 2;

const uint ib = idx / 64; // 4 values per idx
Expand Down Expand Up @@ -224,7 +234,7 @@ void load_a_to_shmem(const uint pos_a, const uint row, const uint col, const uin
buf_a[buf_idx ] = FLOAT_TYPEV2(fma(d, q.x, m), fma(d, q.y, m));
buf_a[buf_idx + 1] = FLOAT_TYPEV2(fma(d, q.z, m), fma(d, q.w, m));
#elif defined(DATA_A_Q5_K)
const uint idx = pos_a + col * p.stride_a / LOAD_VEC_A + row;
QUANT_IDX_A
const uint buf_idx = col * SHMEM_STRIDE + row * LOAD_VEC_A / 2;

const uint ib = idx / 64; // 4 values per idx
Expand Down Expand Up @@ -263,7 +273,7 @@ void load_a_to_shmem(const uint pos_a, const uint row, const uint col, const uin
buf_a[buf_idx ] = FLOAT_TYPEV2(fma(d, q.x, m), fma(d, q.y, m));
buf_a[buf_idx + 1] = FLOAT_TYPEV2(fma(d, q.z, m), fma(d, q.w, m));
#elif defined(DATA_A_Q6_K)
const uint idx = pos_a + col * p.stride_a / LOAD_VEC_A + row;
QUANT_IDX_A
const uint buf_idx = col * SHMEM_STRIDE + row * LOAD_VEC_A / 2;

const uint ib = idx / 128; // 2 values per idx
Expand All @@ -285,7 +295,7 @@ void load_a_to_shmem(const uint pos_a, const uint row, const uint col, const uin

buf_a[buf_idx] = FLOAT_TYPEV2(q.x, q.y);
#elif defined(DATA_A_IQ1_S)
const uint idx = pos_a + col * p.stride_a / LOAD_VEC_A + row;
QUANT_IDX_A
const uint buf_idx = col * SHMEM_STRIDE + row * LOAD_VEC_A / 2;

const uint ib = idx / 32; // 8 values per idx
Expand All @@ -304,7 +314,7 @@ void load_a_to_shmem(const uint pos_a, const uint row, const uint col, const uin
dl * (bitfieldExtract(grid, 4 * k + 2, 2) + delta));
}
#elif defined(DATA_A_IQ1_M)
const uint idx = pos_a + col * p.stride_a / LOAD_VEC_A + row;
QUANT_IDX_A
const uint buf_idx = col * SHMEM_STRIDE + row * LOAD_VEC_A / 2;

const uint ib = idx / 32; // 8 values per idx
Expand All @@ -326,7 +336,7 @@ void load_a_to_shmem(const uint pos_a, const uint row, const uint col, const uin
dl * (bitfieldExtract(grid, 4 * k + 2, 2) + delta));
}
#elif defined(DATA_A_IQ2_XXS)
const uint idx = pos_a + col * p.stride_a / LOAD_VEC_A + row;
QUANT_IDX_A
const uint buf_idx = col * SHMEM_STRIDE + row * LOAD_VEC_A / 2;

const uint ib = idx / 32; // 8 values per idx
Expand Down Expand Up @@ -357,7 +367,7 @@ void load_a_to_shmem(const uint pos_a, const uint row, const uint col, const uin
buf_a[buf_idx + 3] = db * FLOAT_TYPEV2((sign & 64) != 0 ? -grid1.z : grid1.z,
(sign & 128) != 0 ? -grid1.w : grid1.w);
#elif defined(DATA_A_IQ2_XS)
const uint idx = pos_a + col * p.stride_a / LOAD_VEC_A + row;
QUANT_IDX_A
const uint buf_idx = col * SHMEM_STRIDE + row * LOAD_VEC_A / 2;

const uint ib = idx / 32; // 8 values per idx
Expand All @@ -383,7 +393,7 @@ void load_a_to_shmem(const uint pos_a, const uint row, const uint col, const uin
buf_a[buf_idx + 3] = db * FLOAT_TYPEV2((sign & 64) != 0 ? -grid1.z : grid1.z,
(sign & 128) != 0 ? -grid1.w : grid1.w);
#elif defined(DATA_A_IQ2_S)
const uint idx = pos_a + col * p.stride_a / LOAD_VEC_A + row;
QUANT_IDX_A
const uint buf_idx = col * SHMEM_STRIDE + row * LOAD_VEC_A / 2;

const uint ib = idx / 32; // 8 values per idx
Expand Down Expand Up @@ -411,7 +421,7 @@ void load_a_to_shmem(const uint pos_a, const uint row, const uint col, const uin
buf_a[buf_idx + 3] = db * FLOAT_TYPEV2((sign & 64) != 0 ? -grid1.z : grid1.z,
(sign & 128) != 0 ? -grid1.w : grid1.w);
#elif defined(DATA_A_IQ3_XXS)
const uint idx = pos_a + col * p.stride_a / LOAD_VEC_A + row;
QUANT_IDX_A
const uint buf_idx = col * SHMEM_STRIDE + row * LOAD_VEC_A / 2;

const uint ib = idx / 64; // 4 values per idx
Expand All @@ -435,7 +445,7 @@ void load_a_to_shmem(const uint pos_a, const uint row, const uint col, const uin
buf_a[buf_idx + 1] = FLOAT_TYPEV2((sign & 4) != 0 ? -v.z : v.z,
(sign & 8) != 0 ? -v.w : v.w);
#elif defined(DATA_A_IQ3_S)
const uint idx = pos_a + col * p.stride_a / LOAD_VEC_A + row;
QUANT_IDX_A
const uint buf_idx = col * SHMEM_STRIDE + row * LOAD_VEC_A / 2;

const uint ib = idx / 64; // 4 values per idx
Expand All @@ -457,7 +467,7 @@ void load_a_to_shmem(const uint pos_a, const uint row, const uint col, const uin
buf_a[buf_idx + 1] = FLOAT_TYPEV2((sign & 4) != 0 ? -v.z : v.z,
(sign & 8) != 0 ? -v.w : v.w);
#elif defined(DATA_A_IQ4_XS)
const uint idx = pos_a + col * p.stride_a / LOAD_VEC_A + row;
QUANT_IDX_A
const uint buf_idx = col * SHMEM_STRIDE + row * LOAD_VEC_A / 2;

const uint ib = idx / 64; // 4 values per idx
Expand All @@ -475,7 +485,7 @@ void load_a_to_shmem(const uint pos_a, const uint row, const uint col, const uin
buf_a[buf_idx ] = FLOAT_TYPEV2(v.xy);
buf_a[buf_idx + 1] = FLOAT_TYPEV2(v.zw);
#elif defined(DATA_A_IQ4_NL)
const uint idx = pos_a + col * p.stride_a / LOAD_VEC_A + row;
QUANT_IDX_A
const uint buf_idx = col * SHMEM_STRIDE + row * LOAD_VEC_A / 4;

const uint ib = idx / 8;
Expand All @@ -489,7 +499,7 @@ void load_a_to_shmem(const uint pos_a, const uint row, const uint col, const uin
buf_a[buf_idx + 8] = d * FLOAT_TYPEV2(kvalues_iq4nl[bitfieldExtract(vui, 4, 4)],
kvalues_iq4nl[vui >> 12]);
#elif defined(DATA_A_MXFP4)
const uint idx = pos_a + col * p.stride_a / LOAD_VEC_A + row;
QUANT_IDX_A
const uint buf_idx = col * SHMEM_STRIDE + row * LOAD_VEC_A / 4;

const uint ib = idx / 8;
Expand Down
8 changes: 8 additions & 0 deletions ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -588,6 +588,14 @@ void matmul_shaders(bool fp16, MatMulIdType matmul_id_type, bool coopmat, bool c
string_to_spv(shader_name + "_" + tname + "_f16_aligned", source_name, merge_maps(merge_maps(base_dict, float_type_dict), {{data_a_key, "1"}, {"LOAD_VEC_A", load_vec_a}, {"LOAD_VEC_B", load_vec}, {"B_TYPE", aligned_b_type_f16}, {"D_TYPE", "float"}, {"ALIGNED", "1"}}), fp16, coopmat, coopmat2, f16acc);
}

// TRANSPOSE_A variants
if (!coopmat2 && (tname == "q4_k" || tname == "q5_k" || tname == "q6_k" || tname == "q5_1")) {
string_to_spv(shader_name + "_" + tname + "_f32_transa", source_name, merge_maps(merge_maps(base_dict, float_type_dict), {{data_a_key, "1"}, {"LOAD_VEC_A", load_vec_a_unaligned}, {"B_TYPE", "float"}, {"D_TYPE", "float"}, {"TRANSPOSE_A", "1"}}), fp16, coopmat, coopmat2, f16acc);
string_to_spv(shader_name + "_" + tname + "_f32_transa_aligned", source_name, merge_maps(merge_maps(base_dict, float_type_dict), {{data_a_key, "1"}, {"LOAD_VEC_A", load_vec_a}, {"LOAD_VEC_B", load_vec}, {"B_TYPE", aligned_b_type_f32}, {"D_TYPE", "float"}, {"ALIGNED", "1"}, {"TRANSPOSE_A", "1"}}), fp16, coopmat, coopmat2, f16acc);
string_to_spv(shader_name + "_" + tname + "_f16_transa", source_name, merge_maps(merge_maps(base_dict, float_type_dict), {{data_a_key, "1"}, {"LOAD_VEC_A", load_vec_a_unaligned}, {"B_TYPE", "float16_t"}, {"D_TYPE", "float"}, {"TRANSPOSE_A", "1"}}), fp16, coopmat, coopmat2, f16acc);
string_to_spv(shader_name + "_" + tname + "_f16_transa_aligned", source_name, merge_maps(merge_maps(base_dict, float_type_dict), {{data_a_key, "1"}, {"LOAD_VEC_A", load_vec_a}, {"LOAD_VEC_B", load_vec}, {"B_TYPE", aligned_b_type_f16}, {"D_TYPE", "float"}, {"ALIGNED", "1"}, {"TRANSPOSE_A", "1"}}), fp16, coopmat, coopmat2, f16acc);
}

#if defined(GGML_VULKAN_INTEGER_DOT_GLSLC_SUPPORT)
// Integer dot mmq performs better with f32 accumulators
if (!f16acc && !coopmat && !coopmat2 && (is_legacy_quant(tname) || is_k_quant(tname) || tname == "mxfp4")) {
Expand Down