From a91900113415ebf3b25e75a8b75fe475876e139a Mon Sep 17 00:00:00 2001
From: Max Krasnyansky <maxk@qti.qualcomm.com>
Date: Thu, 28 May 2026 04:49:11 -0700
Subject: [PATCH 01/50] hexagon:  minor refresh for HMX FA and MM (#23796)

* hex-fa: clean up qf32/fp32 handling and stride handling

* hex-fa: fix corner-case fp NaN issues that were causing bad output from gemma4 on v79

* hex-fa: vectorize leftover handling

* hex-fa: avoid HVX fallback during token gen, since HMX has more FP16 compute capacity

* hmx-mm: remove dead code

* hmx-mm: use fastdiv in x4x2 dequant

* hmx-mm: sandwich dequant and scatter to improve perf

* hmx-mm: fixed rebase conflicts

* hmx-mm: further improve weight dequant by doing early type dispatch and precomputing fastdiv

* hmx-mm: an even earlier dispatch for per-type dequant

* hmx-mm: dequant linear types like q4_0 and q4_1 without the LUTs

This is a bit faster than LUT.

* hex-cmake: one more tweak for lto

---------

Co-authored-by: Trivikram Reddy <tamarnat@qti.qualcomm.com>
---
 ggml/src/ggml-hexagon/htp/CMakeLists.txt      |   3 +-
 ggml/src/ggml-hexagon/htp/flash-attn-ops.c    | 157 +++---
 .../src/ggml-hexagon/htp/hmx-flash-attn-ops.c |   3 -
 ggml/src/ggml-hexagon/htp/hmx-matmul-ops.c    | 493 ++++++++++--------
 4 files changed, 370 insertions(+), 286 deletions(-)

diff --git a/ggml/src/ggml-hexagon/htp/CMakeLists.txt b/ggml/src/ggml-hexagon/htp/CMakeLists.txt
index d7927261a85..ff3fc0804e3 100644
--- a/ggml/src/ggml-hexagon/htp/CMakeLists.txt
+++ b/ggml/src/ggml-hexagon/htp/CMakeLists.txt
@@ -58,15 +58,16 @@ list(FIND HTP_HMX_VERSIONS ${DSP_VERSION} _hmx_idx)
 
 if (_hmx_idx GREATER_EQUAL 0)
     target_sources(${HTP_LIB} PRIVATE
-        hmx-queue.c
         hmx-flash-attn-ops.c
         hmx-matmul-ops.c
+        hmx-queue.c
     )
 
     # -mhmx enables HMX instruction set (needed by files that include hmx-utils.h)
     set_source_files_properties(
         hmx-flash-attn-ops.c
         hmx-matmul-ops.c
+        hmx-queue.c
         PROPERTIES COMPILE_OPTIONS "-mhmx"
     )
 
diff --git a/ggml/src/ggml-hexagon/htp/flash-attn-ops.c b/ggml/src/ggml-hexagon/htp/flash-attn-ops.c
index d95df6ac9d5..1bd8c1407de 100644
--- a/ggml/src/ggml-hexagon/htp/flash-attn-ops.c
+++ b/ggml/src/ggml-hexagon/htp/flash-attn-ops.c
@@ -22,6 +22,16 @@
 // Must be multiple of 32
 #define FLASH_ATTN_BLOCK_SIZE (32 * 2)
 
+#if __HVX_ARCH__ < 79
+#define HVX_OP_ADD_F32(a, b) Q6_Vsf_equals_Vqf32(Q6_Vqf32_vadd_VsfVsf(a, b))
+#define HVX_OP_SUB_F32(a, b) Q6_Vsf_equals_Vqf32(Q6_Vqf32_vsub_VsfVsf(a, b))
+#define HVX_OP_MUL_F32(a, b) Q6_Vsf_equals_Vqf32(Q6_Vqf32_vmpy_VsfVsf(a, b))
+#else
+#define HVX_OP_ADD_F32(a, b) Q6_Vsf_vadd_VsfVsf(a, b)
+#define HVX_OP_SUB_F32(a, b) Q6_Vsf_vsub_VsfVsf(a, b)
+#define HVX_OP_MUL_F32(a, b) Q6_Vsf_vmpy_VsfVsf(a, b)
+#endif
+
 // This is a bit of a hack because the compiler is strugling to properly inline
 // the default hvx_vec_f32_to_f16 with output into the local array.
 static __attribute__((noinline)) void hvx_vec_f32_to_f16_a(void *ptr, HVX_Vector v0, HVX_Vector v1)
@@ -54,8 +64,8 @@ static inline void hvx_dot_f16_f16_aa(float * restrict r, const void * restrict
         rsum_p = hvx_vec_mpyacc_f32_f16(rsum_p, x_hf, y_hf);
     }
 
-    HVX_Vector rsum = Q6_Vsf_equals_Vqf32(Q6_Vqf32_vadd_VsfVsf(Q6_V_lo_W(rsum_p), Q6_V_hi_W(rsum_p)));
-    rsum = Q6_Vsf_equals_Vqf32(Q6_Vqf32_vmpy_VsfVsf(hvx_vec_splat_f32(s), hvx_vec_reduce_sum_f32(rsum)));
+    HVX_Vector rsum = HVX_OP_ADD_F32(Q6_V_lo_W(rsum_p), Q6_V_hi_W(rsum_p));
+    rsum = HVX_OP_MUL_F32(hvx_vec_splat_f32(s), hvx_vec_reduce_sum_f32(rsum));
     hvx_vec_store_u(r, 4, rsum);
 }
 
@@ -105,10 +115,10 @@ static inline HVX_Vector hvx_dot_f16_f16_aa_rx4(const void * restrict y,
         rsum3_p = hvx_vec_mpyacc_f32_f16(rsum3_p, x3_hf, y_hf);
     }
 
-    HVX_Vector rsum0 = Q6_Vsf_equals_Vqf32(Q6_Vqf32_vadd_VsfVsf(Q6_V_lo_W(rsum0_p), Q6_V_hi_W(rsum0_p)));
-    HVX_Vector rsum1 = Q6_Vsf_equals_Vqf32(Q6_Vqf32_vadd_VsfVsf(Q6_V_lo_W(rsum1_p), Q6_V_hi_W(rsum1_p)));
-    HVX_Vector rsum2 = Q6_Vsf_equals_Vqf32(Q6_Vqf32_vadd_VsfVsf(Q6_V_lo_W(rsum2_p), Q6_V_hi_W(rsum2_p)));
-    HVX_Vector rsum3 = Q6_Vsf_equals_Vqf32(Q6_Vqf32_vadd_VsfVsf(Q6_V_lo_W(rsum3_p), Q6_V_hi_W(rsum3_p)));
+    HVX_Vector rsum0 = HVX_OP_ADD_F32(Q6_V_lo_W(rsum0_p), Q6_V_hi_W(rsum0_p));
+    HVX_Vector rsum1 = HVX_OP_ADD_F32(Q6_V_lo_W(rsum1_p), Q6_V_hi_W(rsum1_p));
+    HVX_Vector rsum2 = HVX_OP_ADD_F32(Q6_V_lo_W(rsum2_p), Q6_V_hi_W(rsum2_p));
+    HVX_Vector rsum3 = HVX_OP_ADD_F32(Q6_V_lo_W(rsum3_p), Q6_V_hi_W(rsum3_p));
 
     HVX_Vector_x4 rsum0123 = { .v = { rsum0, rsum1, rsum2, rsum3 } };
     return hvx_vec_reduce_sum_f32x4(rsum0123);
@@ -123,7 +133,7 @@ static inline HVX_Vector hvx_dot_f16_f16_aa_rx32(const void * restrict y,
     const size_t nvec = n / VLEN_FP16; // num full fp16 hvx vectors
     const size_t nloe = n % VLEN_FP16; // leftover elements
 
-    HVX_Vector   sums;  // initialize at j = 0
+    HVX_Vector   sums = Q6_V_vzero();
     const size_t stride_x_4 = stride_x * 4;
     for (uint32_t j = 0; j < VLEN_FP32; j += 4) {
         HVX_Vector     sums_x4 = hvx_dot_f16_f16_aa_rx4(y, x, stride_x, nvec, nloe);
@@ -132,8 +142,7 @@ static inline HVX_Vector hvx_dot_f16_f16_aa_rx32(const void * restrict y,
         x += stride_x_4;
     }
 
-    sums = Q6_Vqf32_vmpy_VsfVsf(hvx_vec_splat_f32(s), sums);
-    return Q6_Vsf_equals_Vqf32(sums);
+    return HVX_OP_MUL_F32(hvx_vec_splat_f32(s), sums);
 }
 
 // MAD: y (F32) += x (F16) * s (F16)
@@ -268,11 +277,10 @@ static inline void hvx_scale_vec_f32_aa(uint8_t * restrict dst, const uint8_t *
     uint32_t i = 0;
     #pragma unroll(4)
     for (; i < nvec; ++i) {
-        vdst[i] = Q6_Vsf_equals_Vqf32(Q6_Vqf32_vmpy_VsfVsf(vsrc[i], vs));
+        vdst[i] = HVX_OP_MUL_F32(vsrc[i], vs);
     }
     if (nloe) {
-        HVX_Vector v = Q6_Vqf32_vmpy_VsfVsf(vsrc[i], vs);
-        hvx_vec_store_a(&vdst[i], nloe * sizeof(float), Q6_Vsf_equals_Vqf32(v));
+        hvx_vec_store_a(&vdst[i], nloe * sizeof(float), HVX_OP_MUL_F32(vsrc[i], vs));
     }
 }
 
@@ -438,25 +446,44 @@ static void flash_attn_ext_f16_thread(unsigned int nth, unsigned int ith, void *
             // Process in sub-blocks of 32 (VLEN_FP32)
             HVX_Vector sb_scores[FLASH_ATTN_BLOCK_SIZE / VLEN_FP32];
             HVX_Vector v_max = hvx_vec_splat_f32(-INFINITY);
-            for (uint32_t iv = 0; ic + VLEN_FP32 <= current_block_size; ic += VLEN_FP32, ++iv) {
+            for (uint32_t iv = 0; ic < current_block_size; ic += VLEN_FP32, ++iv) {
                 // 1. Compute scores
                 HVX_Vector scores = hvx_dot_f16_f16_aa_rx32(q_ptr_vtcm, k_base + ic * factx->size_k_row_padded, factx->size_k_row_padded, DK, factx->scale);
 
                 // 2. Softcap
                 if (factx->logit_softcap != 0.0f) {
                     scores = hvx_vec_tanh_f32(scores);
-                    scores = Q6_Vqf32_vmpy_VsfVsf(scores, logit_cap);
-                    scores = Q6_Vsf_equals_Vqf32(scores);
+                    scores = HVX_OP_MUL_F32(scores, logit_cap);
                 }
 
                 // 3. Mask
                 if (mask) {
                     const __fp16 * mp = m_base + ic;
                     HVX_Vector m_vals_f16 = *(const HVX_UVector *) mp;
-                    HVX_VectorPair m_vals_f32_pair = Q6_Wqf32_vmpy_VhfVhf(Q6_Vh_vshuff_Vh(m_vals_f16), slope_vec);
-                    HVX_Vector add_val = Q6_V_lo_W(m_vals_f32_pair);
-                    scores = Q6_Vqf32_vadd_Vqf32Vsf(add_val, scores);
-                    scores = Q6_Vsf_equals_Vqf32(scores);
+
+                    // Multiplying -INFINITY (0xFC00) by a slope in VhfVhf instructions can incorrectly produce NaN on v79.
+                    // Clamp -INFINITY to the max negative fp16 finite value (-65504.0f).
+                    HVX_Vector vinf = Q6_Vh_vsplat_R(0xFC00);
+                    HVX_Vector vmin = Q6_Vh_vsplat_R(0xFBFF);
+                    HVX_VectorPred is_inf = Q6_Q_vcmp_eq_VhVh(m_vals_f16, vinf);
+                    m_vals_f16 = Q6_V_vmux_QVV(is_inf, vmin, m_vals_f16);
+
+                    #if __HVX_ARCH__ >= 79
+                        HVX_VectorPair m_vals_f32_pair = Q6_Wsf_vmpy_VhfVhf(Q6_Vh_vshuff_Vh(m_vals_f16), slope_vec);
+                        HVX_Vector add_val = Q6_V_lo_W(m_vals_f32_pair);
+                        scores = Q6_Vsf_vadd_VsfVsf(add_val, scores);
+                    #else
+                        HVX_VectorPair m_vals_f32_pair = Q6_Wqf32_vmpy_VhfVhf(Q6_Vh_vshuff_Vh(m_vals_f16), slope_vec);
+                        HVX_Vector add_val = Q6_V_lo_W(m_vals_f32_pair);
+                        scores = Q6_Vsf_equals_Vqf32(Q6_Vqf32_vadd_Vqf32Vsf(add_val, scores));
+                    #endif
+                }
+
+                // Mask out invalid lanes for leftover handling
+                uint32_t valid_lanes = current_block_size - ic;
+                if (valid_lanes < VLEN_FP32) {
+                    HVX_VectorPred valid_pred = Q6_Q_vsetq_R(valid_lanes * 4); // 4 bytes per fp32 lane
+                    scores = Q6_V_vmux_QVV(valid_pred, scores, hvx_vec_splat_f32(-INFINITY));
                 }
 
                 sb_scores[iv] = scores;
@@ -466,78 +493,55 @@ static void flash_attn_ext_f16_thread(unsigned int nth, unsigned int ith, void *
             {
                 // 4. Online Softmax Update
                 HVX_Vector M_new_vec = Q6_Vsf_vmax_VsfVsf(v_max, M_vec);
-                HVX_Vector diff_vec  = Q6_Vsf_equals_Vqf32(Q6_Vqf32_vsub_VsfVsf(M_vec, M_new_vec));
+                HVX_Vector diff_vec  = HVX_OP_SUB_F32(M_vec, M_new_vec);
                 HVX_Vector ms_vec    = hvx_vec_exp_f32(diff_vec);
                 M_vec = M_new_vec;
 
                 hvx_scale_vec_f32_aa((uint8_t *) VKQ32, (const uint8_t *) VKQ32, DV, ms_vec);
 
                 HVX_Vector p_sum_vec = hvx_vec_splat_f32(0.0f);
-                for (uint32_t ic2 = 0, iv = 0; ic2 + VLEN_FP32 <= current_block_size; ic2 += VLEN_FP32, ++iv) {
+                for (uint32_t ic2 = 0, iv = 0; ic2 < current_block_size; ic2 += VLEN_FP32, ++iv) {
                     HVX_Vector scores = sb_scores[iv];
-                    HVX_Vector scores_shifted = Q6_Vqf32_vsub_VsfVsf(scores, M_vec);
-                    HVX_Vector P = hvx_vec_exp_f32(Q6_Vsf_equals_Vqf32(scores_shifted));
+                    HVX_Vector scores_shifted = HVX_OP_SUB_F32(scores, M_vec);
+                    HVX_Vector P = hvx_vec_exp_f32(scores_shifted);
 
-                    p_sum_vec = Q6_Vsf_equals_Vqf32(Q6_Vqf32_vadd_VsfVsf(p_sum_vec, P));
+                    p_sum_vec = HVX_OP_ADD_F32(p_sum_vec, P);
 
                     // 5. Accumulate V
                     __fp16 __attribute__((aligned(VLEN))) p_arr[VLEN_FP16];
                     hvx_vec_f32_to_f16_a(p_arr, P, hvx_vec_splat_f32(0));
 
+                    float __attribute__((aligned(128))) P_arr[VLEN_FP32];
+                    hvx_vec_store_a(P_arr, 128, P);
+
                     for (uint32_t j = 0; j < VLEN_FP32; j += 2) {
-                        const uint32_t  cur_ic = ic2 + j;
-                        const uint8_t * v_ptr  = v_base + cur_ic * factx->size_v_row_padded;
+                        const uint32_t cur_ic = ic2 + j;
+                        if (cur_ic >= current_block_size) {
+                            break;
+                        }
+
+                        if (cur_ic + 1 == current_block_size) {
+                            // Odd leftover, process single row
+                            if (P_arr[j] != 0.0f) {
+                                const uint8_t * v_ptr = v_base + cur_ic * factx->size_v_row_padded;
+                                hvx_mad_f32_f16_aa(VKQ32, v_ptr, (p_arr + j), DV);
+                            }
+                            break;
+                        }
+
+                        // Avoid NaN * 0.0 = NaN for uninitialized V cache rows.
+                        // Check the f32 values to safely avoid strict aliasing violations.
+                        if (P_arr[j] == 0.0f && P_arr[j + 1] == 0.0f) {
+                            continue;
+                        }
+
+                        const uint8_t * v_ptr = v_base + cur_ic * factx->size_v_row_padded;
                         hvx_mad_f32_f16_aa_rx2(VKQ32, v_ptr, v_ptr + factx->size_v_row_padded, (p_arr + j), (p_arr + j + 1), DV);
                     }
                 }
 
                 p_sum_vec = hvx_vec_reduce_sum_f32(p_sum_vec);
-                S_vec = Q6_Vsf_equals_Vqf32(Q6_Vqf32_vadd_VsfVsf(Q6_Vsf_equals_Vqf32(Q6_Vqf32_vmpy_VsfVsf(S_vec, ms_vec)), p_sum_vec));
-            }
-
-            if (ic < current_block_size) {
-                // Sync scalars for leftover/next block if needed
-                float M = hvx_vec_get_f32(M_vec);
-                float S = hvx_vec_get_f32(S_vec);
-
-                // Leftover
-                for (; ic < current_block_size; ++ic) {
-                    float s_val;
-                    const uint8_t * k_ptr = k_base + ic * factx->size_k_row_padded;
-                    hvx_dot_f16_f16_aa(&s_val, q_ptr_vtcm, k_ptr, DK, factx->scale);
-                    if (factx->logit_softcap != 0.0f) {
-                        s_val = factx->logit_softcap * tanhf(s_val);
-                    }
-
-                    if (mask) {
-                        const float m_val = m_base[ic];
-                        s_val += slope * m_val;
-                    }
-
-                    const float Mold = M;
-                    __fp16 vs = 1.0f;
-
-                    if (s_val > M) {
-                        M = s_val;
-                        HVX_Vector diff_vec = hvx_vec_splat_f32(Mold - M);
-                        HVX_Vector ms_vec   = hvx_vec_exp_f32(diff_vec);
-                        hvx_scale_vec_f32_aa((uint8_t *) VKQ32, (const uint8_t *) VKQ32, DV, ms_vec);
-
-                        float ms = hvx_vec_get_f32(ms_vec);
-                        S = S * ms + vs;
-                    } else {
-                        HVX_Vector diff_vec = hvx_vec_splat_f32(s_val - M);
-                        vs = hvx_vec_get_f32(hvx_vec_exp_f32(diff_vec));
-                        S += vs;
-                    }
-
-                    const uint8_t * v_ptr = v_base + ic * factx->size_v_row_padded;
-
-                    hvx_mad_f32_f16_aa(VKQ32, v_ptr, &vs, DV);
-                }
-
-                M_vec = hvx_vec_splat_f32(M);
-                S_vec = hvx_vec_splat_f32(S);
+                S_vec = HVX_OP_ADD_F32(HVX_OP_MUL_F32(S_vec, ms_vec), p_sum_vec);
             }
 
             // Issue DMA for next+1 block (if exists)
@@ -599,8 +603,9 @@ static void flash_attn_ext_f16_thread(unsigned int nth, unsigned int ith, void *
         const int i2 = iq2;
         const int i3 = iq3;
 
-        // dst is permuted
-        uint8_t * dst_ptr = (uint8_t *) dst->data + (i3*ne2*ne1 + i2 + i1*ne1) * nb1;
+        // dst is permuted: [DV, n_heads, n_tokens, n_seq]
+        // head stride is nb[1], token stride is nb[2], batch stride is nb[3]
+        uint8_t * dst_ptr = (uint8_t *) dst->data + i2 * dst->nb[1] + i1 * dst->nb[2] + i3 * dst->nb[3];
 
         if (dst->type == HTP_TYPE_F32) {
             hvx_copy_f32_ua(dst_ptr, (uint8_t *) VKQ32, DV);
@@ -623,8 +628,8 @@ int op_flash_attn_ext(struct htp_ops_context * octx) {
     }
 
 #ifdef HTP_HAS_HMX
-    // HMX path: prefill (neq1 >= 32), head_dim multiple of 32, F16 KV
-    if (k->type == HTP_TYPE_F16 && v->type == HTP_TYPE_F16 && k->ne[0] % 32 == 0 && q->ne[1] >= 32) {
+    // HMX path: head_dim multiple of 32, F16 KV
+    if (k->type == HTP_TYPE_F16 && v->type == HTP_TYPE_F16 && k->ne[0] % 32 == 0) {
         int ret = hmx_flash_attn_ext(octx);
         if (ret == HTP_STATUS_OK) {
             return ret;
diff --git a/ggml/src/ggml-hexagon/htp/hmx-flash-attn-ops.c b/ggml/src/ggml-hexagon/htp/hmx-flash-attn-ops.c
index a496f6289ae..f132c08500d 100644
--- a/ggml/src/ggml-hexagon/htp/hmx-flash-attn-ops.c
+++ b/ggml/src/ggml-hexagon/htp/hmx-flash-attn-ops.c
@@ -1248,9 +1248,6 @@ int hmx_flash_attn_ext(struct htp_ops_context * octx) {
     if (DK % 32 != 0 || DV % 32 != 0) {
         return HTP_STATUS_NO_SUPPORT;
     }
-    if (neq1 < 32) {
-        return HTP_STATUS_NO_SUPPORT;
-    }
 
     // GQA factor
     const uint32_t n_kv_heads = k->ne[2];
diff --git a/ggml/src/ggml-hexagon/htp/hmx-matmul-ops.c b/ggml/src/ggml-hexagon/htp/hmx-matmul-ops.c
index ab5fd73380b..083d125882d 100644
--- a/ggml/src/ggml-hexagon/htp/hmx-matmul-ops.c
+++ b/ggml/src/ggml-hexagon/htp/hmx-matmul-ops.c
@@ -16,6 +16,7 @@
 #include "ggml-common.h"
 
 #include "hex-dma.h"
+#include "hex-fastdiv.h"
 #include "worker-pool.h"
 
 #include "hvx-utils.h"
@@ -187,45 +188,44 @@ static int hmx_compute_chunks(size_t   vtcm_total,
 // In x4x2, sub-blocks 0..3 use lower nibbles, sub-blocks 4..7 use upper nibbles
 // of the same 32 packed bytes.
 static inline HVX_Vector dequantize_x4x2_q4_0_group_hvx(const uint8_t *packed_32, bool upper_nibbles, const __fp16 *scale, const HVX_Vector vlut_cvt) {
+    (void)vlut_cvt;
     HVX_Vector vq = hvx_vmemu(packed_32);
     const HVX_Vector mask_h4 = Q6_Vb_vsplat_R(0x0F);
+    const HVX_Vector i8 = Q6_Vb_vsplat_R(8);
     HVX_Vector v_scales = hvx_vec_repl_f16(hvx_vmemu(scale));
-    // q4x4x2 stores two int4 values per byte. Keep only the selected nibble.
-    HVX_Vector v_quants =  Q6_Vub_vlsr_VubR(vq, 4 * upper_nibbles);
+
+    HVX_Vector v_quants = Q6_Vub_vlsr_VubR(vq, 4 * upper_nibbles);
     v_quants = Q6_V_vand_VV(v_quants, mask_h4);
-    // Shuffle before LUT
-    v_quants = Q6_Vb_vshuff_Vb(v_quants);
-    // Use standard vlut16 (not _nomatch) to avoid stale-register NaN.
-    // _nomatch retains the previous destination-register value for colliding
-    // indices, but the C intrinsic doesn't model the implicit read so the
-    // compiler may allocate a register containing garbage/NaN.
-    HVX_VectorPair vp = Q6_Wh_vlut16_VbVhR(v_quants, vlut_cvt, 0);
-    HVX_Vector v_hf = Q6_V_lo_W(vp);
+
+    HVX_Vector v_int8 = Q6_Vb_vsub_VbVb(v_quants, i8);
+    HVX_Vector v0     = Q6_V_lo_W(Q6_Wh_vunpack_Vb(v_int8));
+    HVX_Vector v_hf   = Q6_Vhf_equals_Vh(v0);
 
     return Q6_Vhf_equals_Vqf16(Q6_Vqf16_vmpy_VhfVhf(v_hf, v_scales));
 }
 
 // Batch-dequantize 4 contiguous x4x2 Q4_0 groups (4x32 = 128 packed bytes) using
-// full HVX vector width.  One vmemu + one vlut16 replaces 4 separate calls.
+// full HVX vector width.
 // Output: vector_x2 each hold 32 FP16 values in the first 64 bytes.
 static inline HVX_Vector_x2 dequantize_x4x2_q4_0_x4groups_hvx(
             const uint8_t *packed_128, bool upper_nibbles,
             const __fp16 *scales_4, const HVX_Vector vlut_cvt) {
-    // Load all 128 packed bytes (4 contiguous 32-byte groups)
+    (void)vlut_cvt;
     HVX_Vector vq = hvx_vmemu(packed_128);
     const HVX_Vector mask_h4 = Q6_Vb_vsplat_R(0x0F);
+    const HVX_Vector i8 = Q6_Vb_vsplat_R(8);
     HVX_Vector v_quants = Q6_Vub_vlsr_VubR(vq, 4 * upper_nibbles);
     v_quants = Q6_V_vand_VV(v_quants, mask_h4);
 
-    // Shuffle before LUT
-    v_quants = Q6_Vb_vshuff_Vb(v_quants);
+    HVX_Vector v_int8 = Q6_Vb_vsub_VbVb(v_quants, i8);
 
-    // Full-width vlut16: 128 byte lookups -> 128 fp16 results in a VectorPair
-    HVX_VectorPair vp = Q6_Wh_vlut16_VbVhR(v_quants, vlut_cvt, 0);
-    HVX_Vector v_lo = Q6_V_lo_W(vp);  // [group0: 32 fp16 | group1: 32 fp16]
-    HVX_Vector v_hi = Q6_V_hi_W(vp);  // [group2: 32 fp16 | group3: 32 fp16]
+    HVX_VectorPair vp_int16 = Q6_Wh_vunpack_Vb(v_int8);
+    HVX_Vector v_lo = Q6_V_lo_W(vp_int16);
+    HVX_Vector v_hi = Q6_V_hi_W(vp_int16);
+
+    v_lo = Q6_Vhf_equals_Vh(v_lo);
+    v_hi = Q6_Vhf_equals_Vh(v_hi);
 
-    // Build per-group scale vectors: first 64 bytes use scale_a, last 64 use scale_b
     HVX_Vector vscale = hvx_vmemu(scales_4);
     HVX_Vector v_sc01 = hvx_vec_repl_2x_f16(vscale);
     HVX_Vector v_sc23 = hvx_vec_repl_2x_f16(Q6_V_vror_VR(vscale, 4));
@@ -233,13 +233,12 @@ static inline HVX_Vector_x2 dequantize_x4x2_q4_0_x4groups_hvx(
     v_lo = Q6_Vhf_equals_Vqf16(Q6_Vqf16_vmpy_VhfVhf(v_lo, v_sc01));
     v_hi = Q6_Vhf_equals_Vqf16(Q6_Vqf16_vmpy_VhfVhf(v_hi, v_sc23));
 
-    // Extract individual groups: scatter uses q_mask64 so only first 64 bytes matter
-    HVX_Vector_x2 r = { v_lo,/* group1 already in [0:63] */
-                        v_hi /* group2 already in [0:63] */ };
+    HVX_Vector_x2 r = { v_lo, v_hi };
     return r;
 }
 
 static inline HVX_Vector dequantize_x4x2_q4_1_group_hvx(const uint8_t *packed_32, bool upper_nibbles, const __fp16 *scale_offset, const HVX_Vector vlut_cvt) {
+    (void)vlut_cvt;
     HVX_Vector vq = hvx_vmemu(packed_32);
     const HVX_Vector mask_h4 = Q6_Vb_vsplat_R(0x0F);
     HVX_Vector v_dm = hvx_vmemu(scale_offset);
@@ -248,9 +247,9 @@ static inline HVX_Vector dequantize_x4x2_q4_1_group_hvx(const uint8_t *packed_32
 
     HVX_Vector v_quants =  Q6_Vub_vlsr_VubR(vq, 4 * upper_nibbles);
     v_quants = Q6_V_vand_VV(v_quants, mask_h4);
-    v_quants = Q6_Vb_vshuff_Vb(v_quants);
-    HVX_VectorPair vp = Q6_Wh_vlut16_VbVhR(v_quants, vlut_cvt, 0);
-    HVX_Vector v_hf = Q6_V_lo_W(vp);
+
+    HVX_Vector v0   = Q6_V_lo_W(Q6_Wh_vunpack_Vb(v_quants));
+    HVX_Vector v_hf = Q6_Vhf_equals_Vh(v0);
 
     return Q6_Vhf_equals_Vqf16(Q6_Vqf16_vadd_Vqf16Vhf(Q6_Vqf16_vmpy_VhfVhf(v_hf, v_scales), v_offsets));
 }
@@ -258,16 +257,18 @@ static inline HVX_Vector dequantize_x4x2_q4_1_group_hvx(const uint8_t *packed_32
 static inline HVX_Vector_x2 dequantize_x4x2_q4_1_x4groups_hvx(
             const uint8_t *packed_128, bool upper_nibbles,
             const __fp16 *scales_offsets_4, const HVX_Vector vlut_cvt) {
+    (void)vlut_cvt;
     HVX_Vector vq = hvx_vmemu(packed_128);
     const HVX_Vector mask_h4 = Q6_Vb_vsplat_R(0x0F);
     HVX_Vector v_quants = Q6_Vub_vlsr_VubR(vq, 4 * upper_nibbles);
     v_quants = Q6_V_vand_VV(v_quants, mask_h4);
 
-    v_quants = Q6_Vb_vshuff_Vb(v_quants);
+    HVX_VectorPair vp_int16 = Q6_Wh_vunpack_Vb(v_quants);
+    HVX_Vector v_lo = Q6_V_lo_W(vp_int16);
+    HVX_Vector v_hi = Q6_V_hi_W(vp_int16);
 
-    HVX_VectorPair vp = Q6_Wh_vlut16_VbVhR(v_quants, vlut_cvt, 0);
-    HVX_Vector v_lo = Q6_V_lo_W(vp);
-    HVX_Vector v_hi = Q6_V_hi_W(vp);
+    v_lo = Q6_Vhf_equals_Vh(v_lo);
+    v_hi = Q6_Vhf_equals_Vh(v_hi);
 
     HVX_Vector vscale_offset = hvx_vmemu(scales_offsets_4);
     HVX_VectorPair dm_deal = Q6_W_vdeal_VVR(vscale_offset, vscale_offset, -2);
@@ -287,6 +288,45 @@ static inline HVX_Vector_x2 dequantize_x4x2_q4_1_x4groups_hvx(
     return r;
 }
 
+// LUT-based dequantizers for non-linear IQ4_NL format.
+static inline HVX_Vector dequantize_x4x2_iq4_nl_group_hvx(const uint8_t *packed_32, bool upper_nibbles, const __fp16 *scale, const HVX_Vector vlut_cvt) {
+    HVX_Vector vq = hvx_vmemu(packed_32);
+    const HVX_Vector mask_h4 = Q6_Vb_vsplat_R(0x0F);
+    HVX_Vector v_scales = hvx_vec_repl_f16(hvx_vmemu(scale));
+    HVX_Vector v_quants =  Q6_Vub_vlsr_VubR(vq, 4 * upper_nibbles);
+    v_quants = Q6_V_vand_VV(v_quants, mask_h4);
+    v_quants = Q6_Vb_vshuff_Vb(v_quants);
+    HVX_VectorPair vp = Q6_Wh_vlut16_VbVhR(v_quants, vlut_cvt, 0);
+    HVX_Vector v_hf = Q6_V_lo_W(vp);
+
+    return Q6_Vhf_equals_Vqf16(Q6_Vqf16_vmpy_VhfVhf(v_hf, v_scales));
+}
+
+static inline HVX_Vector_x2 dequantize_x4x2_iq4_nl_x4groups_hvx(
+            const uint8_t *packed_128, bool upper_nibbles,
+            const __fp16 *scales_4, const HVX_Vector vlut_cvt) {
+    HVX_Vector vq = hvx_vmemu(packed_128);
+    const HVX_Vector mask_h4 = Q6_Vb_vsplat_R(0x0F);
+    HVX_Vector v_quants = Q6_Vub_vlsr_VubR(vq, 4 * upper_nibbles);
+    v_quants = Q6_V_vand_VV(v_quants, mask_h4);
+
+    v_quants = Q6_Vb_vshuff_Vb(v_quants);
+
+    HVX_VectorPair vp = Q6_Wh_vlut16_VbVhR(v_quants, vlut_cvt, 0);
+    HVX_Vector v_lo = Q6_V_lo_W(vp);
+    HVX_Vector v_hi = Q6_V_hi_W(vp);
+
+    HVX_Vector vscale = hvx_vmemu(scales_4);
+    HVX_Vector v_sc01 = hvx_vec_repl_2x_f16(vscale);
+    HVX_Vector v_sc23 = hvx_vec_repl_2x_f16(Q6_V_vror_VR(vscale, 4));
+
+    v_lo = Q6_Vhf_equals_Vqf16(Q6_Vqf16_vmpy_VhfVhf(v_lo, v_sc01));
+    v_hi = Q6_Vhf_equals_Vqf16(Q6_Vqf16_vmpy_VhfVhf(v_hi, v_sc23));
+
+    HVX_Vector_x2 r = { v_lo, v_hi };
+    return r;
+}
+
 // Dequantize one x4x2 Q8_0 group (32 int8 quants) -> 32 FP16 in first 64 bytes.
 static inline HVX_Vector dequantize_x4x2_q8_0_group_hvx(const int8_t *quants_32, const __fp16 *scale) {
     HVX_Vector vq       = hvx_vmemu(quants_32);
@@ -374,122 +414,176 @@ static inline HVX_Vector_x4 dequantize_x4x2_mxfp4_x4groups_hvx(const uint8_t *
     return r;
 }
 
+typedef struct {  // job description passed (as void *data) to the x4x2 dequantize worker loops
+    __fp16                  *dst;  // output base; tile t starts at dst + t * HMX_FP16_TILE_N_ELMS
+    const uint8_t           *src;  // x4x2 weight rows, row_stride bytes apart
+    int                      n_cols;  // row count of src; the single-tile paths zero-fill odd rows at or beyond it
+    int                      k_block;  // K extent of the chunk; the per-row quant byte size is derived from it
+    size_t                   row_stride;  // byte stride between consecutive rows of src
+    int                      weight_type;  // HTP_TYPE_* of the quantized weights
+    int                      n_tot_tiles;  // total tiles to produce (column tiles * n_k_tiles)
+    int                      n_tiles_per_task;  // tiles covered by one task
+    int                      n_tasks;  // number of tasks the worker loops iterate over
+    int                      n_k_tiles;  // K tiles per column tile
+    struct fastdiv_values    n_k_tiles_div;  // precomputed fastdiv constants for dividing by n_k_tiles
+} x4x2_dequantize_state_t;
+
 // Dequantize a tile range from x4x2 weight data (already in VTCM) to tile-major FP16.
 // Input:  vtcm_src has n_cols rows of x4x2 data, each row_stride bytes.
 // Output: vtcm_dst in tile-major FP16 layout.
-static void dequantize_x4x2_weight_to_fp16_tiles_task(
-        __fp16 *restrict vtcm_dst,
-        const uint8_t *restrict vtcm_src,
-        int n_cols, int k_block,
-        size_t row_stride, int weight_type,
-        int start_tile, int end_tile) {
-
-    const int n_k_tiles = (unsigned)k_block / HMX_FP16_TILE_N_COLS;
-    const bool is_q4 = (weight_type == HTP_TYPE_Q4_0 || weight_type == HTP_TYPE_Q4_1 || weight_type == HTP_TYPE_IQ4_NL);
-    const bool is_q4_1 = (weight_type == HTP_TYPE_Q4_1);
-    const int qrow_size = is_q4 ? ((unsigned)k_block / 2) : k_block;
-
-    const HVX_Vector vlut_cvt = (weight_type == HTP_TYPE_IQ4_NL) ? hvx_vmem(iq4_nl_to_fp16_lut) :
-                                (weight_type == HTP_TYPE_MXFP4)  ? hvx_vmem(mxfp4_to_fp16_lut) :
-                                (weight_type == HTP_TYPE_Q4_1)   ? hvx_vmem(q4_1_to_fp16_lut) :
-                                                                   hvx_vmem(q4_0_to_fp16_lut);
 
-    // vscatter setup: write dequantized K-values directly to transposed [K][N] tile positions.
-    // Each int32 element holds a K-row-pair (2 adjacent fp16 values).  word[i] at offset i*128
-    // maps to K-rows 2i and 2i+1.  Column offset (n*4) added per row.
-    const HVX_Vector v_scat_base  = hvx_vmem(hmx_transpose_scatter_offsets);
-    const HVX_Vector v_scat_step  = Q6_V_vsplat_R(4);  // 4 bytes = 1 column step
-    const HVX_VectorPred q_mask64 = Q6_Q_vsetq_R(64);  // first 16 words (64 bytes)
-
-    unsigned ct = (unsigned)start_tile / n_k_tiles;  // column tile index
-    unsigned kt = (unsigned)start_tile % n_k_tiles;  // K tile index
-    for (unsigned t = start_tile; t < end_tile; ) {
-        if (kt >= n_k_tiles) { kt = 0; ct++; }
-
-        // --- Batch-4 fast path for Q4: process 4 contiguous K-tiles with one vlut16 per row ---
-        if (is_q4 && (kt % 4 == 0) && (t + 4 <= end_tile) && ((t + 3) / n_k_tiles == ct)) {
-            unsigned blk_idx      = (kt * 32) / QK_Q4_0x4x2;
-            unsigned sub_blk_base = ((kt * 32) % QK_Q4_0x4x2) / 32;  // 0 or 4
-            bool upper            = (sub_blk_base >= 4);
-            unsigned packed_off   = blk_idx * (QK_Q4_0x4x2 / 2);     // 128 contiguous packed bytes
-            unsigned dblk_size    = is_q4_1 ? 32 : HMX_X4X2_DBLK_SIZE;
-            unsigned scale_step   = is_q4_1 ? 4 : (int)sizeof(__fp16);
-            unsigned scale_off    = qrow_size + blk_idx * dblk_size
-                                  + sub_blk_base * scale_step;
-
-            __fp16 *tile_bases[4];
-            for (unsigned g = 0; g < 4; g++) { tile_bases[g] = vtcm_dst + (t + g) * HMX_FP16_TILE_N_ELMS; }
-
-            HVX_Vector v_off = v_scat_base;
-
-            unsigned row_offset = ct * HMX_FP16_TILE_N_COLS * row_stride;
-            unsigned row1 = ct * HMX_FP16_TILE_N_COLS + 1;
-
-            if (is_q4_1) {
-                for (int r = 0; r < HMX_FP16_TILE_N_ROWS; r += 2, row1 += 2) {
-                    const uint8_t *r0 = vtcm_src + row_offset; row_offset += row_stride;
-                    const uint8_t *r1 = vtcm_src + row_offset; row_offset += row_stride;
-
-                    HVX_Vector_x2 dv0 = dequantize_x4x2_q4_1_x4groups_hvx(r0 + packed_off, upper, (const __fp16 *)(r0 + scale_off), vlut_cvt);
-                    HVX_Vector_x2 dv1 = dequantize_x4x2_q4_1_x4groups_hvx(r1 + packed_off, upper, (const __fp16 *)(r1 + scale_off), vlut_cvt);
+#define DEFINE_DEQUANTIZE_Q4_TASK(suffix, lut_name, helper_prefix, dblk_size, scale_step)                      \
+static void dequantize_x4x2_weight_to_fp16_tiles_task_##suffix(                                                \
+        const x4x2_dequantize_state_t *state,                                                                  \
+        int start_tile, int end_tile) {                                                                        \
+    /* Dequantize tiles [start_tile, end_tile) of this Q4-style x4x2 type into tile-major FP16. */             \
+    const int n_k_tiles = state->n_k_tiles;                                                                    \
+    const int qrow_size = (unsigned)state->k_block / 2;                                                        \
+    const struct fastdiv_values n_k_tiles_div = state->n_k_tiles_div;                                          \
+    const HVX_Vector vlut_cvt = hvx_vmem(lut_name);                                                            \
+    /* vscatter setup: write dequantized K-values directly to transposed [K][N] tile positions. */             \
+    const HVX_Vector v_scat_base  = hvx_vmem(hmx_transpose_scatter_offsets);                                   \
+    const HVX_Vector v_scat_step  = Q6_V_vsplat_R(4);  /* 4 bytes = 1 column step */                           \
+    const HVX_VectorPred q_mask64 = Q6_Q_vsetq_R(64);  /* first 16 words (64 bytes) */                         \
+                                                                                                               \
+    unsigned ct = fastdiv((unsigned)start_tile, &n_k_tiles_div);  /* column tile index */                      \
+    unsigned kt = fastmodulo((unsigned)start_tile, n_k_tiles, &n_k_tiles_div);  /* K tile index */             \
+                                                                                                               \
+    for (unsigned t = start_tile; t < (unsigned)end_tile; ) {                                                  \
+        if (kt >= (unsigned)n_k_tiles) { kt = 0; ct++; }  /* wrap to next column tile */                       \
+        /* Batch-4 fast path: 4 K-tiles of one column tile, starting at a multiple of 4. */                    \
+        if ((kt % 4 == 0) && (t + 4 <= (unsigned)end_tile) && (fastdiv(t + 3, &n_k_tiles_div) == ct)) {        \
+            unsigned blk_idx      = ((kt * 32) / QK_Q4_0x4x2);                                                 \
+            unsigned sub_blk_base = ((kt * 32) % QK_Q4_0x4x2) / 32;  /* 0 or 4 */                              \
+            bool upper            = (sub_blk_base >= 4);                                                       \
+            unsigned packed_off   = blk_idx * (QK_Q4_0x4x2 / 2);  /* 128 contiguous packed bytes */            \
+            unsigned scale_off    = qrow_size + blk_idx * (dblk_size) + sub_blk_base * (scale_step);           \
+            /* dblk_size / scale_step: per-block and per-sub-block byte strides used in scale_off. */          \
+            __fp16 *tile_bases[4];                                                                             \
+            for (unsigned g = 0; g < 4; g++) {                                                                 \
+                tile_bases[g] = state->dst + (t + g) * HMX_FP16_TILE_N_ELMS;                                   \
+            }                                                                                                  \
+                                                                                                               \
+            HVX_Vector v_off = v_scat_base;                                                                    \
+            unsigned row_offset = ct * HMX_FP16_TILE_N_COLS * state->row_stride;                               \
+            /* Two source rows per iteration; v_off advances one column step after each row. */                \
+            for (int r = 0; r < HMX_FP16_TILE_N_ROWS; r += 2) {                                                \
+                const uint8_t *r0 = state->src + row_offset; row_offset += state->row_stride;                  \
+                const uint8_t *r1 = state->src + row_offset; row_offset += state->row_stride;                  \
+                                                                                                               \
+                HVX_Vector_x2 dv0 = dequantize_x4x2_##helper_prefix##_x4groups_hvx(                            \
+                    r0 + packed_off, upper, (const __fp16 *)(r0 + scale_off), vlut_cvt);                       \
+                Q6_vscatter_RMVwV((size_t)tile_bases[0], 2 * HMX_FP16_TILE_SIZE - 1, v_off, dv0.v[0]);         \
+                Q6_vscatter_RMVwV((size_t)tile_bases[2], 2 * HMX_FP16_TILE_SIZE - 1, v_off, dv0.v[1]);         \
+                v_off = Q6_Vw_vadd_VwVw(v_off, v_scat_step);                                                   \
+                                                                                                               \
+                HVX_Vector_x2 dv1 = dequantize_x4x2_##helper_prefix##_x4groups_hvx(                            \
+                    r1 + packed_off, upper, (const __fp16 *)(r1 + scale_off), vlut_cvt);                       \
+                Q6_vscatter_RMVwV((size_t)tile_bases[0], 2 * HMX_FP16_TILE_SIZE - 1, v_off, dv1.v[0]);         \
+                Q6_vscatter_RMVwV((size_t)tile_bases[2], 2 * HMX_FP16_TILE_SIZE - 1, v_off, dv1.v[1]);         \
+                v_off = Q6_Vw_vadd_VwVw(v_off, v_scat_step);                                                   \
+            }                                                                                                  \
+            /* Read back each tile so pending scatters are retired before moving on. */                        \
+            for (int g = 0; g < 4; g++) { (void) *(volatile HVX_Vector *)(tile_bases[g]); }                    \
+            t += 4; kt += 4;                                                                                   \
+            continue;                                                                                          \
+        }                                                                                                      \
+        /* Single-tile fallback. */                                                                            \
+        __fp16 *tile_base = state->dst + t * HMX_FP16_TILE_N_ELMS;                                             \
+        {                                                                                                      \
+            unsigned blk_idx   = (kt * 32) / QK_Q4_0x4x2;                                                      \
+            unsigned sub_blk   = ((kt * 32) % QK_Q4_0x4x2) / 32;                                               \
+            bool upper         = (sub_blk >= 4);                                                               \
+            unsigned byte_off  = blk_idx * (QK_Q4_0x4x2 / 2) + (upper ? (sub_blk - 4) : sub_blk) * 32;         \
+            unsigned scale_off = qrow_size + blk_idx * (dblk_size) + sub_blk * (scale_step);                   \
+                                                                                                               \
+            HVX_Vector v_off = v_scat_base;  /* reset to column 0 */                                           \
+            unsigned row_offset = ct * HMX_FP16_TILE_N_COLS * state->row_stride;                               \
+            unsigned row1 = ct * HMX_FP16_TILE_N_COLS + 1;                                                     \
+                                                                                                               \
+            for (int r = 0; r < HMX_FP16_TILE_N_ROWS; r += 2, row1 += 2) {                                     \
+                const uint8_t *r0 = state->src + row_offset; row_offset += state->row_stride;                  \
+                const uint8_t *r1 = state->src + row_offset; row_offset += state->row_stride;                  \
+                                                                                                               \
+                HVX_Vector v0 = dequantize_x4x2_##helper_prefix##_group_hvx(                                   \
+                    r0 + byte_off, upper, (const __fp16 *)(r0 + scale_off), vlut_cvt);                         \
+                HVX_Vector v1 = (row1 < (unsigned)state->n_cols)                                               \
+                    ? dequantize_x4x2_##helper_prefix##_group_hvx(                                             \
+                        r1 + byte_off, upper, (const __fp16 *)(r1 + scale_off), vlut_cvt)                      \
+                    : Q6_V_vzero();                                                                            \
+                                                                                                               \
+                Q6_vscatter_QRMVwV(q_mask64, (size_t)tile_base, HMX_FP16_TILE_SIZE - 1, v_off, v0);            \
+                v_off = Q6_Vw_vadd_VwVw(v_off, v_scat_step);                                                   \
+                Q6_vscatter_QRMVwV(q_mask64, (size_t)tile_base, HMX_FP16_TILE_SIZE - 1, v_off, v1);            \
+                v_off = Q6_Vw_vadd_VwVw(v_off, v_scat_step);                                                   \
+            }                                                                                                  \
+            (void) *(volatile HVX_Vector *)(tile_base);                                                        \
+        }                                                                                                      \
+        ++t; ++kt;                                                                                             \
+    }                                                                                                          \
+    /* Drain HVX scatter write buffer: a vmem load on this HW thread retires pending scatters. */              \
+    if (start_tile < end_tile) {                                                                               \
+        (void) *(volatile HVX_Vector *)(state->dst + (end_tile - 1) * HMX_FP16_TILE_N_ELMS);                   \
+    }                                                                                                          \
+}                                                                                                              \
+/* Pool callback: starting at task i and stepping by n, run each task's tile range. */                         \
+static void dequantize_x4x2_worker_loop_##suffix(unsigned int n, unsigned int i, void *data) {                 \
+    x4x2_dequantize_state_t *state = (x4x2_dequantize_state_t *)data;                                          \
+    for (unsigned int task_id = i; task_id < (unsigned int)state->n_tasks; task_id += n) {                     \
+        int start = task_id * state->n_tiles_per_task;                                                         \
+        int end   = hex_smin(start + state->n_tiles_per_task, state->n_tot_tiles);                             \
+        dequantize_x4x2_weight_to_fp16_tiles_task_##suffix(state, start, end);                                 \
+    }                                                                                                          \
+}
 
-                    Q6_vscatter_RMVwV((size_t)tile_bases[0], 2 * HMX_FP16_TILE_SIZE - 1, v_off, dv0.v[0]);
-                    Q6_vscatter_RMVwV((size_t)tile_bases[2], 2 * HMX_FP16_TILE_SIZE - 1, v_off, dv0.v[1]);
-                    v_off = Q6_Vw_vadd_VwVw(v_off, v_scat_step);
+DEFINE_DEQUANTIZE_Q4_TASK(q4_0,   q4_0_to_fp16_lut,   q4_0, HMX_X4X2_DBLK_SIZE, (int)sizeof(__fp16))  // args: suffix, lut_name, helper_prefix, dblk_size, scale_step
+DEFINE_DEQUANTIZE_Q4_TASK(q4_1,   q4_1_to_fp16_lut,   q4_1, 32, 4)  // Q4_1: 32-byte scale block, 4-byte step per 32-quant sub-block
+DEFINE_DEQUANTIZE_Q4_TASK(iq4_nl, iq4_nl_to_fp16_lut, iq4_nl, HMX_X4X2_DBLK_SIZE, (int)sizeof(__fp16))  // same scale-layout arguments as Q4_0, IQ4_NL LUT and helpers
 
-                    Q6_vscatter_RMVwV((size_t)tile_bases[0], 2 * HMX_FP16_TILE_SIZE - 1, v_off, dv1.v[0]);
-                    Q6_vscatter_RMVwV((size_t)tile_bases[2], 2 * HMX_FP16_TILE_SIZE - 1, v_off, dv1.v[1]);
-                    v_off = Q6_Vw_vadd_VwVw(v_off, v_scat_step);
-                }
-            } else {
-                for (int r = 0; r < HMX_FP16_TILE_N_ROWS; r += 2, row1 += 2) {
-                    const uint8_t *r0 = vtcm_src + row_offset; row_offset += row_stride;
-                    const uint8_t *r1 = vtcm_src + row_offset; row_offset += row_stride;
+static void dequantize_x4x2_weight_to_fp16_tiles_task_mxfp4(
+        const x4x2_dequantize_state_t *state,
+        int start_tile, int end_tile) {
 
-                    HVX_Vector_x2 dv0 = dequantize_x4x2_q4_0_x4groups_hvx(r0 + packed_off, upper, (const __fp16 *)(r0 + scale_off), vlut_cvt);
-                    HVX_Vector_x2 dv1 = dequantize_x4x2_q4_0_x4groups_hvx(r1 + packed_off, upper, (const __fp16 *)(r1 + scale_off), vlut_cvt);
+    const int n_k_tiles = state->n_k_tiles;
+    const int qrow_size = state->k_block;
+    const struct fastdiv_values n_k_tiles_div = state->n_k_tiles_div;
+    const HVX_Vector vlut_cvt = hvx_vmem(mxfp4_to_fp16_lut);
 
-                    Q6_vscatter_RMVwV((size_t)tile_bases[0], 2 * HMX_FP16_TILE_SIZE - 1, v_off, dv0.v[0]);
-                    Q6_vscatter_RMVwV((size_t)tile_bases[2], 2 * HMX_FP16_TILE_SIZE - 1, v_off, dv0.v[1]);
-                    v_off = Q6_Vw_vadd_VwVw(v_off, v_scat_step);
+    const HVX_Vector v_scat_base  = hvx_vmem(hmx_transpose_scatter_offsets);
+    const HVX_Vector v_scat_step  = Q6_V_vsplat_R(4);
+    const HVX_VectorPred q_mask64 = Q6_Q_vsetq_R(64);
 
-                    Q6_vscatter_RMVwV((size_t)tile_bases[0], 2 * HMX_FP16_TILE_SIZE - 1, v_off, dv1.v[0]);
-                    Q6_vscatter_RMVwV((size_t)tile_bases[2], 2 * HMX_FP16_TILE_SIZE - 1, v_off, dv1.v[1]);
-                    v_off = Q6_Vw_vadd_VwVw(v_off, v_scat_step);
-                }
-            }
+    unsigned ct = fastdiv((unsigned)start_tile, &n_k_tiles_div);
+    unsigned kt = fastmodulo((unsigned)start_tile, n_k_tiles, &n_k_tiles_div);
 
-            for (int g = 0; g < 4; g++) { (void) *(volatile HVX_Vector *)(tile_bases[g]); }
-            t += 4; kt += 4;
-            continue;
-        }
+    for (unsigned t = start_tile; t < (unsigned)end_tile; ) {
+        if (kt >= (unsigned)n_k_tiles) { kt = 0; ct++; }
 
-        // --- Batch-4 fast path for MXFP4: same nibble layout but E8M0 scales ---
-        if (weight_type == HTP_TYPE_MXFP4 && (kt % 4 == 0) && (t + 4 <= end_tile) && ((t + 3) / n_k_tiles == ct)) {
+        // Batch-4 fast path for MXFP4
+        if ((kt % 4 == 0) && (t + 4 <= (unsigned)end_tile) && (fastdiv(t + 3, &n_k_tiles_div) == ct)) {
             int  blk_idx      = (kt * 32) / QK_MXFP4x4x2;
-            int  sub_blk_base = ((kt * 32) % QK_MXFP4x4x2) / 32;                 // 0 or 4
+            int  sub_blk_base = ((kt * 32) % QK_MXFP4x4x2) / 32;
             bool upper        = (sub_blk_base >= 4);
-            int  packed_off   = blk_idx * (QK_MXFP4x4x2 / 2);                    // 128 contiguous packed bytes
-            int  e8m0_blk_off = qrow_size + blk_idx * HMX_X4X2_MXFP4_EBLK_SIZE;  // all 8 E8M0 scales
+            int  packed_off   = blk_idx * (QK_MXFP4x4x2 / 2);
+            int  e8m0_blk_off = qrow_size + blk_idx * HMX_X4X2_MXFP4_EBLK_SIZE;
 
             __fp16 * tile_bases[4];
             for (int g = 0; g < 4; g++) {
-                tile_bases[g] = vtcm_dst + (t + g) * HMX_FP16_TILE_N_ELMS;
+                tile_bases[g] = state->dst + (t + g) * HMX_FP16_TILE_N_ELMS;
             }
 
             HVX_Vector v_off = v_scat_base;
             for (int r = 0; r < HMX_FP16_TILE_N_ROWS; r += 2) {
                 int             row0 = ct * HMX_FP16_TILE_N_COLS + r;
                 int             row1 = row0 + 1;
-                const uint8_t * r0   = vtcm_src + row0 * row_stride;
-                const uint8_t * r1   = vtcm_src + row1 * row_stride;
+                const uint8_t * r0   = state->src + row0 * state->row_stride;
+                const uint8_t * r1   = state->src + row1 * state->row_stride;
 
-                // Batch-convert all 8 E8M0 scales once per row (stays in HVX register)
                 mxfp4_scales_t r0_e8 = mxfp4_convert_scales(r0 + e8m0_blk_off);
 
                 HVX_Vector_x4 dv0, dv1;
                 dv0 = dequantize_x4x2_mxfp4_x4groups_hvx(r0 + packed_off, upper, sub_blk_base, vlut_cvt, r0_e8);
-                if (row1 < n_cols) {
+                if (row1 < state->n_cols) {
                     mxfp4_scales_t r1_e8 = mxfp4_convert_scales(r1 + e8m0_blk_off);
                     dv1 = dequantize_x4x2_mxfp4_x4groups_hvx(r1 + packed_off, upper, sub_blk_base, vlut_cvt, r1_e8);
                 } else {
@@ -510,58 +604,13 @@ static void dequantize_x4x2_weight_to_fp16_tiles_task(
                 (void) *(volatile HVX_Vector *) (tile_bases[g]);
             }
 
-            t += 4;
+            t += 4; kt += 4;
             continue;
         }
 
-        // --- Single-tile fallback ---
-        __fp16 *tile_base = vtcm_dst + t * HMX_FP16_TILE_N_ELMS;
-
-        if (is_q4) {
-            unsigned blk_idx   = (kt * 32) / QK_Q4_0x4x2;
-            unsigned sub_blk   = ((kt * 32) % QK_Q4_0x4x2) / 32;
-            bool upper         = (sub_blk >= 4);
-            unsigned byte_off  = blk_idx * (QK_Q4_0x4x2 / 2) + (upper ? (sub_blk - 4) : sub_blk) * 32;
-            unsigned dblk_size = is_q4_1 ? 32 : HMX_X4X2_DBLK_SIZE;
-            unsigned scale_step = is_q4_1 ? 4 : (int)sizeof(__fp16);
-            unsigned scale_off = qrow_size + blk_idx * dblk_size + sub_blk * scale_step;
-
-            HVX_Vector v_off = v_scat_base;  // reset to column 0
-            unsigned row_offset = ct * HMX_FP16_TILE_N_COLS * row_stride;
-            unsigned row1 = ct * HMX_FP16_TILE_N_COLS + 1;
-            if (is_q4_1) {
-                for (int r = 0; r < HMX_FP16_TILE_N_ROWS; r += 2, row1 += 2) {
-                    const uint8_t *r0 = vtcm_src + row_offset; row_offset += row_stride;
-                    const uint8_t *r1 = vtcm_src + row_offset; row_offset += row_stride;
-
-                    HVX_Vector v0 = dequantize_x4x2_q4_1_group_hvx(r0 + byte_off, upper, (const __fp16 *)(r0 + scale_off), vlut_cvt);
-                    HVX_Vector v1 = (row1 < n_cols)
-                        ? dequantize_x4x2_q4_1_group_hvx(r1 + byte_off, upper, (const __fp16 *)(r1 + scale_off), vlut_cvt)
-                        : Q6_V_vzero();
-
-                    Q6_vscatter_QRMVwV(q_mask64, (size_t)tile_base, HMX_FP16_TILE_SIZE - 1, v_off, v0);
-                    v_off = Q6_Vw_vadd_VwVw(v_off, v_scat_step);
-                    Q6_vscatter_QRMVwV(q_mask64, (size_t)tile_base, HMX_FP16_TILE_SIZE - 1, v_off, v1);
-                    v_off = Q6_Vw_vadd_VwVw(v_off, v_scat_step);
-                }
-            } else {
-                for (int r = 0; r < HMX_FP16_TILE_N_ROWS; r += 2, row1 += 2) {
-                    const uint8_t *r0 = vtcm_src + row_offset; row_offset += row_stride;
-                    const uint8_t *r1 = vtcm_src + row_offset; row_offset += row_stride;
-
-                    HVX_Vector v0 = dequantize_x4x2_q4_0_group_hvx(r0 + byte_off, upper, (const __fp16 *)(r0 + scale_off), vlut_cvt);
-                    HVX_Vector v1 = (row1 < n_cols)
-                        ? dequantize_x4x2_q4_0_group_hvx(r1 + byte_off, upper, (const __fp16 *)(r1 + scale_off), vlut_cvt)
-                        : Q6_V_vzero();
-
-                    Q6_vscatter_QRMVwV(q_mask64, (size_t)tile_base, HMX_FP16_TILE_SIZE - 1, v_off, v0);
-                    v_off = Q6_Vw_vadd_VwVw(v_off, v_scat_step);
-                    Q6_vscatter_QRMVwV(q_mask64, (size_t)tile_base, HMX_FP16_TILE_SIZE - 1, v_off, v1);
-                    v_off = Q6_Vw_vadd_VwVw(v_off, v_scat_step);
-                }
-            }
-            (void) *(volatile HVX_Vector *)(tile_base);
-        } else if (weight_type == HTP_TYPE_MXFP4) {
+        // Single-tile fallback
+        __fp16 *tile_base = state->dst + t * HMX_FP16_TILE_N_ELMS;
+        {
             int  blk_idx      = (kt * 32) / QK_MXFP4x4x2;
             int  sub_blk      = ((kt * 32) % QK_MXFP4x4x2) / 32;
             bool upper        = (sub_blk >= 4);
@@ -573,15 +622,14 @@ static void dequantize_x4x2_weight_to_fp16_tiles_task(
                 int row0 = ct * HMX_FP16_TILE_N_COLS + r;
                 int row1 = row0 + 1;
 
-                const uint8_t * r0 = vtcm_src + row0 * row_stride;
-                const uint8_t * r1 = vtcm_src + row1 * row_stride;
+                const uint8_t * r0 = state->src + row0 * state->row_stride;
+                const uint8_t * r1 = state->src + row1 * state->row_stride;
 
-                // Batch-convert all 8 E8M0 scales once per row (stays in HVX register)
                 mxfp4_scales_t r0_e8 = mxfp4_convert_scales(r0 + e8m0_blk_off);
 
                 HVX_Vector v0 = dequantize_x4x2_mxfp4_group_hvx(r0 + byte_off, upper, sub_blk, vlut_cvt, r0_e8);
                 HVX_Vector v1;
-                if (row1 < n_cols) {
+                if (row1 < state->n_cols) {
                     mxfp4_scales_t r1_e8 = mxfp4_convert_scales(r1 + e8m0_blk_off);
                     v1 = dequantize_x4x2_mxfp4_group_hvx(r1 + byte_off, upper, sub_blk, vlut_cvt, r1_e8);
                 } else {
@@ -594,23 +642,59 @@ static void dequantize_x4x2_weight_to_fp16_tiles_task(
                 v_off = Q6_Vw_vadd_VwVw(v_off, v_scat_step);
             }
             (void) *(volatile HVX_Vector *) (tile_base);
-        } else {
-            // Q8_0
+        }
+        ++t; ++kt;
+    }
+
+    if (start_tile < end_tile) {
+        (void) *(volatile HVX_Vector *)(state->dst + (end_tile - 1) * HMX_FP16_TILE_N_ELMS);
+    }
+}
+
+static void dequantize_x4x2_worker_loop_mxfp4(unsigned int n, unsigned int i, void *data) {  // worker-pool callback for MXFP4 weights
+    x4x2_dequantize_state_t *state = (x4x2_dequantize_state_t *)data;  // data is the shared job description
+    for (unsigned int task_id = i; task_id < (unsigned int)state->n_tasks; task_id += n) {  // tasks i, i+n, i+2n, ...
+        int start = task_id * state->n_tiles_per_task;  // first tile of this task
+        int end   = hex_smin(start + state->n_tiles_per_task, state->n_tot_tiles);  // exclusive end; hex_smin presumably caps it at n_tot_tiles
+        dequantize_x4x2_weight_to_fp16_tiles_task_mxfp4(state, start, end);
+    }
+}
+
+static void dequantize_x4x2_weight_to_fp16_tiles_task_q8_0(
+        const x4x2_dequantize_state_t *state,
+        int start_tile, int end_tile) {
+
+    const int n_k_tiles = state->n_k_tiles;
+    const int qrow_size = state->k_block;
+    const struct fastdiv_values n_k_tiles_div = state->n_k_tiles_div;
+
+    const HVX_Vector v_scat_base  = hvx_vmem(hmx_transpose_scatter_offsets);
+    const HVX_Vector v_scat_step  = Q6_V_vsplat_R(4);
+    const HVX_VectorPred q_mask64 = Q6_Q_vsetq_R(64);
+
+    unsigned ct = fastdiv((unsigned)start_tile, &n_k_tiles_div);
+    unsigned kt = fastmodulo((unsigned)start_tile, n_k_tiles, &n_k_tiles_div);
+
+    for (unsigned t = start_tile; t < (unsigned)end_tile; ) {
+        if (kt >= (unsigned)n_k_tiles) { kt = 0; ct++; }
+
+        __fp16 *tile_base = state->dst + t * HMX_FP16_TILE_N_ELMS;
+        {
             int blk_idx  = (kt * 32) / QK_Q8_0x4x2;
             int sub_blk  = ((kt * 32) % QK_Q8_0x4x2) / 32;
             int byte_off  = blk_idx * QK_Q8_0x4x2 + sub_blk * 32;
             int scale_off = qrow_size + blk_idx * HMX_X4X2_DBLK_SIZE + sub_blk * (int)sizeof(__fp16);
 
-            HVX_Vector v_off = v_scat_base;  // reset to column 0
+            HVX_Vector v_off = v_scat_base;
             for (int r = 0; r < HMX_FP16_TILE_N_ROWS; r += 2) {
                 int row0 = ct * HMX_FP16_TILE_N_COLS + r;
                 int row1 = row0 + 1;
 
-                const uint8_t *r0 = vtcm_src + row0 * row_stride;
-                const uint8_t *r1 = vtcm_src + row1 * row_stride;
+                const uint8_t *r0 = state->src + row0 * state->row_stride;
+                const uint8_t *r1 = state->src + row1 * state->row_stride;
 
                 HVX_Vector v0 = dequantize_x4x2_q8_0_group_hvx((const int8_t *)(r0 + byte_off), (const __fp16 *)(r0 + scale_off));
-                HVX_Vector v1 = (row1 < n_cols) ? dequantize_x4x2_q8_0_group_hvx((const int8_t *)(r1 + byte_off), (const __fp16 *)(r1 + scale_off)) : Q6_V_vzero();
+                HVX_Vector v1 = (row1 < state->n_cols) ? dequantize_x4x2_q8_0_group_hvx((const int8_t *)(r1 + byte_off), (const __fp16 *)(r1 + scale_off)) : Q6_V_vzero();
 
                 Q6_vscatter_QRMVwV(q_mask64, (size_t)tile_base, HMX_FP16_TILE_SIZE - 1, v_off, v0);
                 v_off = Q6_Vw_vadd_VwVw(v_off, v_scat_step);
@@ -622,50 +706,31 @@ static void dequantize_x4x2_weight_to_fp16_tiles_task(
         ++t; ++kt;
     }
 
-    // Drain HVX scatter write buffer: a vmem load on the same HW thread retires
-    // all pending scatter entries to VTCM.  Without this, the main thread's HMX
-    // reads may see stale data because atomic_fetch_sub (release) only orders
-    // regular stores, not the HVX scatter buffer.
     if (start_tile < end_tile) {
-        (void) *(volatile HVX_Vector *)(vtcm_dst + (end_tile - 1) * HMX_FP16_TILE_N_ELMS);
+        (void) *(volatile HVX_Vector *)(state->dst + (end_tile - 1) * HMX_FP16_TILE_N_ELMS);
     }
 }
 
-typedef struct {
-    __fp16        *dst;
-    const uint8_t *src;
-    int            n_cols;
-    int            k_block;
-    size_t         row_stride;
-    int            weight_type;
-    int            n_tot_tiles;
-    int            n_tiles_per_task;
-    int            n_tasks;
-} x4x2_dequantize_state_t;
-
-static void dequantize_x4x2_worker_loop(unsigned int n, unsigned int i, void *data) {
+static void dequantize_x4x2_worker_loop_q8_0(unsigned int n, unsigned int i, void *data) {  // worker-pool callback for Q8_0 weights
     x4x2_dequantize_state_t *state = (x4x2_dequantize_state_t *)data;
-
     for (unsigned int task_id = i; task_id < (unsigned int)state->n_tasks; task_id += n) {
         int start = task_id * state->n_tiles_per_task;
         int end   = hex_smin(start + state->n_tiles_per_task, state->n_tot_tiles);
-
-        dequantize_x4x2_weight_to_fp16_tiles_task(
-            state->dst, state->src, state->n_cols, state->k_block,
-            state->row_stride, state->weight_type, start, end);
+        dequantize_x4x2_weight_to_fp16_tiles_task_q8_0(state, start, end);  // state carries dst/src/strides; only the tile range is passed separately
     }
 }
 
 static void dequantize_x4x2_weight_chunk_to_fp16_tiles(
         struct htp_context *ctx, __fp16 *vtcm_dst,
         const void *vtcm_src, int n_cols, int k_block,
-        size_t row_stride, int weight_type) {
+        size_t row_stride, int weight_type,
+        int n_k_tiles, struct fastdiv_values n_k_tiles_div,
+        worker_callback_t dequant_worker_fn) {
 
     assert(n_cols  % HMX_FP16_TILE_N_COLS == 0);
     assert(k_block % HMX_FP16_TILE_N_COLS == 0);
 
     size_t n_col_tiles = n_cols / HMX_FP16_TILE_N_COLS;
-    size_t n_k_tiles   = k_block / HMX_FP16_TILE_N_COLS;
     size_t n_tot_tiles = n_col_tiles * n_k_tiles;
 
     size_t n_tiles_per_task = hmx_ceil_div(n_tot_tiles, ctx->n_threads);
@@ -680,8 +745,10 @@ static void dequantize_x4x2_weight_chunk_to_fp16_tiles(
     state.k_block          = k_block;
     state.row_stride       = row_stride;
     state.weight_type      = weight_type;
+    state.n_k_tiles        = n_k_tiles;
+    state.n_k_tiles_div    = n_k_tiles_div;
 
-    worker_pool_run_func(ctx->worker_pool, dequantize_x4x2_worker_loop, &state, ctx->n_threads);
+    worker_pool_run_func(ctx->worker_pool, dequant_worker_fn, &state, ctx->n_threads);
 }
 
 // --- End x4x2 dequantizers ---
@@ -978,6 +1045,20 @@ int hmx_matmul_q_f32(struct htp_context *ctx, float *restrict dst, const float *
         return -1;
     }
 
+    worker_callback_t dequant_worker_fn = NULL;
+    switch (weight_type) {
+        case HTP_TYPE_Q4_0:   dequant_worker_fn = dequantize_x4x2_worker_loop_q4_0; break;
+        case HTP_TYPE_IQ4_NL: dequant_worker_fn = dequantize_x4x2_worker_loop_iq4_nl; break;
+        case HTP_TYPE_Q4_1:   dequant_worker_fn = dequantize_x4x2_worker_loop_q4_1; break;
+        case HTP_TYPE_MXFP4:  dequant_worker_fn = dequantize_x4x2_worker_loop_mxfp4; break;
+        case HTP_TYPE_Q8_0:   dequant_worker_fn = dequantize_x4x2_worker_loop_q8_0; break;
+        default:
+            return -1;
+    }
+
+    const int n_k_tiles = k / HMX_FP16_TILE_N_COLS;
+    const struct fastdiv_values n_k_tiles_div = init_fastdiv_values(n_k_tiles);
+
     // --- Dynamic VTCM layout ---
     const size_t vec_dot_size = k * sizeof(__fp16);
     const size_t vtcm_budget  = ctx->vtcm_size;
@@ -1070,7 +1151,7 @@ int hmx_matmul_q_f32(struct htp_context *ctx, float *restrict dst, const float *
         {
             // B0: wait for DMA, dequant weight chunk 0
             dma_queue_pop(ctx->dma[0]);
-            dequantize_x4x2_weight_chunk_to_fp16_tiles(ctx, vtcm_weight_bufs[0], vtcm_qweight, n_cols_A0, k, row_stride, weight_type);
+            dequantize_x4x2_weight_chunk_to_fp16_tiles(ctx, vtcm_weight_bufs[0], vtcm_qweight, n_cols_A0, k, row_stride, weight_type, n_k_tiles, n_k_tiles_div, dequant_worker_fn);
 
             // A1: issue DMA for weight chunk 1
             const size_t n_cols_A1 = hex_smin(n - 1 * n_chunk_n_cols, n_chunk_n_cols);
@@ -1089,7 +1170,7 @@ int hmx_matmul_q_f32(struct htp_context *ctx, float *restrict dst, const float *
             // B1: DMA pop + dequant (runs in parallel with C0 on HMX worker)
             if (1 < n_chunk_cnt) {
                 dma_queue_pop(ctx->dma[0]);
-                dequantize_x4x2_weight_chunk_to_fp16_tiles(ctx, vtcm_weight_bufs[1], vtcm_qweight, n_cols_A1, k, row_stride, weight_type);
+                dequantize_x4x2_weight_chunk_to_fp16_tiles(ctx, vtcm_weight_bufs[1], vtcm_qweight, n_cols_A1, k, row_stride, weight_type, n_k_tiles, n_k_tiles_div, dequant_worker_fn);
             }
         }
 
@@ -1131,7 +1212,7 @@ int hmx_matmul_q_f32(struct htp_context *ctx, float *restrict dst, const float *
             // B_{i+2}: DMA pop + dequant (multi-thread HVX, parallel with C_{i+1})
             if (i + 2 < n_chunk_cnt) {
                 dma_queue_pop(ctx->dma[0]);
-                dequantize_x4x2_weight_chunk_to_fp16_tiles(ctx, vtcm_weight_bufs[(i + 2) % 2], vtcm_qweight, n_cols_p2, k, row_stride, weight_type);
+                dequantize_x4x2_weight_chunk_to_fp16_tiles(ctx, vtcm_weight_bufs[(i + 2) % 2], vtcm_qweight, n_cols_p2, k, row_stride, weight_type, n_k_tiles, n_k_tiles_div, dequant_worker_fn);
             }
         }
     }

From 0b246862b98b452770a4ce3a87a9a89b6b28a2f1 Mon Sep 17 00:00:00 2001
From: Funtowicz Morgan <mfuntowicz@users.noreply.github.com>
Date: Thu, 28 May 2026 14:00:25 +0200
Subject: [PATCH 02/50] server: minor tweaks to use more cpp features (#23785)

* misc(server): add default port to impl RAII

* misc(server): register_gcp_compat() can be const

* misc(server): use proper cpp const/auto methods

* misc(server): do not reset a unique_ptr, use make_unique instead to be exception safe
---
 tools/server/server-http.cpp | 66 +++++++++++++++++-------------------
 tools/server/server-http.h   |  4 +--
 2 files changed, 34 insertions(+), 36 deletions(-)

diff --git a/tools/server/server-http.cpp b/tools/server/server-http.cpp
index 00290b0782d..3616b3b4d01 100644
--- a/tools/server/server-http.cpp
+++ b/tools/server/server-http.cpp
@@ -5,9 +5,9 @@
 
 #include <cpp-httplib/httplib.h>
 
-#include <cstdlib>
 #include <functional>
 #include <future>
+#include <memory>
 #include <string>
 #include <thread>
 
@@ -21,7 +21,7 @@ class server_http_context::Impl {
 };
 
 server_http_context::server_http_context()
-    : pimpl(std::make_unique<server_http_context::Impl>())
+    : pimpl(std::make_unique<Impl>())
 {}
 
 server_http_context::~server_http_context() = default;
@@ -62,7 +62,7 @@ struct gcp_params {
     }
 
     static std::string getenv(const char * name, const std::string & default_value, bool ensure_leading_slash = false) {
-        const char * value = std::getenv(name);
+        const auto * value = std::getenv(name);
         if (value == nullptr || value[0] == '\0') {
             return default_value;
         }
@@ -94,15 +94,15 @@ bool server_http_context::init(const common_params & params) {
     auto & srv = pimpl->srv;
 
 #ifdef CPPHTTPLIB_OPENSSL_SUPPORT
-    if (params.ssl_file_key != "" && params.ssl_file_cert != "") {
+    if (!params.ssl_file_key.empty() && !params.ssl_file_cert.empty()) {
         SRV_INF("running with SSL: key = %s, cert = %s\n", params.ssl_file_key.c_str(), params.ssl_file_cert.c_str());
-        srv.reset(
-            new httplib::SSLServer(params.ssl_file_cert.c_str(), params.ssl_file_key.c_str())
+        srv = std::make_unique<httplib::SSLServer>(
+            params.ssl_file_cert.c_str(), params.ssl_file_key.c_str()
         );
         is_ssl = true;
     } else {
         SRV_INF("%s", "running without SSL\n");
-        srv.reset(new httplib::Server());
+        srv = std::make_unique<httplib::Server>();
     }
 #else
     if (params.ssl_file_key != "" && params.ssl_file_cert != "") {
@@ -150,7 +150,7 @@ bool server_http_context::init(const common_params & params) {
     // set timeouts and change hostname and port
     srv->set_read_timeout (params.timeout_read);
     srv->set_write_timeout(params.timeout_write);
-    srv->set_socket_options([reuse_port = params.reuse_port](socket_t sock) {
+    srv->set_socket_options([reuse_port = params.reuse_port](const socket_t sock) {
         httplib::set_socket_opt(sock, SOL_SOCKET, SO_REUSEADDR, 1);
         if (reuse_port) {
 #ifdef SO_REUSEPORT
@@ -162,8 +162,8 @@ bool server_http_context::init(const common_params & params) {
     });
 
     if (params.api_keys.size() == 1) {
-        auto key = params.api_keys[0];
-        std::string substr = key.substr(std::max((int)(key.length() - 4), 0));
+        const auto key = params.api_keys[0];
+        const std::string substr = key.substr(std::max(static_cast<int>(key.length() - 4), 0));
         SRV_INF("api_keys: ****%s\n", substr.c_str());
     } else if (params.api_keys.size() > 1) {
         SRV_INF("api_keys: %zu keys loaded\n", params.api_keys.size());
@@ -203,7 +203,7 @@ bool server_http_context::init(const common_params & params) {
         }
 
         // remove the "Bearer " prefix if needed
-        std::string prefix = "Bearer ";
+        static std::string prefix = "Bearer ";
         if (req_api_key.substr(0, prefix.size()) == prefix) {
             req_api_key = req_api_key.substr(prefix.size());
         }
@@ -232,11 +232,10 @@ bool server_http_context::init(const common_params & params) {
     };
 
     auto middleware_server_state = [this](const httplib::Request & req, httplib::Response & res) {
-        bool ready = is_ready.load();
-        if (!ready) {
+        if (!is_ready.load()) {
 #if defined(LLAMA_UI_HAS_ASSETS)
-            auto tmp = string_split<std::string>(req.path, '.');
-            if (req.path == "/" || (tmp.size() > 0 && tmp.back() == "html")) {
+            if (const auto tmp = string_split<std::string>(req.path, '.');
+                req.path == "/" || (!tmp.empty() && tmp.back() == "html")) {
                 if (const llama_ui_asset * a = llama_ui_find_asset("loading.html")) {
                     res.status = 503;
                     res.set_content(reinterpret_cast<const char*>(a->data), a->size, "text/html; charset=utf-8");
@@ -284,17 +283,17 @@ bool server_http_context::init(const common_params & params) {
         return httplib::Server::HandlerResponse::Unhandled;
     });
 
-    int n_threads_http = params.n_threads_http;
+    auto n_threads_http = params.n_threads_http;
     if (n_threads_http < 1) {
         // +4 threads for monitoring, health and some threads reserved for MCP and other tasks in the future
-        n_threads_http = std::max(params.n_parallel + 4, (int32_t) std::thread::hardware_concurrency() - 1);
+        n_threads_http = std::max(params.n_parallel + 4, static_cast<int32_t>(std::thread::hardware_concurrency() - 1));
     }
     SRV_INF("using %d threads for HTTP server\n", n_threads_http);
     srv->new_task_queue = [n_threads_http] {
         // spawn n_threads_http fixed thread (always alive), while allow up to 1024 max possible additional threads
         // when n_threads_http is used, server will create new "dynamic" threads that will be destroyed after processing each request
         // ref: https://github.com/yhirose/cpp-httplib/pull/2368
-        size_t max_threads = (size_t)n_threads_http + 1024;
+        const auto max_threads = static_cast<size_t>(n_threads_http + 1024);
         return new httplib::ThreadPool(n_threads_http, max_threads);
     };
 
@@ -310,10 +309,9 @@ bool server_http_context::init(const common_params & params) {
         // register static assets routes
         if (!params.public_path.empty()) {
             // Set the base directory for serving static files
-            bool is_found = srv->set_mount_point(params.api_prefix + "/", params.public_path);
-            if (!is_found) {
+            if (const auto is_found = srv->set_mount_point(params.api_prefix + "/", params.public_path); !is_found) {
                 SRV_ERR("static assets path not found: %s\n", params.public_path.c_str());
-                return 1;
+                return false;
             }
         } else {
 #if defined(LLAMA_UI_HAS_ASSETS)
@@ -353,9 +351,9 @@ bool server_http_context::init(const common_params & params) {
 bool server_http_context::start() {
     // Bind and listen
 
-    auto & srv = pimpl->srv;
-    bool was_bound = false;
-    bool is_sock = false;
+    const auto & srv = pimpl->srv;
+    auto was_bound = false;
+    auto is_sock = false;
     if (string_ends_with(std::string(hostname), ".sock")) {
         is_sock = true;
         SRV_INF("%s", "setting address family to AF_UNIX\n");
@@ -367,7 +365,7 @@ bool server_http_context::start() {
         SRV_INF("%s", "binding port with default address family\n");
         // bind HTTP listen port
         if (port == 0) {
-            int bound_port = srv->bind_to_any_port(hostname);
+            const auto bound_port = srv->bind_to_any_port(hostname);
             was_bound = (bound_port >= 0);
             if (was_bound) {
                 port = bound_port;
@@ -383,7 +381,7 @@ bool server_http_context::start() {
     }
 
     // run the HTTP server in a thread
-    thread = std::thread([this]() { pimpl->srv->listen_after_bind(); });
+    thread = std::thread([this] { pimpl->srv->listen_after_bind(); });
     srv->wait_until_ready();
 
     listening_address = is_sock ? string_format("unix://%s", hostname.c_str())
@@ -440,13 +438,13 @@ static void process_handler_response(server_http_req_ptr && request, server_http
     if (response->is_stream()) {
         res.status = response->status;
         set_headers(res, response->headers);
-        std::string content_type = response->content_type;
+        const std::string content_type = response->content_type;
         // convert to shared_ptr as both chunked_content_provider() and on_complete() need to use it
-        std::shared_ptr<server_http_req> q_ptr = std::move(request);
-        std::shared_ptr<server_http_res> r_ptr = std::move(response);
-        const auto chunked_content_provider = [response = r_ptr](size_t, httplib::DataSink & sink) -> bool {
+        std::shared_ptr q_ptr = std::move(request);
+        std::shared_ptr r_ptr = std::move(response);
+        const auto chunked_content_provider = [response = r_ptr](size_t, const httplib::DataSink & sink) -> bool {
             std::string chunk;
-            bool has_next = response->next(chunk);
+            const bool has_next = response->next(chunk);
             if (!chunk.empty()) {
                 if (!sink.write(chunk.data(), chunk.size())) {
                     return false;
@@ -557,7 +555,7 @@ static std::string path_to_gcp_format(const std::string & path) {
         if (c == '/' || c == '-' || c == '_') {
             cap = true;
         } else {
-            result += cap ? (char)std::toupper(c) : (char)c;
+            result += static_cast<char>(cap ? std::toupper(c) : c);
             cap = false;
         }
     }
@@ -581,7 +579,7 @@ static json parse_gcp_predict_response(const server_http_res_ptr & res) {
     }
 }
 
-void server_http_context::register_gcp_compat() {
+void server_http_context::register_gcp_compat() const {
     const gcp_params gcp;
 
     if (!gcp.enabled) {
@@ -602,7 +600,7 @@ void server_http_context::register_gcp_compat() {
     }
 
     if (!gcp.path_health.empty()) {
-        auto health_handler = handlers.find("/health");
+        const auto health_handler = handlers.find("/health");
         GGML_ASSERT(health_handler != handlers.end());
         get(gcp.path_health, health_handler->second);
     }
diff --git a/tools/server/server-http.h b/tools/server/server-http.h
index 099b5e1cc6f..fede8c8f30a 100644
--- a/tools/server/server-http.h
+++ b/tools/server/server-http.h
@@ -73,7 +73,7 @@ struct server_http_context {
 
     std::string path_prefix;
     std::string hostname;
-    int port;
+    int port    = 8080;
     bool is_ssl = false;
 
     server_http_context();
@@ -88,7 +88,7 @@ struct server_http_context {
 
     // Register the Google Cloud Platform (Vertex AI) compat (AIP_PREDICT_ROUTE env var, or /predict)
     // Must be called AFTER all other API routes are registered
-    void register_gcp_compat();
+    void register_gcp_compat() const;
 
     // for debugging
     std::string listening_address;

From bc81d47aba663383dc8827bd3bc8c4a35f735d16 Mon Sep 17 00:00:00 2001
From: Jaden_Mach <88880593+jadenmach2@users.noreply.github.com>
Date: Thu, 28 May 2026 08:50:25 -0400
Subject: [PATCH 03/50] CUDA: route batch>=4 quantized matmul to MMQ on AMD
 MFMA hardware (#23227)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

* CUDA: per-quant MMVQ/MMQ batch threshold on AMD MFMA hardware

The dispatcher uses a single global threshold (MMVQ_MAX_BATCH_SIZE = 8)
to choose between mul_mat_vec_q (per-row GEMV) and mul_mat_q (MFMA-tiled
GEMM) for quantized matmul. On AMD CDNA, the optimal crossover differs
substantially by quant family because the per-row GEMV cost is dominated
by dequantisation, not the dot-product itself: K-quants pay a heavier
super-block decode and so MMQ wins sooner; legacy and IQ quants have
lean decode and stay ahead until the batch fully populates an MFMA tile.

This patch introduces ggml_cuda_should_use_mmvq(type, cc, ne11) -> bool,
mirroring the existing ggml_cuda_should_use_mmq, and gates per-quant
thresholds on amd_mfma_available(cc):

  Q3_K, Q4_K, Q5_K  : MMVQ <= 3   (MMQ wins from batch=4: +5% .. +76%)
  Q2_K, Q6_K        : MMVQ <= 5   (MMQ wins from batch=6: +8% .. +35%)
  others            : MMVQ <= 8   (legacy & IQ regress under MMQ; unchanged)

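Non-AMD-MFMA paths (NVIDIA, RDNA, CDNA1 without MFMA) are byte-identical
to master. GGML_CUDA_FORCE_MMVQ=1 restores the original global threshold
for A/B testing.

Note: the description above reflects the first revision of this change.
The follow-up commits listed below dropped the GGML_CUDA_FORCE_MMVQ
escape hatch and added a separately tuned threshold table for CDNA1. In
the squashed diff the per-quant tables are gated on
GGML_CUDA_CC_IS_CDNA(cc); all other devices, and all quant types not
listed in a table, keep the global MMVQ_MAX_BATCH_SIZE threshold.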

Measured on MI250X (gfx90a, ROCm 7.2.1) with Llama-3.2-3B-Instruct,
llama-bench pp512 across all 20 supported quants, ubatch 1..8, 10 reps.
Full table in PR description.

  Selected pp512 throughput (tok/s, ub=8):
    Q4_K_S:  559 -> 940  (+68%)
    Q5_K_S:  503 -> 884  (+76%)
    Q3_K_S:  629 -> 879  (+40%)
    Q2_K  :  615 -> 809  (+32%)
    Q6_K  :  582 -> 776  (+33%)

  Selected pp512 throughput (tok/s, ub=4):
    Q4_K_S:  444 -> 480  (+ 8%)
    Q4_0  :  682 -> 685  (+ 0%)   (no regression - retains MMVQ)
    IQ4_XS:  706 -> 698  (- 1%)   (no regression - retains MMVQ)

* CUDA: address review — inline MMVQ batch table, drop env hatch & doc block

* tune kernel selection logic for CDNA1

---------

Co-authored-by: Johannes Gäßler <johannesg@5d6.de>
---
 ggml/src/ggml-cuda/ggml-cuda.cu |  2 ++
 ggml/src/ggml-cuda/mmvq.cu      | 47 +++++++++++++++++++++++++++++++++
 ggml/src/ggml-cuda/mmvq.cuh     |  2 ++
 3 files changed, 51 insertions(+)

diff --git a/ggml/src/ggml-cuda/ggml-cuda.cu b/ggml/src/ggml-cuda/ggml-cuda.cu
index 23d1c069248..dc3e8fd6265 100644
--- a/ggml/src/ggml-cuda/ggml-cuda.cu
+++ b/ggml/src/ggml-cuda/ggml-cuda.cu
@@ -2570,6 +2570,7 @@ static void ggml_cuda_mul_mat(ggml_backend_cuda_context & ctx, const ggml_tensor
             use_mul_mat_q           = use_mul_mat_q             && ggml_cuda_should_use_mmq(src0->type, cc, src1->ne[1], /*n_experts=*/0);
             use_mul_mat_f           = use_mul_mat_f             && ggml_cuda_should_use_mmf(src0->type, cc, warp_size, src0->ne, src0->nb, src1->ne[1], /*mul_mat_id=*/false);
             use_mul_mat_vec_f       = use_mul_mat_vec_f         && ggml_cuda_should_use_mmvf(src0->type, cc, src0->ne, src0->nb, src1->ne[1]);
+            use_mul_mat_vec_q       = use_mul_mat_vec_q         && ggml_cuda_should_use_mmvq(src0->type, cc, src1->ne[1]);
             any_gpus_with_slow_fp16 = any_gpus_with_slow_fp16   || !fast_fp16_hardware_available(cc);
         }
     } else {
@@ -2578,6 +2579,7 @@ static void ggml_cuda_mul_mat(ggml_backend_cuda_context & ctx, const ggml_tensor
         use_mul_mat_q           = use_mul_mat_q             && ggml_cuda_should_use_mmq(src0->type, cc, src1->ne[1], /*n_experts=*/0);
         use_mul_mat_f           = use_mul_mat_f             && ggml_cuda_should_use_mmf(src0->type, cc, warp_size, src0->ne, src0->nb, src1->ne[1], /*mul_mat_id=*/false);
         use_mul_mat_vec_f       = use_mul_mat_vec_f         && ggml_cuda_should_use_mmvf(src0->type, cc, src0->ne, src0->nb, src1->ne[1]);
+        use_mul_mat_vec_q       = use_mul_mat_vec_q         && ggml_cuda_should_use_mmvq(src0->type, cc, src1->ne[1]);
         any_gpus_with_slow_fp16 = any_gpus_with_slow_fp16   || !fast_fp16_hardware_available(cc);
     }
 
diff --git a/ggml/src/ggml-cuda/mmvq.cu b/ggml/src/ggml-cuda/mmvq.cu
index 13b8b855282..873ff05a074 100644
--- a/ggml/src/ggml-cuda/mmvq.cu
+++ b/ggml/src/ggml-cuda/mmvq.cu
@@ -271,6 +271,53 @@ int get_mmvq_mmid_max_batch(ggml_type type, int cc) {
     return MMVQ_MAX_BATCH_SIZE;
 }
 
+bool ggml_cuda_should_use_mmvq(enum ggml_type type, int cc, int64_t ne11) {
+    if (GGML_CUDA_CC_IS_CDNA(cc)) {
+        if (GGML_CUDA_CC_IS_CDNA1(cc)) {
+            switch (type) {
+                case GGML_TYPE_Q4_0:
+                case GGML_TYPE_Q4_1:
+                    return ne11 <= 7;
+                case GGML_TYPE_Q5_1:
+                    return ne11 <= 7;
+                case GGML_TYPE_Q8_0:
+                    return ne11 <= 6;
+                case GGML_TYPE_Q2_K:
+                    return ne11 <= 4;
+                case GGML_TYPE_Q3_K:
+                    return ne11 <= 3;
+                case GGML_TYPE_Q4_K:
+                    return ne11 <= 2;
+                case GGML_TYPE_Q5_K:
+                    return ne11 <= 3;
+                case GGML_TYPE_Q6_K:
+                    return ne11 <= 4;
+                case GGML_TYPE_IQ1_S:
+                    return ne11 <= 5;
+                case GGML_TYPE_IQ2_XXS:
+                case GGML_TYPE_IQ3_S:
+                case GGML_TYPE_IQ4_XS:
+                    return ne11 <= 6;
+                default:
+                    return ne11 <= MMVQ_MAX_BATCH_SIZE;
+            }
+        }
+        switch (type) { // tuned for CDNA2
+            case GGML_TYPE_Q2_K:
+                return ne11 <= 5;
+            case GGML_TYPE_Q3_K:
+            case GGML_TYPE_Q4_K:
+            case GGML_TYPE_Q5_K:
+                return ne11 <= 3;
+            case GGML_TYPE_Q6_K:
+                return ne11 <= 5;
+            default:
+                return ne11 <= MMVQ_MAX_BATCH_SIZE;
+        }
+    }
+    return ne11 <= MMVQ_MAX_BATCH_SIZE;
+}
+
 // Device constexpr: returns the max batch size for the current arch+type at compile time.
 template <ggml_type type>
 static constexpr __device__ int get_mmvq_mmid_max_batch_for_device() {
diff --git a/ggml/src/ggml-cuda/mmvq.cuh b/ggml/src/ggml-cuda/mmvq.cuh
index 6bf0a8e8677..5605bf7a4e6 100644
--- a/ggml/src/ggml-cuda/mmvq.cuh
+++ b/ggml/src/ggml-cuda/mmvq.cuh
@@ -2,6 +2,8 @@
 
 #define MMVQ_MAX_BATCH_SIZE 8 // Max. batch size for which to use MMVQ kernels.
 
+bool ggml_cuda_should_use_mmvq(enum ggml_type type, int cc, int64_t ne11);
+
 // Returns the maximum batch size for which MMVQ should be used for MUL_MAT_ID,
 // based on the quantization type and GPU architecture (compute capability).
 int get_mmvq_mmid_max_batch(ggml_type type, int cc);

From d7be46189f4a9ed59c5d98f71a87f8a16954a6eb Mon Sep 17 00:00:00 2001
From: redfox <59549776+yaohengxu@users.noreply.github.com>
Date: Thu, 28 May 2026 20:51:14 +0800
Subject: [PATCH 04/50] =?UTF-8?q?mmvq=20Optim:=20=20add=20MMVQ=5FPARAMETER?=
 =?UTF-8?q?S=5FTURING(mmvq=5Fparameter=5Ftable=5Fid)=20for=20=E2=80=A6=20(?=
 =?UTF-8?q?#23729)?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

* mmvq Optim:  add MMVQ_PARAMETERS_TURING(mmvq_parameter_table_id) for SM75 TURING

* avoid a mismatch for JIT compilation of Turing device code for Ampere or newer

Co-authored-by: Johannes Gäßler <johannesg@5d6.de>

---------

Co-authored-by: Copilot <copilot@github.com>
Co-authored-by: Johannes Gäßler <johannesg@5d6.de>
---
 ggml/src/ggml-cuda/mmvq.cu | 35 ++++++++++++++++++++++++++++++++++-
 1 file changed, 34 insertions(+), 1 deletion(-)

diff --git a/ggml/src/ggml-cuda/mmvq.cu b/ggml/src/ggml-cuda/mmvq.cu
index 873ff05a074..ecb6fdedadd 100644
--- a/ggml/src/ggml-cuda/mmvq.cu
+++ b/ggml/src/ggml-cuda/mmvq.cu
@@ -63,6 +63,7 @@ static constexpr __host__ __device__ int get_vdr_mmvq(ggml_type type) {
 
 enum mmvq_parameter_table_id {
     MMVQ_PARAMETERS_GENERIC = 0,
+    MMVQ_PARAMETERS_TURING,
     MMVQ_PARAMETERS_GCN,
     MMVQ_PARAMETERS_RDNA2,
     MMVQ_PARAMETERS_RDNA3_0,
@@ -78,6 +79,8 @@ static constexpr __device__ mmvq_parameter_table_id get_device_table_id() {
     return MMVQ_PARAMETERS_RDNA2;
 #elif defined(GCN) || defined(CDNA)
     return MMVQ_PARAMETERS_GCN;
+#elif defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= GGML_CUDA_CC_TURING && __CUDA_ARCH__ < GGML_CUDA_CC_AMPERE
+    return MMVQ_PARAMETERS_TURING;
 #else
     return MMVQ_PARAMETERS_GENERIC;
 #endif
@@ -96,6 +99,9 @@ static __host__ mmvq_parameter_table_id get_device_table_id(int cc) {
     if (GGML_CUDA_CC_IS_GCN(cc) || GGML_CUDA_CC_IS_CDNA(cc)) {
         return MMVQ_PARAMETERS_GCN;
     }
+    if (GGML_CUDA_CC_IS_NVIDIA(cc) && ggml_cuda_highest_compiled_arch(cc) >= GGML_CUDA_CC_TURING && ggml_cuda_highest_compiled_arch(cc) < GGML_CUDA_CC_AMPERE) {
+        return MMVQ_PARAMETERS_TURING;
+    }
     return MMVQ_PARAMETERS_GENERIC;
 }
 
@@ -417,11 +423,38 @@ static constexpr __host__ __device__ int calc_nwarps(ggml_type type, int ncols_d
         }
         return 1;
     }
+    if (table_id == MMVQ_PARAMETERS_TURING) {
+        if (ncols_dst == 1) {
+            switch (type) {
+                case GGML_TYPE_Q2_K:
+                case GGML_TYPE_Q3_K:
+                case GGML_TYPE_Q4_K:
+                case GGML_TYPE_Q5_K:
+                case GGML_TYPE_Q6_K:
+                    return 2;
+                default:
+                    return 4;
+            }
+        }
+        switch (ncols_dst) {
+            case 2:
+            case 3:
+            case 4:
+                return 4;
+            case 5:
+            case 6:
+            case 7:
+            case 8:
+                return 2;
+            default:
+                return 1;
+        }
+    }
     return 1;
 }
 
 static constexpr __host__ __device__ int calc_rows_per_block(int ncols_dst, int table_id, bool small_k = false, int nwarps = 1) {
-    if (table_id == MMVQ_PARAMETERS_GENERIC || table_id == MMVQ_PARAMETERS_GCN) {
+    if (table_id == MMVQ_PARAMETERS_GENERIC || table_id == MMVQ_PARAMETERS_GCN || table_id == MMVQ_PARAMETERS_TURING) {
         switch (ncols_dst) {
             case 1:
                 return small_k ? nwarps : 1;

From 30af6e2b98b00eee01a8f76249fe1399a724702e Mon Sep 17 00:00:00 2001
From: fl0rianr <226492742+fl0rianr@users.noreply.github.com>
Date: Thu, 28 May 2026 15:01:14 +0200
Subject: [PATCH 05/50] ggml: auto apply iGPU flag CUDA/HIP if integrated
 device (#23007)

---
 ggml/src/ggml-cuda/ggml-cuda.cu | 10 ++++++++--
 1 file changed, 8 insertions(+), 2 deletions(-)

diff --git a/ggml/src/ggml-cuda/ggml-cuda.cu b/ggml/src/ggml-cuda/ggml-cuda.cu
index dc3e8fd6265..18aaa098398 100644
--- a/ggml/src/ggml-cuda/ggml-cuda.cu
+++ b/ggml/src/ggml-cuda/ggml-cuda.cu
@@ -4994,8 +4994,14 @@ static void ggml_backend_cuda_device_get_memory(ggml_backend_dev_t dev, size_t *
 }
 
 static enum ggml_backend_dev_type ggml_backend_cuda_device_get_type(ggml_backend_dev_t dev) {
-    GGML_UNUSED(dev);
-    return GGML_BACKEND_DEVICE_TYPE_GPU;
+    ggml_backend_cuda_device_context * ctx = (ggml_backend_cuda_device_context *) dev->context;
+
+    cudaDeviceProp prop;
+    CUDA_CHECK(cudaGetDeviceProperties(&prop, ctx->device));
+
+    return prop.integrated
+        ? GGML_BACKEND_DEVICE_TYPE_IGPU
+        : GGML_BACKEND_DEVICE_TYPE_GPU;
 }
 
 static void ggml_backend_cuda_device_get_props(ggml_backend_dev_t dev, ggml_backend_dev_props * props) {

From d374e71e5540a2e77b1ab2cd0e797b8ea3649755 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Johannes=20G=C3=A4=C3=9Fler?= <johannesg@5d6.de>
Date: Thu, 28 May 2026 15:53:54 +0200
Subject: [PATCH 06/50] test-llama-archs: fix table format [no release]
 (#23810)

---
 tests/test-llama-archs.cpp | 25 ++++++++++++++++++++++---
 1 file changed, 22 insertions(+), 3 deletions(-)

diff --git a/tests/test-llama-archs.cpp b/tests/test-llama-archs.cpp
index 16af11a2862..25e29638e97 100644
--- a/tests/test-llama-archs.cpp
+++ b/tests/test-llama-archs.cpp
@@ -12,6 +12,7 @@
 #include "../src/llama-model-saver.h"
 
 #include <cinttypes>
+#include <cstddef>
 #include <cstdio>
 #include <cstring>
 #include <cstdint>
@@ -497,6 +498,7 @@ static int test_backends(const llm_arch target_arch, const size_t seed, const gg
     };
 
     std::vector<device_config> dev_configs;
+    size_t max_device_label_length = 4;
     {
         std::vector<ggml_backend_dev_t> devices_meta;
         {
@@ -504,6 +506,7 @@ static int test_backends(const llm_arch target_arch, const size_t seed, const gg
             for (size_t i = 0; i < device_count; i++) {
                 ggml_backend_dev_t dev = ggml_backend_dev_get(i);
                 dev_configs.emplace_back(std::vector<ggml_backend_dev_t>{dev}, ggml_backend_dev_description(dev), LLAMA_SPLIT_MODE_LAYER);
+                max_device_label_length = std::max(max_device_label_length, dev_configs.back().label.length());
 
                 // cpu-based devices cannot be used in tensor split mode
                 if (ggml_backend_dev_buffer_type(dev) != ggml_backend_cpu_buffer_type()) {
@@ -515,10 +518,26 @@ static int test_backends(const llm_arch target_arch, const size_t seed, const gg
         dev_configs.emplace_back(devices_meta, "Meta", LLAMA_SPLIT_MODE_TENSOR);
     }
 
+    size_t max_arch_name_length = 0;
+    for (const llm_arch & arch : llm_arch_all()) {
+        max_arch_name_length = std::max(max_arch_name_length, strlen(llm_arch_name(arch)));
+    }
+
+    const std::string template_header = std::string("|%" + std::to_string(max_arch_name_length) + "s|%") + std::to_string(max_device_label_length) + "s|%6s|%15s|%9s|\n";
+    const std::string template_row    = std::string("|%" + std::to_string(max_arch_name_length) + "s|%") + std::to_string(max_device_label_length) + "s|%6s|%15s %10s|%20s|\n";
+
     bool all_ok = true;
     common_log_flush(common_log_main());
-    printf("|%16s|%30s|%6s|%15s|%9s|\n", "Model arch.", "Device", "Config", "NMSE vs. CPU", "Roundtrip");
-    printf("|----------------|------------------------------|------|---------------|---------|\n");
+    printf(template_header.c_str(), "Model arch.", "Device", "Config", "NMSE vs. CPU", "Roundtrip");
+    printf("|");
+    for (size_t i = 0; i < max_arch_name_length; i++) {
+        printf("-");
+    }
+    printf("|");
+    for (size_t i = 0; i < max_device_label_length; i++) {
+        printf("-");
+    }
+    printf("|------|---------------|---------|\n");
     for (const llm_arch & arch : llm_arch_all()) {
         if (arch == LLM_ARCH_UNKNOWN) {
             continue;
@@ -595,7 +614,7 @@ static int test_backends(const llm_arch target_arch, const size_t seed, const gg
                     }
                 }
 
-                printf("|%16s|%30s|%6s|%15s %10s|%20s|\n", llm_arch_name(arch), dc.label.c_str(),
+                printf(template_row.c_str(), llm_arch_name(arch), dc.label.c_str(),
                     config_name.c_str(), status_nmse.c_str(), nmse_str, status_roundtrip.c_str());
             }
         }

From 7fb1e70b594839107770046fe7d9475ae569e42a Mon Sep 17 00:00:00 2001
From: Mikolaj Kucharski <mikolaj@kucharski.name>
Date: Thu, 28 May 2026 14:25:40 +0000
Subject: [PATCH 07/50] arg: Add LLAMA_ARG_API_KEY_FILE environment variable
 for --api-key-file (#23167)

---
 common/arg.cpp         | 2 +-
 tools/server/README.md | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/common/arg.cpp b/common/arg.cpp
index bdc2e9eb4fc..f6fdd4fa63f 100644
--- a/common/arg.cpp
+++ b/common/arg.cpp
@@ -2998,7 +2998,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
             }
             key_file.close();
         }
-    ).set_examples({LLAMA_EXAMPLE_SERVER}));
+    ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_API_KEY_FILE"));
     add_opt(common_arg(
         {"--ssl-key-file"}, "FNAME",
         "path to file a PEM-encoded SSL private key",
diff --git a/tools/server/README.md b/tools/server/README.md
index 0d20ced879f..b975088e4ae 100644
--- a/tools/server/README.md
+++ b/tools/server/README.md
@@ -201,7 +201,7 @@ For the full list of features, please refer to [server's changelog](https://gith
 | `--embedding, --embeddings` | restrict to only support embedding use case; use only with dedicated embedding models (default: disabled)<br/>(env: LLAMA_ARG_EMBEDDINGS) |
 | `--rerank, --reranking` | enable reranking endpoint on server (default: disabled)<br/>(env: LLAMA_ARG_RERANKING) |
 | `--api-key KEY` | API key to use for authentication, multiple keys can be provided as a comma-separated list (default: none)<br/>(env: LLAMA_API_KEY) |
-| `--api-key-file FNAME` | path to file containing API keys (default: none) |
+| `--api-key-file FNAME` | path to file containing API keys, one per line (default: none)<br/>(env: LLAMA_ARG_API_KEY_FILE) |
 | `--ssl-key-file FNAME` | path to file a PEM-encoded SSL private key<br/>(env: LLAMA_ARG_SSL_KEY_FILE) |
 | `--ssl-cert-file FNAME` | path to file a PEM-encoded SSL certificate<br/>(env: LLAMA_ARG_SSL_CERT_FILE) |
 | `--chat-template-kwargs STRING` | sets additional params for the json template parser, must be a valid json object string, e.g. '{"key1":"value1","key2":"value2"}'<br/>(env: LLAMA_ARG_CHAT_TEMPLATE_KWARGS) |

From dd1557907ae50e11814e25609b30825b67963663 Mon Sep 17 00:00:00 2001
From: Georgi Gerganov <ggerganov@gmail.com>
Date: Thu, 28 May 2026 17:29:11 +0300
Subject: [PATCH 08/50] ci : change Vulkan builds to Release to reduce ccache
 (#23820)

* ci : disable all CPU variant builds for Vulkan workflow

* cont : change cache key

* cont : change build type
---
 .github/workflows/build-vulkan.yml | 34 ++++++++++++++----------------
 1 file changed, 16 insertions(+), 18 deletions(-)

diff --git a/.github/workflows/build-vulkan.yml b/.github/workflows/build-vulkan.yml
index e6eab8fd0aa..d473b14c11d 100644
--- a/.github/workflows/build-vulkan.yml
+++ b/.github/workflows/build-vulkan.yml
@@ -52,14 +52,6 @@ jobs:
         id: checkout
         uses: actions/checkout@v6
 
-      - name: ccache
-        uses: ggml-org/ccache-action@v1.2.21
-        with:
-          key: vulkan-${{ matrix.os }}
-          variant: ccache
-          evict-old-files: 1d
-          save: ${{ github.event_name == 'push' && github.ref == 'refs/heads/master' }}
-
       - name: Dependencies
         id: depends
         run: |
@@ -68,14 +60,20 @@ jobs:
           echo "CC=gcc-14" >> "$GITHUB_ENV"
           echo "CXX=g++-14" >> "$GITHUB_ENV"
 
+      - name: ccache
+        uses: ggml-org/ccache-action@v1.2.21
+        with:
+          key: vulkan-${{ matrix.os }}-new
+          variant: ccache
+          evict-old-files: 1d
+          save: ${{ github.event_name == 'push' && github.ref == 'refs/heads/master' }}
+
       - name: Configure
         id: cmake_configure
         run: |
           cmake -B build \
             -G "Ninja" \
-            -DCMAKE_BUILD_TYPE=RelWithDebInfo \
-            -DGGML_BACKEND_DL=ON \
-            -DGGML_CPU_ALL_VARIANTS=ON \
+            -DCMAKE_BUILD_TYPE=Release \
             -DGGML_VULKAN=ON
 
       - name: Build
@@ -91,13 +89,6 @@ jobs:
         id: checkout
         uses: actions/checkout@v6
 
-      - name: ccache
-        uses: ggml-org/ccache-action@v1.2.21
-        with:
-          key: vulkan-ubuntu-24.04-llvmpipe
-          evict-old-files: 1d
-          save: ${{ github.event_name == 'push' && github.ref == 'refs/heads/master' }}
-
       - name: Dependencies
         id: depends
         run: |
@@ -124,6 +115,13 @@ jobs:
           path: ./vulkan_sdk
           version: ${{ env.VULKAN_SDK_VERSION }}
 
+      - name: ccache
+        uses: ggml-org/ccache-action@v1.2.21
+        with:
+          key: vulkan-ubuntu-24.04-llvmpipe
+          evict-old-files: 1d
+          save: ${{ github.event_name == 'push' && github.ref == 'refs/heads/master' }}
+
       - name: Build
         id: cmake_build
         run: |

From d6be3158e1fadc44ba6c1706dba1b00cd5c9ba88 Mon Sep 17 00:00:00 2001
From: Xuan-Son Nguyen <son@huggingface.co>
Date: Thu, 28 May 2026 16:31:37 +0200
Subject: [PATCH 09/50] mtmd: fix gemma 4 audio rms norm eps (#23815)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

* mtmd: fix gemma 4 audio rms norm eps

* Update tools/mtmd/clip.cpp

Co-authored-by: Sigbjørn Skjæret <sigbjorn.skjaeret@scala.com>

---------

Co-authored-by: Sigbjørn Skjæret <sigbjorn.skjaeret@scala.com>
---
 conversion/gemma.py | 11 ++++++-----
 tools/mtmd/clip.cpp |  3 +++
 2 files changed, 9 insertions(+), 5 deletions(-)

diff --git a/conversion/gemma.py b/conversion/gemma.py
index 1b427a30cd5..76beedcf0d3 100644
--- a/conversion/gemma.py
+++ b/conversion/gemma.py
@@ -786,14 +786,15 @@ def set_gguf_parameters(self):
         super().set_gguf_parameters()
 
         # vision params
+        assert self.hparams_vision is not None
         self.gguf_writer.add_clip_vision_projector_type(gguf.VisionProjectorType.GEMMA4V)
-        self.gguf_writer.add_vision_attention_layernorm_eps(self.hparams.get("layer_norm_eps", 1e-6))
+        self.gguf_writer.add_vision_attention_layernorm_eps(self.hparams_vision.get("layer_norm_eps", 1e-6))
 
         # audio params
-        if self.hparams_audio:
-            self.gguf_writer.add_clip_audio_projector_type(gguf.VisionProjectorType.GEMMA4A)
-            self.gguf_writer.add_audio_num_mel_bins(self.hparams_audio["feat_in"])
-            self.gguf_writer.add_audio_attention_layernorm_eps(1e-5)
+        assert self.hparams_audio is not None
+        self.gguf_writer.add_clip_audio_projector_type(gguf.VisionProjectorType.GEMMA4A)
+        self.gguf_writer.add_audio_num_mel_bins(self.hparams_audio["feat_in"])
+        self.gguf_writer.add_audio_attention_layernorm_eps(self.hparams_audio.get("layer_norm_eps", 1e-6))
 
     def is_audio_tensor(self, name: str) -> bool:
         return "audio_tower" in name or "embed_audio" in name
diff --git a/tools/mtmd/clip.cpp b/tools/mtmd/clip.cpp
index 5fd583d40bc..bbcae7609b9 100644
--- a/tools/mtmd/clip.cpp
+++ b/tools/mtmd/clip.cpp
@@ -1552,6 +1552,9 @@ struct clip_model_loader {
                         hparams.audio_n_fft            = 512;
                         hparams.audio_window_len       = 320;  // 20ms frame (NOT 25ms/400)
                         hparams.audio_hop_len          = 160;
+                        // due to a mistake in the original conversion code, rms_norm_eps is set to a wrong value
+                        // since all gemma4a models use 1e-6, we just hardcode it here to avoid re-conversion
+                        hparams.eps = 1e-6f;
                     } break;
                 case PROJECTOR_TYPE_GRANITE_SPEECH:
                     {

From 0b56d283bf912cfec59f9c11716df6fffb0767b4 Mon Sep 17 00:00:00 2001
From: Saba Fallah <10401143+sfallah@users.noreply.github.com>
Date: Thu, 28 May 2026 16:44:36 +0200
Subject: [PATCH 10/50] mtmd: n_head_kv defaults to n_head (#23782)

removed AI-generated comment
---
 tools/mtmd/clip-graph.h |  1 +
 tools/mtmd/clip.cpp     | 10 +++++++---
 2 files changed, 8 insertions(+), 3 deletions(-)

diff --git a/tools/mtmd/clip-graph.h b/tools/mtmd/clip-graph.h
index c5e880c71ec..1d9f6a136a9 100644
--- a/tools/mtmd/clip-graph.h
+++ b/tools/mtmd/clip-graph.h
@@ -29,6 +29,7 @@ struct clip_graph {
     const int n_patches;
     const int n_embd;
     const int n_head;
+    const int n_head_kv;
     const int d_head;
     const int n_layer;
     const int n_mmproj_embd;
diff --git a/tools/mtmd/clip.cpp b/tools/mtmd/clip.cpp
index bbcae7609b9..a7aa297c598 100644
--- a/tools/mtmd/clip.cpp
+++ b/tools/mtmd/clip.cpp
@@ -246,6 +246,7 @@ clip_graph::clip_graph(clip_ctx * ctx, const clip_image_f32 & img) :
         n_patches(n_patches_x * n_patches_y),
         n_embd(hparams.n_embd),
         n_head(hparams.n_head),
+        n_head_kv(hparams.n_head_kv),
         d_head(n_embd / n_head),
         n_layer(hparams.n_layer),
         n_mmproj_embd(clip_n_mmproj_embd(ctx)),
@@ -401,9 +402,9 @@ ggml_tensor * clip_graph::build_vit(
                     }
                 }
 
-                Qcur = ggml_reshape_3d(ctx0, Qcur, d_head, n_head, n_pos);
-                Kcur = ggml_reshape_3d(ctx0, Kcur, d_head, n_head, n_pos);
-                Vcur = ggml_reshape_3d(ctx0, Vcur, d_head, n_head, n_pos);
+                Qcur = ggml_reshape_3d(ctx0, Qcur, d_head, n_head,    n_pos);
+                Kcur = ggml_reshape_3d(ctx0, Kcur, d_head, n_head_kv, n_pos);
+                Vcur = ggml_reshape_3d(ctx0, Vcur, d_head, n_head_kv, n_pos);
 
                 if (norm_per_head) {
                     if (layer.q_norm) {
@@ -1120,6 +1121,9 @@ struct clip_model_loader {
             get_u32(string_format(KEY_PROJ_DIM,       prefix), hparams.projection_dim);
             get_f32(string_format(KEY_LAYER_NORM_EPS, prefix), hparams.eps);
 
+            // n_head_kv is optional (for GQA), default to n_head
+            hparams.n_head_kv = hparams.n_head;
+
             if (is_vision) {
                 get_u32(KEY_IMAGE_SIZE, hparams.image_size);
                 get_u32(KEY_PATCH_SIZE, hparams.patch_size);

From 479a9a1b03f831d416ec0882165b29e8bed5d98d Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Adrien=20Gallou=C3=ABt?= <angt@huggingface.co>
Date: Thu, 28 May 2026 16:45:06 +0200
Subject: [PATCH 11/50] app : improve help output (#23805)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Signed-off-by: Adrien Gallouët <angt@huggingface.co>
---
 app/llama.cpp | 20 +++++++++++++-------
 1 file changed, 13 insertions(+), 7 deletions(-)

diff --git a/app/llama.cpp b/app/llama.cpp
index b0b86fd47d9..0e932c35537 100644
--- a/app/llama.cpp
+++ b/app/llama.cpp
@@ -17,6 +17,8 @@ int llama_fit_params(int argc, char ** argv);
 int llama_quantize(int argc, char ** argv);
 int llama_perplexity(int argc, char ** argv);
 
+static const char * progname;
+
 static int help(int argc, char ** argv);
 static int version(int argc, char ** argv);
 
@@ -37,8 +39,8 @@ static const command cmds[] = {
     {"fit-params",    "Compute parameters to fit a model in device memory", {},           true,  llama_fit_params   },
     {"quantize",      "Quantize a model",                                   {},           true,  llama_quantize     },
     {"perplexity",    "Compute model perplexity and KL divergence",         {},           true,  llama_perplexity   },
-    {"version",       "Show version",                                       {},           true,  version            },
-    {"help",          "Show available commands",                            {},           true,  help               },
+    {"version",       "Show version",                                       {},           false, version            },
+    {"help",          "Show available commands",                            {},           false, help               },
 };
 
 static int version(int argc, char ** argv) {
@@ -49,14 +51,19 @@ static int version(int argc, char ** argv) {
 static int help(int argc, char ** argv) {
     const bool show_all = argc >= 2 && std::string(argv[1]) == "all";
 
-    printf("Usage: llama <command> [options]\n\nAvailable commands:\n");
+    printf("Usage: %s <command> [options]\n\nAvailable commands:\n", progname);
 
     for (const auto & cmd : cmds) {
         if (show_all || !cmd.hidden) {
             printf("  %-15s %s\n", cmd.name, cmd.desc);
         }
     }
-    printf("\nRun 'llama <command> --help' for command-specific usage.\n");
+    printf("\n");
+
+    if (!show_all) {
+        printf("Run '%s help all' to show additional commands.\n", progname);
+    }
+    printf("Run '%s <command> --help' for command-specific usage.\n", progname);
 
     return 0;
 }
@@ -74,13 +81,12 @@ static bool matches(const std::string & arg, const command & cmd) {
 }
 
 int main(int argc, char ** argv) {
+    progname = argv[0];
     const std::string arg = argc >= 2 ? argv[1] : "help";
 
     for (const auto & cmd : cmds) {
         if (matches(arg, cmd)) {
-
-            // router spawns children through this same binary, it needs the
-            // subcommand to relaunch as 'llama serve' and not bare options
+            // keep cmd.name so the router's child processes re-invoke correctly
 #ifdef _WIN32
             _putenv_s("LLAMA_APP_CMD", cmd.name);
 #else

From 445b7cef62318aabc755ff17745a1d5cbc95360e Mon Sep 17 00:00:00 2001
From: Georgi Gerganov <ggerganov@gmail.com>
Date: Thu, 28 May 2026 17:50:32 +0300
Subject: [PATCH 12/50] ci : releases use Github-hosted builds for the UI
 (#23823)

* ci : releases use Github-hosted builds for the UI

* cont : fix name
---
 .github/workflows/ui-build-self-hosted.yml | 43 ++++++++++++++++++++++
 .github/workflows/ui-build.yml             |  2 +-
 .github/workflows/ui-self-hosted.yml       |  6 +--
 3 files changed, 47 insertions(+), 4 deletions(-)
 create mode 100644 .github/workflows/ui-build-self-hosted.yml

diff --git a/.github/workflows/ui-build-self-hosted.yml b/.github/workflows/ui-build-self-hosted.yml
new file mode 100644
index 00000000000..e5d576cda62
--- /dev/null
+++ b/.github/workflows/ui-build-self-hosted.yml
@@ -0,0 +1,43 @@
+name: UI Build (self-hosted)
+
+on:
+  workflow_call:
+
+jobs:
+  build:
+    runs-on: [self-hosted, fast]
+    env:
+      BRANCH_NAME: ${{ github.head_ref || github.ref_name }}
+
+    steps:
+      - name: Checkout code
+        uses: actions/checkout@v6
+
+      - name: Setup Node.js
+        uses: actions/setup-node@v6
+        with:
+          node-version: "24"
+          cache: "npm"
+          cache-dependency-path: "tools/ui/package-lock.json"
+
+      - name: Install dependencies
+        run: npm ci
+        working-directory: tools/ui
+
+      - name: Build application
+        run: npm run build
+        working-directory: tools/ui
+
+      - name: Generate checksums
+        run: |
+          cd tools/ui/dist
+          for f in *; do
+            sha256sum "$f" | awk '{print $1, $2}' >> checksums.txt
+          done
+
+      - name: Upload built UI
+        uses: actions/upload-artifact@v6
+        with:
+          name: ui-build
+          path: tools/ui/dist/
+          retention-days: 1
diff --git a/.github/workflows/ui-build.yml b/.github/workflows/ui-build.yml
index 2653afd06c7..92b0573fb8d 100644
--- a/.github/workflows/ui-build.yml
+++ b/.github/workflows/ui-build.yml
@@ -5,7 +5,7 @@ on:
 
 jobs:
   build:
-    runs-on: [self-hosted, fast]
+    runs-on: ubuntu-slim
     env:
       BRANCH_NAME: ${{ github.head_ref || github.ref_name }}
 
diff --git a/.github/workflows/ui-self-hosted.yml b/.github/workflows/ui-self-hosted.yml
index 8a97a8284e5..5457d900c87 100644
--- a/.github/workflows/ui-self-hosted.yml
+++ b/.github/workflows/ui-self-hosted.yml
@@ -16,7 +16,7 @@ on:
       - master
     paths: [
       '.github/workflows/ui-self-hosted.yml',
-      '.github/workflows/ui-build.yml',
+      '.github/workflows/ui-build-self-hosted.yml',
       'tools/ui/**.*',
       'tools/server/tests/**.*'
     ]
@@ -24,7 +24,7 @@ on:
     types: [opened, synchronize, reopened]
     paths: [
       '.github/workflows/ui-self-hosted.yml',
-      '.github/workflows/ui-build.yml',
+      '.github/workflows/ui-build-self-hosted.yml',
       'tools/ui/**.*',
       'tools/server/tests/**.*'
     ]
@@ -42,7 +42,7 @@ concurrency:
 jobs:
   ui-build:
     name: Build static output
-    uses: ./.github/workflows/ui-build.yml
+    uses: ./.github/workflows/ui-build-self-hosted.yml
 
   ui-checks:
     name: Checks

From 2f6c815dc450106ef877ae32f4472bfd5cf83e47 Mon Sep 17 00:00:00 2001
From: ValdikSS <iam@valdikss.org.ru>
Date: Thu, 28 May 2026 18:36:10 +0300
Subject: [PATCH 13/50] ui: fix audio and video modality detection (#23756)

When model props are fetched asynchronously from the server,
modelPropsVersion is incremented to trigger reactivity, but
only the vision effect was listening to it.
---
 .../chat/ChatForm/ChatFormActions/ChatFormActionModels.svelte | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/tools/ui/src/lib/components/app/chat/ChatForm/ChatFormActions/ChatFormActionModels.svelte b/tools/ui/src/lib/components/app/chat/ChatForm/ChatFormActions/ChatFormActionModels.svelte
index 07f079f5b51..fd866b243ea 100644
--- a/tools/ui/src/lib/components/app/chat/ChatForm/ChatFormActions/ChatFormActionModels.svelte
+++ b/tools/ui/src/lib/components/app/chat/ChatForm/ChatFormActions/ChatFormActionModels.svelte
@@ -106,10 +106,14 @@
 	});
 
 	$effect(() => {
+		void modelPropsVersion;
+
 		hasAudioModality = activeModelId ? modelsStore.modelSupportsAudio(activeModelId) : false;
 	});
 
 	$effect(() => {
+		void modelPropsVersion;
+
 		hasVideoModality = activeModelId ? modelsStore.modelSupportsVideo(activeModelId) : false;
 	});
 

From 3ef236955173431f311e9cd6b12b61bbb56254d5 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Sigbj=C3=B8rn=20Skj=C3=A6ret?= <sigbjorn.skjaeret@scala.com>
Date: Thu, 28 May 2026 19:58:32 +0200
Subject: [PATCH 14/50] ci : run ui publish on ubuntu-slim (#23818)

* run ui publish on self-hosted fast

* run on ubuntu-slim
---
 .github/workflows/ui-publish.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/workflows/ui-publish.yml b/.github/workflows/ui-publish.yml
index 8a0d991930c..cec0fa52a12 100644
--- a/.github/workflows/ui-publish.yml
+++ b/.github/workflows/ui-publish.yml
@@ -20,7 +20,7 @@ jobs:
   publish:
     name: Publish UI Static Output
     needs: build
-    runs-on: ubuntu-24.04-arm
+    runs-on: ubuntu-slim
 
     permissions:
       contents: read

From 408ae2b9e5a073d2897b2317890cc8dff5cf3cec Mon Sep 17 00:00:00 2001
From: lhez <lih@qti.qualcomm.com>
Date: Thu, 28 May 2026 11:05:42 -0700
Subject: [PATCH 15/50] opencl: move backend info printing into its own
 function (#23702)

* opencl: move backend info print into its own function

* opencl: move new log line

* opencl: fix for non adreno path
---
 ggml/src/ggml-opencl/ggml-opencl.cpp | 155 +++++++++++++++------------
 1 file changed, 86 insertions(+), 69 deletions(-)

diff --git a/ggml/src/ggml-opencl/ggml-opencl.cpp b/ggml/src/ggml-opencl/ggml-opencl.cpp
index 6d6c3e8973d..751ec6116c0 100644
--- a/ggml/src/ggml-opencl/ggml-opencl.cpp
+++ b/ggml/src/ggml-opencl/ggml-opencl.cpp
@@ -379,6 +379,8 @@ struct ggml_backend_opencl_device_context {
     GPU_FAMILY     gpu_family = GPU_FAMILY::UNKNOWN;
     ADRENO_GPU_GEN adreno_gen = ADRENO_GPU_GEN::ADRENO_UNKNOWN;
 
+    std::regex *opfilter = nullptr; // regex of ops to not claim
+    std::string opfilter_str; // regex string for opfilter
     size_t global_mem_size = 0;
 };
 
@@ -415,8 +417,6 @@ struct ggml_backend_opencl_context {
     bool has_qcom_subgroup_shuffle = false;     // cl_qcom_subgroup_shuffle
     bool disable_fusion;
 
-    std::regex *opfilter = nullptr; // regex of ops to not claim
-
     bool adreno_has_large_buffer;
     bool adreno_use_large_buffer;
     ggml_cl_compiler_version adreno_cl_compiler_version;
@@ -428,6 +428,8 @@ struct ggml_backend_opencl_context {
     size_t  image2d_max_width;
     size_t  image2d_max_height;
 
+    cl_device_svm_capabilities svm_caps;
+
     cl_context context;
     cl_command_queue queue;
 
@@ -3731,6 +3733,68 @@ static std::vector<ggml_backend_device> ggml_opencl_probe_devices(ggml_backend_r
     return found_devices;
 }
 
+// Log the device capabilities and build/runtime configuration recorded for an
+// initialized OpenCL device; requires dev_ctx->backend_ctx to be set. Not purely
+// read-only: adreno_use_large_buffer is cleared when the driver lacks support.
+static void ggml_opencl_print_backend_info(ggml_backend_opencl_device_context * dev_ctx) {
+    GGML_ASSERT(dev_ctx);
+    GGML_ASSERT(dev_ctx->backend_ctx);
+
+    auto * backend_ctx = dev_ctx->backend_ctx;
+
+    GGML_LOG_INFO("ggml_opencl: OpenCL driver: %s\n", backend_ctx->driver_version.c_str());
+    GGML_LOG_INFO("ggml_opencl: vector subgroup broadcast support: %s\n",
+        backend_ctx->has_vector_subgroup_broadcast ? "true" : "false");
+    GGML_LOG_INFO("ggml_opencl: device FP16 support: %s\n", backend_ctx->fp16_support ? "true" : "false");
+    GGML_LOG_INFO("ggml_opencl: mem base addr align: %u\n", backend_ctx->alignment);
+    GGML_LOG_INFO("ggml_opencl: global mem size: %zu MB\n", backend_ctx->global_mem_size/1024/1024);
+    GGML_LOG_INFO("ggml_opencl: max mem alloc size: %zu MB\n",
+        backend_ctx->max_alloc_size/1024/1024);
+    // the sizes below are queried as size_t, so use %zu (%lu is too narrow where unsigned long is 32-bit)
+    GGML_LOG_INFO("ggml_opencl: device max image buffer size (pixels): %zu\n",
+        backend_ctx->image_max_buffer_size);
+    GGML_LOG_INFO("ggml_opencl: device max image2d size: %zu x %zu\n",
+        backend_ctx->image2d_max_width, backend_ctx->image2d_max_height);
+    GGML_LOG_INFO("ggml_opencl: device max workgroup size: %zu\n",
+        backend_ctx->max_workgroup_size);
+    GGML_LOG_INFO("ggml_opencl: SVM coarse grain buffer support: %s\n",
+        backend_ctx->svm_caps & CL_DEVICE_SVM_COARSE_GRAIN_BUFFER ? "true" : "false");
+    GGML_LOG_INFO("ggml_opencl: SVM fine grain buffer support: %s\n",
+        backend_ctx->svm_caps & CL_DEVICE_SVM_FINE_GRAIN_BUFFER ? "true" : "false");
+    GGML_LOG_INFO("ggml_opencl: SVM fine grain system support: %s\n",
+        backend_ctx->svm_caps & CL_DEVICE_SVM_FINE_GRAIN_SYSTEM ? "true" : "false");
+    GGML_LOG_INFO("ggml_opencl: SVM atomics support: %s\n",
+        backend_ctx->svm_caps & CL_DEVICE_SVM_ATOMICS ? "true" : "false");
+    GGML_LOG_INFO("ggml_opencl: cl_qcom_subgroup_shuffle support: %s\n",
+        backend_ctx->has_qcom_subgroup_shuffle ? "true" : "false");
+
+    // Print out configurations
+#ifdef GGML_OPENCL_SOA_Q
+    GGML_LOG_INFO("ggml_opencl: flattening quantized weights representation as struct of arrays (GGML_OPENCL_SOA_Q)\n");
+#endif // GGML_OPENCL_SOA_Q
+
+#ifdef GGML_OPENCL_USE_ADRENO_KERNELS
+    GGML_LOG_INFO("ggml_opencl: using kernels optimized for Adreno (GGML_OPENCL_USE_ADRENO_KERNELS)\n");
+    if (backend_ctx->adreno_xmem_gemm_enabled) {
+        GGML_LOG_INFO("ggml_opencl: Adreno xmem F16xF32 GEMM enabled (temporary weight prepack)\n");
+    }
+#endif // GGML_OPENCL_USE_ADRENO_KERNELS
+
+    if (backend_ctx->adreno_use_large_buffer) {
+        if (!backend_ctx->adreno_has_large_buffer) {
+            GGML_LOG_INFO("ggml_opencl: Adreno large buffer requested but not supported by driver, will use regular buffer\n");
+            backend_ctx->adreno_use_large_buffer = false;
+        } else {
+            GGML_LOG_INFO("ggml_opencl: Adreno large buffer enabled\n");
+        }
+    }
+
+    if (dev_ctx->opfilter) {
+        // for information only, the actual regex object is created in ggml_opencl_is_device_supported
+        GGML_LOG_INFO("ggml_opencl: opfilter regex = \"%s\"\n", dev_ctx->opfilter_str.c_str());
+    }
+}
+
 // check if device should be accepted
 static bool ggml_opencl_is_device_supported(ggml_backend_dev_t dev) {
     GGML_ASSERT(dev);
@@ -3799,6 +3863,13 @@ static bool ggml_opencl_is_device_supported(ggml_backend_dev_t dev) {
     }
 
     clGetDeviceInfo(dev_ctx->device, CL_DEVICE_GLOBAL_MEM_SIZE, sizeof(size_t), &dev_ctx->global_mem_size, NULL);
+
+    const char * str_opfilter = getenv("GGML_OPENCL_OPFILTER");
+    if (str_opfilter) {
+        dev_ctx->opfilter_str = str_opfilter;
+        dev_ctx->opfilter = new std::regex(str_opfilter, std::regex_constants::icase);
+    }
+
     return true;
 }
 
@@ -3850,15 +3921,12 @@ static ggml_backend_opencl_context * ggml_cl_init(ggml_backend_dev_t dev) {
     char *driver_version = (char *)alloca(driver_version_str_size + 1);
     clGetDeviceInfo(device, CL_DRIVER_VERSION, driver_version_str_size, driver_version, NULL);
     driver_version[driver_version_str_size] = '\0';
-    GGML_LOG_INFO("ggml_opencl: OpenCL driver: %s\n", driver_version);
     backend_ctx->driver_version = driver_version;
 
     backend_ctx->adreno_cl_compiler_version = get_adreno_cl_compiler_version(driver_version);
     backend_ctx->has_vector_subgroup_broadcast =
         (backend_ctx->adreno_cl_compiler_version.type == E031 && backend_ctx->adreno_cl_compiler_version.major >= 47) ||
         (backend_ctx->adreno_cl_compiler_version.type == DX   && backend_ctx->adreno_cl_compiler_version.major >= 17);
-    GGML_LOG_INFO("ggml_opencl: vector subgroup broadcast support: %s\n",
-        backend_ctx->has_vector_subgroup_broadcast ? "true" : "false");
 
     size_t ext_str_size;
     clGetDeviceInfo(device, CL_DEVICE_EXTENSIONS, 0, NULL, &ext_str_size);
@@ -3867,18 +3935,12 @@ static ggml_backend_opencl_context * ggml_cl_init(ggml_backend_dev_t dev) {
     ext_buffer[ext_str_size] = '\0'; // ensure it is null terminated
 
     // check support for qcom_subgroup_shuffle
-    if (opencl_c_version.major == 3 && strstr(ext_buffer, "cl_khr_subgroups") != NULL) {
-        GGML_LOG_INFO("ggml_opencl: cl_khr_subgroups support: true\n");
-        if (strstr(ext_buffer, "cl_qcom_subgroup_shuffle") != NULL) {
-            backend_ctx->has_qcom_subgroup_shuffle = true;
-        }
+    if (strstr(ext_buffer, "cl_qcom_subgroup_shuffle") != NULL) {
+        backend_ctx->has_qcom_subgroup_shuffle = true;
     }
-    GGML_LOG_INFO("ggml_opencl: cl_qcom_subgroup_shuffle support: %s\n",
-        backend_ctx->has_qcom_subgroup_shuffle ? "true" : "false");
 
     // Check if ext_buffer contains cl_khr_fp16
     backend_ctx->fp16_support = strstr(ext_buffer, "cl_khr_fp16") != NULL;
-    GGML_LOG_INFO("ggml_opencl: device FP16 support: %s\n", backend_ctx->fp16_support ? "true" : "false");
 
     // check Adreno large buffer support
     backend_ctx->adreno_has_large_buffer = strstr(ext_buffer, "cl_qcom_large_buffer") != NULL;
@@ -3887,35 +3949,15 @@ static ggml_backend_opencl_context * ggml_cl_init(ggml_backend_dev_t dev) {
     CL_CHECK(clGetDeviceInfo(device, CL_DEVICE_MEM_BASE_ADDR_ALIGN, sizeof(cl_uint), &base_align_in_bits, NULL));
     GGML_ASSERT(base_align_in_bits % 8u == 0);
     backend_ctx->alignment = base_align_in_bits / 8u;
-    GGML_LOG_INFO("ggml_opencl: mem base addr align: %u\n", backend_ctx->alignment);
 
     backend_ctx->global_mem_size = dev_ctx->global_mem_size;
-    GGML_LOG_INFO("ggml_opencl: global mem size: %zu MB\n", backend_ctx->global_mem_size/1024/1024);
-
-    clGetDeviceInfo(device, CL_DEVICE_MAX_MEM_ALLOC_SIZE, sizeof(size_t), &backend_ctx->max_alloc_size, NULL);
-    GGML_LOG_INFO("ggml_opencl: max mem alloc size: %zu MB\n", backend_ctx->max_alloc_size/1024/1024);
-
-    clGetDeviceInfo(device, CL_DEVICE_IMAGE_MAX_BUFFER_SIZE, sizeof(size_t), &backend_ctx->image_max_buffer_size, NULL);
-    GGML_LOG_INFO("ggml_opencl: device max image buffer size (pixels): %lu\n", backend_ctx->image_max_buffer_size);
 
-    clGetDeviceInfo(device, CL_DEVICE_IMAGE2D_MAX_WIDTH, sizeof(size_t), &backend_ctx->image2d_max_width, NULL);
-    clGetDeviceInfo(device, CL_DEVICE_IMAGE2D_MAX_HEIGHT, sizeof(size_t), &backend_ctx->image2d_max_height, NULL);
-    GGML_LOG_INFO("ggml_opencl: device max image2d size: %lu x %lu\n", backend_ctx->image2d_max_width, backend_ctx->image2d_max_height);
-
-    clGetDeviceInfo(device, CL_DEVICE_MAX_WORK_GROUP_SIZE, sizeof(size_t), &backend_ctx->max_workgroup_size, NULL);
-    GGML_LOG_INFO("ggml_opencl: device max workgroup size: %lu\n", backend_ctx->max_workgroup_size);
-
-    // Check SVM.
-    cl_device_svm_capabilities svm_caps;
-    CL_CHECK(clGetDeviceInfo(device, CL_DEVICE_SVM_CAPABILITIES, sizeof(cl_device_svm_capabilities), &svm_caps, 0));
-    GGML_LOG_INFO("ggml_opencl: SVM coarse grain buffer support: %s\n",
-        svm_caps & CL_DEVICE_SVM_COARSE_GRAIN_BUFFER ? "true" : "false");
-    GGML_LOG_INFO("ggml_opencl: SVM fine grain buffer support: %s\n",
-        svm_caps & CL_DEVICE_SVM_FINE_GRAIN_BUFFER ? "true" : "false");
-    GGML_LOG_INFO("ggml_opencl: SVM fine grain system support: %s\n",
-        svm_caps & CL_DEVICE_SVM_FINE_GRAIN_SYSTEM ? "true" : "false");
-    GGML_LOG_INFO("ggml_opencl: SVM atomics support: %s\n",
-        svm_caps & CL_DEVICE_SVM_ATOMICS ? "true" : "false");
+    CL_CHECK(clGetDeviceInfo(device, CL_DEVICE_MAX_MEM_ALLOC_SIZE, sizeof(size_t), &backend_ctx->max_alloc_size, NULL));
+    CL_CHECK(clGetDeviceInfo(device, CL_DEVICE_IMAGE_MAX_BUFFER_SIZE, sizeof(size_t), &backend_ctx->image_max_buffer_size, NULL));
+    CL_CHECK(clGetDeviceInfo(device, CL_DEVICE_IMAGE2D_MAX_WIDTH, sizeof(size_t), &backend_ctx->image2d_max_width, NULL));
+    CL_CHECK(clGetDeviceInfo(device, CL_DEVICE_IMAGE2D_MAX_HEIGHT, sizeof(size_t), &backend_ctx->image2d_max_height, NULL));
+    CL_CHECK(clGetDeviceInfo(device, CL_DEVICE_MAX_WORK_GROUP_SIZE, sizeof(size_t), &backend_ctx->max_workgroup_size, NULL));
+    CL_CHECK(clGetDeviceInfo(device, CL_DEVICE_SVM_CAPABILITIES, sizeof(cl_device_svm_capabilities), &backend_ctx->svm_caps, 0));
 
     if (opencl_c_version.major >= 3) {
         // Assume it is not available for 3.0, since it is optional in 3.0.
@@ -3931,36 +3973,15 @@ static ggml_backend_opencl_context * ggml_cl_init(ggml_backend_dev_t dev) {
         backend_ctx->non_uniform_workgroups = true;
     }
 
-    // Print out configurations
-#ifdef GGML_OPENCL_SOA_Q
-    GGML_LOG_INFO("ggml_opencl: flattening quantized weights representation as struct of arrays (GGML_OPENCL_SOA_Q)\n");
-#endif // GGML_OPENCL_SOA_Q
-
-#ifdef GGML_OPENCL_USE_ADRENO_KERNELS
-    GGML_LOG_INFO("ggml_opencl: using kernels optimized for Adreno (GGML_OPENCL_USE_ADRENO_KERNELS)\n");
-#endif // GGML_OPENCL_USE_ADRENO_KERNELS
-
 #ifdef GGML_OPENCL_USE_ADRENO_KERNELS
+    // determine whether to use Adreno xmem GEMM
     backend_ctx->adreno_xmem_gemm_enabled = getenv("GGML_OPENCL_ADRENO_XMEM_GEMM") != nullptr &&
                                              backend_ctx->gpu_family == GPU_FAMILY::ADRENO;
-    if (getenv("GGML_OPENCL_ADRENO_XMEM_GEMM") != nullptr) {
-        GGML_LOG_INFO("ggml_opencl: Adreno xmem F16xF32 GEMM %s\n",
-                      backend_ctx->adreno_xmem_gemm_enabled ?
-                      "enabled (temporary weight prepack)" : "requested but unsupported by this driver");
-    }
-#endif // GGML_OPENCL_USE_ADRENO_KERNELS
+#endif
 
     // determine whether to use large buffer for Adreno
     backend_ctx->adreno_use_large_buffer = getenv("GGML_OPENCL_ADRENO_USE_LARGE_BUFFER") != nullptr &&
                                            backend_ctx->gpu_family == GPU_FAMILY::ADRENO;
-    if (backend_ctx->adreno_use_large_buffer) {
-        if (!backend_ctx->adreno_has_large_buffer) {
-            GGML_LOG_INFO("ggml_opencl: Adreno large buffer requested but not supported by driver, will use regular buffer\n");
-            backend_ctx->adreno_use_large_buffer = false;
-        } else {
-            GGML_LOG_INFO("ggml_opencl: Adreno large buffer enabled\n");
-        }
-    }
 
     cl_int err;
 
@@ -4010,12 +4031,6 @@ static ggml_backend_opencl_context * ggml_cl_init(ggml_backend_dev_t dev) {
 
     backend_ctx->disable_fusion = getenv("GGML_OPENCL_DISABLE_FUSION") != nullptr;
 
-    const char * str_opfilter = getenv("GGML_OPENCL_OPFILTER");
-    if (str_opfilter) {
-        backend_ctx->opfilter = new std::regex(str_opfilter, std::regex_constants::icase);
-        GGML_LOG_INFO("ggml_opencl: opfilter regex = \"%s\"\n", str_opfilter);
-    }
-
     dev_ctx->backend_ctx = backend_ctx.release();
     return dev_ctx->backend_ctx;
 }
@@ -4825,7 +4840,7 @@ static bool ggml_opencl_supports_op(ggml_backend_dev_t dev, const struct ggml_te
     ggml_backend_opencl_context *        backend_ctx = dev_ctx->backend_ctx;
 
     // reject ops that match the opfilter regex
-    if (backend_ctx->opfilter && std::regex_match(std::string(ggml_op_desc(op)), *backend_ctx->opfilter)) {
+    if (dev_ctx->opfilter && std::regex_match(std::string(ggml_op_desc(op)), *dev_ctx->opfilter)) {
         return false;
     }
 
@@ -7823,6 +7838,8 @@ static ggml_backend_t ggml_backend_opencl_device_init(ggml_backend_dev_t dev, co
         /* .context   = */ backend_ctx,
     };
 
+    ggml_backend_opencl_device_context * dev_ctx = (ggml_backend_opencl_device_context *) dev->context;
+    ggml_opencl_print_backend_info(dev_ctx);
     return backend;
 
     GGML_UNUSED(params);

From c8914ad4f44e7312a8635cb52250c46cb0fe47d9 Mon Sep 17 00:00:00 2001
From: Xuan-Son Nguyen <son@huggingface.co>
Date: Thu, 28 May 2026 20:58:55 +0200
Subject: [PATCH 16/50] mtmd: fix gemma 4 projector pre_norm (#23822)

---
 tools/mtmd/models/gemma4v.cpp | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/tools/mtmd/models/gemma4v.cpp b/tools/mtmd/models/gemma4v.cpp
index 4068a08aaf9..3570d6da135 100644
--- a/tools/mtmd/models/gemma4v.cpp
+++ b/tools/mtmd/models/gemma4v.cpp
@@ -124,12 +124,12 @@ ggml_cgraph * clip_graph_gemma4v::build() {
     }
 
     // Gemma4MultimodalEmbedder
-    cur = build_mm(model.mm_input_proj_w, cur);
-    cb(cur, "projected", -1);
-
-    // embedding_post_projection_norm
-    cur = ggml_rms_norm(ctx0, cur, hparams.eps);
-    cb(cur, "projected_normed", -1);
+    {
+        // embedding_pre_projection_norm
+        cur = ggml_rms_norm(ctx0, cur, hparams.eps);
+        cur = build_mm(model.mm_input_proj_w, cur);
+        cb(cur, "projected", -1);
+    }
 
     ggml_build_forward_expand(gf, cur);
     return gf;

From 751ebd17a58a8a513994509214373bb9e6a3d66c Mon Sep 17 00:00:00 2001
From: Xuan-Son Nguyen <son@huggingface.co>
Date: Thu, 28 May 2026 20:59:14 +0200
Subject: [PATCH 17/50] mtmd-debug: add color and rainbow mode (#23829)

* mtmd-debug: add color and rainbow mode

* fix M_PI

* max_dist
---
 tools/mtmd/debug/mtmd-debug.cpp | 61 +++++++++++++++++++++++++++++++++
 tools/mtmd/debug/mtmd-debug.md  | 37 ++++++++++++++++++++
 2 files changed, 98 insertions(+)

diff --git a/tools/mtmd/debug/mtmd-debug.cpp b/tools/mtmd/debug/mtmd-debug.cpp
index f19ca4cfe29..b88a16f0f8b 100644
--- a/tools/mtmd/debug/mtmd-debug.cpp
+++ b/tools/mtmd/debug/mtmd-debug.cpp
@@ -30,7 +30,9 @@ static void show_additional_info(int /*argc*/, char ** argv) {
         "    -p \"encode\" (debugging encode pass, default case):\n"
         "        --image can be:\n"
         "          \"white\", \"black\", \"gray\": filled 1.0f, 0.0f and 0.5f respectively\n"
+        "          \"red\", \"green\", \"blue\": filled with respective colors\n"
         "          \"cb\": checkerboard pattern, alternate 1.0f and 0.0f\n"
+        "          \"rainbow\": raspberry-pi-like rainbow pattern\n"
         "        --audio can be:\n"
         "          \"one\", \"zero\", \"half\": filled 1.0f, 0.0f and 0.5f respectively\n"
         "          \"1010\": checkerboard pattern, alternate 1.0f and 0.0f\n"
@@ -144,6 +146,65 @@ int main(int argc, char ** argv) {
                     image[y][x * 3 + 2] = v;
                 }
             }
+        } else if (input == "red") {
+            for (int i = 0; i < inp_size; ++i) {
+                auto row = std::vector<float>(inp_size * 3, 0.0f);
+                for (int j = 0; j < inp_size; ++j) {
+                    row[j * 3 + 0] = 1.0f;  // R channel
+                }
+                image.push_back(row);
+            }
+        } else if (input == "green") {
+            for (int i = 0; i < inp_size; ++i) {
+                auto row = std::vector<float>(inp_size * 3, 0.0f);
+                for (int j = 0; j < inp_size; ++j) {
+                    row[j * 3 + 1] = 1.0f;  // G channel
+                }
+                image.push_back(row);
+            }
+        } else if (input == "blue") {
+            for (int i = 0; i < inp_size; ++i) {
+                auto row = std::vector<float>(inp_size * 3, 0.0f);
+                for (int j = 0; j < inp_size; ++j) {
+                    row[j * 3 + 2] = 1.0f;  // B channel
+                }
+                image.push_back(row);
+            }
+        } else if (input == "rainbow") {
+            for (int i = 0; i < inp_size; ++i) {
+                image.push_back(std::vector<float>(inp_size * 3, 0.0f));
+            }
+            float cx = inp_size / 2.0f;
+            float cy = inp_size / 2.0f;
+            float max_dist = std::sqrt(cx * cx + cy * cy);
+            for (int y = 0; y < inp_size; ++y) {
+                for (int x = 0; x < inp_size; ++x) {
+                    float dx = x - cx;
+                    float dy = y - cy;
+                    float hue = std::atan2(dy, dx) / (2.0f * 3.14159265f);
+                    if (hue < 0) hue += 1.0f;
+                    float sat = std::sqrt(dx * dx + dy * dy) / max_dist;
+                    if (sat > 1.0f) sat = 1.0f;
+                    float h6 = hue * 6.0f;
+                    int i6 = (int)h6;
+                    float f = h6 - i6;
+                    float p = 1.0f - sat;
+                    float q = 1.0f - sat * f;
+                    float t = 1.0f - sat * (1.0f - f);
+                    float r, g, b;
+                    switch (i6 % 6) {
+                        case 0: r=1; g=t; b=p; break;
+                        case 1: r=q; g=1; b=p; break;
+                        case 2: r=p; g=1; b=t; break;
+                        case 3: r=p; g=q; b=1; break;
+                        case 4: r=t; g=p; b=1; break;
+                        default: r=1; g=p; b=q; break;
+                    }
+                    image[y][x * 3 + 0] = r;
+                    image[y][x * 3 + 1] = g;
+                    image[y][x * 3 + 2] = b;
+                }
+            }
         } else if (input == "one") {
             samples = std::vector<float>(inp_size, 1.0f);
         } else if (input == "zero") {
diff --git a/tools/mtmd/debug/mtmd-debug.md b/tools/mtmd/debug/mtmd-debug.md
index 76ffe5c8451..71bd52dd4b3 100644
--- a/tools/mtmd/debug/mtmd-debug.md
+++ b/tools/mtmd/debug/mtmd-debug.md
@@ -20,6 +20,43 @@ def test_vision():
 test_vision()
 ```
 
+Example of debugging a rainbow image:
+
+```py
+import torch
+import math
+
+def make_rainbow(img_size):
+    cx, cy = img_size / 2.0, img_size / 2.0
+    max_dist = math.sqrt(cx * cx + cy * cy)
+    img = torch.zeros(1, 3, img_size, img_size)
+    for y in range(img_size):
+        for x in range(img_size):
+            dx, dy = x - cx, y - cy
+            hue = math.atan2(dy, dx) / (2 * math.pi)
+            if hue < 0:
+                hue += 1
+            sat = math.sqrt(dx * dx + dy * dy) / max_dist
+            sat = min(sat, 1.0)
+            h6 = hue * 6
+            i6 = int(h6)
+            f = h6 - i6
+            p = 1 - sat
+            q = 1 - sat * f
+            t = 1 - sat * (1 - f)
+            rgb = [(1,t,p),(q,1,p),(p,1,t),(p,q,1),(t,p,1),(1,p,q)][i6 % 6]
+            img[0, 0, y, x] = rgb[0]
+            img[0, 1, y, x] = rgb[1]
+            img[0, 2, y, x] = rgb[2]
+    return img
+
+img_size = 896
+pixel_values = make_rainbow(img_size)
+with torch.no_grad():
+    outputs = model.model.get_image_features(pixel_values=pixel_values)
+print("last_hidden_state:", outputs.last_hidden_state)
+```
+
 ## Debugging preprocess pass
 
 (TODO)

From 19e92c33ef974661e4b1e43dd48be231d07be5ed Mon Sep 17 00:00:00 2001
From: Max Krasnyansky <maxk@qti.qualcomm.com>
Date: Thu, 28 May 2026 14:05:54 -0700
Subject: [PATCH 18/50] hexagon: basic/generic op fusion support and
 RMS_NORM+MUL fusion (#23835)

Updating infra to enable op fusion and using RMS_NORM+MUL as the use-case.
---
 ggml/src/ggml-hexagon/ggml-hexagon.cpp     | 143 ++++++------
 ggml/src/ggml-hexagon/htp-opnode.h         | 241 +++++++++++++++++++++
 ggml/src/ggml-hexagon/htp/htp-ops.h        |   1 +
 ggml/src/ggml-hexagon/htp/main.c           |   1 +
 ggml/src/ggml-hexagon/htp/unary-ops.c      | 194 ++++++++++++++++-
 ggml/src/ggml-hexagon/op-desc.h            | 153 -------------
 scripts/snapdragon/ggml-hexagon-profile.py |   2 +-
 7 files changed, 498 insertions(+), 237 deletions(-)
 create mode 100644 ggml/src/ggml-hexagon/htp-opnode.h
 delete mode 100644 ggml/src/ggml-hexagon/op-desc.h

diff --git a/ggml/src/ggml-hexagon/ggml-hexagon.cpp b/ggml/src/ggml-hexagon/ggml-hexagon.cpp
index 3af7aff7028..48ded82e83c 100644
--- a/ggml/src/ggml-hexagon/ggml-hexagon.cpp
+++ b/ggml/src/ggml-hexagon/ggml-hexagon.cpp
@@ -39,7 +39,7 @@
 #include "ggml-hexagon.h"
 #include "ggml-impl.h"
 #include "ggml-quants.h"
-#include "op-desc.h"
+#include "htp-opnode.h"
 #include "htp-ops.h"
 #include "htp_iface.h"
 #include "htp-drv.h"
@@ -102,23 +102,23 @@ static const char * status_to_str(uint32_t status) {
 
 // ** debug helpers
 
-static void ggml_hexagon_dump_op_exec(const std::string &sess_name, const ggml_tensor * op, const uint32_t req_flags) {
+static void ggml_hexagon_dump_op_exec(const std::string &sess_name, const htp_opnode & node, const uint32_t req_flags) {
     if (!opt_verbose) return;
 
-    op_desc desc(op);
+    htp_opformat fmt(node);
     GGML_LOG_DEBUG("ggml-hex: %s execute-op %s: %s : %s : %s : %s : %s : flags 0x%x\n", sess_name.c_str(),
-                ggml_op_desc(op), desc.names, desc.dims, desc.types, desc.strides, desc.buffs, req_flags);
+                node.op_name().c_str(), fmt.names, fmt.dims, fmt.types, fmt.strides, fmt.buffs, req_flags);
 }
 
 static void ggml_hexagon_dump_op_supp(const std::string &sess_name, const struct ggml_tensor * op, bool supp) {
     if (!opt_verbose) return;
 
-    op_desc desc(op);
+    htp_opformat fmt(htp_opformat(htp_opnode{const_cast<ggml_tensor*>(op), {}, HTP_OP_INVALID}));
     GGML_LOG_DEBUG("ggml-hex: %s supports-op %s: %s : %s : %s : %s : %s : %s\n", sess_name.c_str(),
-                ggml_op_desc(op), desc.names, desc.dims, desc.types, desc.strides, desc.buffs, supp ? "yes" : "no");
+                ggml_op_desc(op), fmt.names, fmt.dims, fmt.types, fmt.strides, fmt.buffs, supp ? "yes" : "no");
 }
 
-static void ggml_hexagon_dump_op_prof(const std::string &sess_name, const ggml_tensor * op,
+static void ggml_hexagon_dump_op_prof(const std::string &sess_name, const htp_opnode & node,
                                       uint32_t op_usec, uint32_t op_cycles, const uint32_t pmu[]) {
     if (!opt_profile) return;
 
@@ -129,15 +129,16 @@ static void ggml_hexagon_dump_op_prof(const std::string &sess_name, const ggml_t
                 pmu[0], pmu[1], pmu[2], pmu[3], pmu[4], pmu[5], pmu[6], pmu[7]);
     }
 
-    op_desc desc(op);
+    htp_opformat fmt(node);
     GGML_LOG_DEBUG("ggml-hex: %s profile-op %s: %s : %s : %s : %s : usec %u cycles %u%s\n", sess_name.c_str(),
-            ggml_op_desc(op), desc.names, desc.dims, desc.types, desc.strides, op_usec, op_cycles, pmu_str);
+            node.op_name().c_str(), fmt.names, fmt.dims, fmt.types, fmt.strides, op_usec, op_cycles, pmu_str);
 }
 
 // ** backend sessions
 
 struct ggml_hexagon_opbatch;
 struct ggml_hexagon_opqueue;
+struct htp_opnode;
 
 struct ggml_hexagon_session {
     std::string      name;
@@ -167,7 +168,7 @@ struct ggml_hexagon_session {
     void allocate(int dev_id) noexcept(false);
     void release() noexcept(true);
 
-    void enqueue_op(htp_op_code opcode, const ggml_tensor *op);
+    void enqueue_op(const htp_opnode & node);
     void flush(bool all = true);
 
     void flush_pending(bool all = false);
@@ -1782,12 +1783,10 @@ static ggml_backend_buffer_type_i ggml_backend_hexagon_repack_buffer_type_interf
     /* .is_host          = */ ggml_backend_hexagon_repack_buffer_type_is_host,
 };
 
-// Backend session implementation
-
 struct ggml_hexagon_opbatch {
     ggml_hexagon_session*            sess;
 
-    std::vector<const ggml_tensor*>  ops;       // pointers to original ops
+    std::vector<htp_opnode>          ops;       // htp_opnode of ops
 
     std::vector<htp_buf_desc>        h_bufs;    // htp buffer descriptors
     std::vector<htp_tensor>          h_tens;    // htp tensor descriptors
@@ -1919,7 +1918,7 @@ struct ggml_hexagon_opbatch {
         return ti;
     }
 
-    bool fit_op(const struct ggml_tensor *t) const {
+    bool fit_op(const htp_opnode & node) const {
         if (n_ops >= n_ops_max ) return false;
 
         // check how much extras we will need
@@ -1939,10 +1938,10 @@ struct ggml_hexagon_opbatch {
             }
         };
 
-        for (unsigned int i=0; i < HTP_OP_MAX_INPUTS && t->src[i]; i++) {
-            fit_tensor(t->src[i]);
+        for (const auto * src : node.get_inputs()) {
+            fit_tensor(src);
         }
-        fit_tensor(t);
+        fit_tensor(node.dst());
 
         if ((extra_bufs + n_bufs) > n_bufs_max) return false;
         if ((extra_tens + n_tens) > n_tens_max) return false;
@@ -1952,29 +1951,30 @@ struct ggml_hexagon_opbatch {
     }
 
     // assumes that fit_op() was called first and returned true
-    void add_op(htp_op_code opcode, const struct ggml_tensor * t) {
+    void add_op(const htp_opnode & node) {
         // Add new op
 
         unsigned int n = n_ops++;
         GGML_ASSERT(n_ops <= n_ops_max);
 
-        ops[n] = t;
+        ops[n] = node;
 
         htp_op_desc &o = h_ops[n];
-        memcpy(&o.params, &t->op_params, sizeof(t->op_params));
-        o.opcode = opcode;
+        memcpy(&o.params, &node.node->op_params, sizeof(node.node->op_params));
+        o.opcode = node.opcode;
         o.flags  = 0;
 
         if (!(opt_opstage & HTP_OPSTAGE_COMPUTE)) {
             o.flags |= HTP_OPFLAGS_SKIP_COMPUTE;
         }
 
-        ggml_hexagon_dump_op_exec(sess->c_name(), t, o.flags);
+        ggml_hexagon_dump_op_exec(sess->c_name(), node, o.flags);
 
+        auto inputs = node.get_inputs();
         for (unsigned int i=0; i < HTP_OP_MAX_INPUTS; i++) {
-            o.src[i] = t->src[i] ? add_tensor(t->src[i]) : 0xffff;
+            o.src[i] = (i < inputs.size() && inputs[i]) ? add_tensor(inputs[i]) : 0xffff;
         }
-        o.dst = add_tensor(t);
+        o.dst = add_tensor(node.dst());
     }
 };
 
@@ -1983,7 +1983,7 @@ struct ggml_hexagon_opqueue {
     ggml_hexagon_shared_buffer *shm_buf;
     size_t                      shm_blk_size;
 
-    using opvec = std::vector<const ggml_tensor*>;
+    using opvec = std::vector<htp_opnode>;
 
     std::queue<unsigned int>    done;       // completed batch ids
     std::vector<opvec>          op_cache;   // per batch op cache
@@ -2182,11 +2182,11 @@ void ggml_hexagon_session::flush_batch() {
     }
 }
 
-void ggml_hexagon_session::enqueue_op(htp_op_code opcode, const ggml_tensor *op) {
-    if (!op_batch->fit_op(op)) {
+void ggml_hexagon_session::enqueue_op(const htp_opnode & node) {
+    if (!op_batch->fit_op(node)) {
         flush_batch();
     }
-    op_batch->add_op(opcode, op);
+    op_batch->add_op(node);
 }
 
 // Flush HTP response queue i.e wait for all outstanding requests to complete
@@ -3179,10 +3179,43 @@ static ggml_status ggml_backend_hexagon_graph_compute(ggml_backend_t backend, gg
 
     HEX_VERBOSE("ggml-hex: %s graph-compute n_nodes %d\n", sess->c_name(), graph->n_nodes);
 
+    std::vector<htp_opnode> nodes;
+    nodes.reserve(graph->n_nodes);
+
+    // Fusion
     for (int i = 0; i < graph->n_nodes; ++i) {
         ggml_tensor * n = graph->nodes[i];
-        if (op_is_compute(n) && (opt_opstage & HTP_OPSTAGE_QUEUE)) {
-            sess->enqueue_op(op_remap_to_htp(n), n);
+        if (!op_is_compute(n)) {
+            continue;
+        }
+
+        ggml_tensor * next_node = (i + 1 < graph->n_nodes) ? graph->nodes[i + 1] : nullptr;
+
+        htp_opnode node = {
+            /*.node =*/ n,
+            /*.fused =*/ {},
+            /*.opcode =*/ HTP_OP_INVALID
+        };
+
+        if (n->op == GGML_OP_RMS_NORM && next_node) {
+            if (next_node->op == GGML_OP_MUL && op_is_compute(next_node) && ggml_can_fuse(graph, i, { GGML_OP_RMS_NORM, GGML_OP_MUL })) {
+                node.add_fused(next_node);
+                node.opcode = HTP_OP_RMS_NORM_MUL;
+                i++; // skip the fused MUL node
+            }
+        }
+
+        if (node.opcode == HTP_OP_INVALID) {
+            node.opcode = op_remap_to_htp(n);
+        }
+
+        nodes.push_back(std::move(node));
+    }
+
+    // Queue and execute
+    if (opt_opstage & HTP_OPSTAGE_QUEUE) {
+        for (const auto & node : nodes) {
+            sess->enqueue_op(node);
         }
     }
 
@@ -3201,51 +3234,7 @@ static void ggml_backend_hexagon_synchronize(ggml_backend_t backend) {
     sess->flush();
 }
 
-struct node_info {
-    ggml_tensor * node;
-
-    std::vector<ggml_tensor *> fused;
-
-    ggml_op op() const {
-        return node->op;
-    }
-
-    const ggml_tensor * dst() const {
-        return fused.empty() ? node : fused.back();
-    }
-
-    const ggml_tensor * src0() const {
-        return node->src[0];
-    }
-
-    const ggml_tensor * src1() const {
-        return node->src[1];
-    }
-
-    bool is_empty() const {
-        return ggml_op_is_empty(node->op);
-    }
-
-    void add_fused(ggml_tensor * t) {
-        fused.push_back(t);
-    }
-
-    bool stackable() const {
-        switch (this->op()) {
-            case GGML_OP_MUL_MAT:
-            case GGML_OP_MUL_MAT_ID:
-                return ggml_is_quantized(this->src0()->type);
-            default:
-                return false;
-        }
-    }
-
-    bool same_input(const node_info& n) const {
-        return n.src1() == this->src1();
-    }
-};
-
-static std::vector<int> ggml_hexagon_graph_optimize_reorder(const std::vector<node_info> & nodes) {
+static std::vector<int> ggml_hexagon_graph_optimize_reorder(const std::vector<htp_opnode> & nodes) {
     const int n = nodes.size();
 
     std::vector<int> res;
@@ -3299,14 +3288,14 @@ static void ggml_backend_hexagon_graph_optimize(ggml_backend_t backend, ggml_cgr
 
     enum ggml_op ops[MAX_FUSE];
 
-    std::vector<node_info> nodes;
+    std::vector<htp_opnode> nodes;
     nodes.reserve(gf->n_nodes);
 
     // fuse nodes:
     // we don't want to make reorders that break fusing, so we first pack all fusable tensors
     //   and perform the reorder over the fused nodes. after the reorder is done, we unfuse
     for (int i = 0; i < n; i++) {
-        node_info node = {
+        htp_opnode node = {
             /*.node =*/gf->nodes[i],
             /*.fused =*/{},
         };
diff --git a/ggml/src/ggml-hexagon/htp-opnode.h b/ggml/src/ggml-hexagon/htp-opnode.h
new file mode 100644
index 00000000000..14b232240b4
--- /dev/null
+++ b/ggml/src/ggml-hexagon/htp-opnode.h
@@ -0,0 +1,241 @@
+#ifndef HTP_OPNODE_H
+#define HTP_OPNODE_H
+
+#define GGML_COMMON_IMPL_CPP
+#include "ggml-backend-impl.h"
+#include "ggml-common.h"
+
+#include <string>
+#include <vector>
+#include <stdio.h>
+#include "htp-ops.h"
+
+struct htp_opnode {  // One HTP op: a primary ggml node plus any follow-on nodes fused into it
+    ggml_tensor * node = nullptr;  // primary ggml op; the accessors below dereference it, so it must be set before use
+
+    std::vector<ggml_tensor *> fused;  // ops fused after `node`, in the order they were added
+
+    htp_op_code opcode = HTP_OP_INVALID;  // HTP opcode for the whole (possibly fused) node; defaults to HTP_OP_INVALID
+
+    ggml_op op() const {  // ggml op of the primary node only (fused ops are not reflected)
+        return node->op;
+    }
+
+    const ggml_tensor * dst() const {  // tensor holding the final result of the node
+        return fused.empty() ? node : fused.back();  // with fusion, the last fused op is the output
+    }
+
+    const ggml_tensor * src0() const {  // first source of the primary node
+        return node->src[0];
+    }
+
+    const ggml_tensor * src1() const {  // second source of the primary node (may be null)
+        return node->src[1];
+    }
+
+    bool is_empty() const {  // forwards the primary op to ggml_op_is_empty()
+        return ggml_op_is_empty(node->op);
+    }
+
+    void add_fused(ggml_tensor * t) {  // appends `t` as the new last fused op, which also makes it dst()
+        fused.push_back(t);
+    }
+
+    bool stackable() const {  // true only for MUL_MAT / MUL_MAT_ID whose src0 has a quantized type
+        switch (this->op()) {
+            case GGML_OP_MUL_MAT:
+            case GGML_OP_MUL_MAT_ID:
+                return ggml_is_quantized(this->src0()->type);
+            default:
+                return false;
+        }
+    }
+
+    bool same_input(const htp_opnode& n) const {  // true when both primary nodes share the same src1 tensor (pointer compare)
+        return n.src1() == this->src1();
+    }
+
+    std::vector<const ggml_tensor *> get_inputs() const {  // external inputs of the node: de-duplicated, in first-seen order
+        std::vector<const ggml_tensor *> inputs;
+        std::vector<const ggml_tensor *> outputs;  // tensors produced inside this node (primary + fused)
+        outputs.push_back(node);
+        for (const auto * f : fused) {
+            outputs.push_back(f);
+        }
+
+        auto contains = [&](const std::vector<const ggml_tensor *> & vec, const ggml_tensor * t) {  // linear pointer search
+            for (const auto * x : vec) {
+                if (x == t) return true;
+            }
+            return false;
+        };
+
+        auto add_input = [&](const ggml_tensor * t) {  // keep `t` only if non-null, not produced in this node, and not already listed
+            if (t && !contains(outputs, t) && !contains(inputs, t)) {
+                inputs.push_back(t);
+            }
+        };
+
+        for (int i = 0; i < GGML_MAX_SRC && node->src[i]; i++) {  // stops at the first null src slot
+            add_input(node->src[i]);
+        }
+        for (const auto * f : fused) {  // sources of fused ops; intermediates of the node are filtered out by add_input
+            for (int i = 0; i < GGML_MAX_SRC && f->src[i]; i++) {
+                add_input(f->src[i]);
+            }
+        }
+        return inputs;
+    }
+
+    std::string op_name() const {  // ggml_op_desc() of the primary op, followed by "+<desc>" for each fused op
+        if (fused.empty()) {
+            return ggml_op_desc(node);
+        }
+        std::string name = ggml_op_desc(node);
+        for (const auto * f : fused) {
+            name += "+";
+            name += ggml_op_desc(f);
+        }
+        return name;
+    }
+};
+
+struct htp_opformat {  // Text renderings of an htp_opnode, each laid out as "in0 x in1 ... -> dst"
+    char strides[64 * GGML_MAX_SRC];  // NOTE(review): all five fields are filled with unbounded sprintf - confirm the sizes cover fused nodes and long names
+    char dims[64 * GGML_MAX_SRC];  // per-tensor element counts
+    char types[16 * GGML_MAX_SRC];  // per-tensor ggml type names
+    char buffs[64 * GGML_MAX_SRC];  // per-tensor backend buffer names
+    char names[64 * GGML_MAX_SRC];  // per-tensor names
+
+    int format_tensor_dims(char * str, const struct ggml_tensor * t) {  // writes the dims of `t` into `str`; returns sprintf's character count
+        if (t->ne[2] == 1 && t->ne[3] == 1) {  // short "ne0:ne1" form when the two outer dims are 1
+            return sprintf(str, "%d:%d", (int) t->ne[0], (int) t->ne[1]);
+        } else {
+            return sprintf(str, "%d:%d:%d:%d", (int) t->ne[0], (int) t->ne[1], (int) t->ne[2], (int) t->ne[3]);
+        }
+    }
+
+    void format_op_dims(char * str, const htp_opnode & node) {  // dims of every input joined by " x ", then " -> " and the dst dims
+        char * p = str;  // write cursor, advanced by each sprintf return value
+        auto inputs = node.get_inputs();
+
+        if (!inputs.empty()) {
+            p += format_tensor_dims(p, inputs[0]);
+
+            for (size_t i = 1; i < inputs.size(); i++) {
+                p += sprintf(p, " x ");
+                p += format_tensor_dims(p, inputs[i]);
+            }
+
+            p += sprintf(p, " -> ");
+        }
+
+        char self[64];  // scratch buffer for the dst text
+        format_tensor_dims(self, node.dst());
+        p += sprintf(p, "%s", self);
+    }
+
+    int format_tensor_strides(char * str, const struct ggml_tensor * t) {  // writes the nb[] byte strides of `t`; returns sprintf's character count
+        const char * c = ggml_is_contiguous(t) ? "" : "!";  // "!" suffix marks a non-contiguous tensor
+
+        if (t->ne[2] == 1 && t->ne[3] == 1) {  // short "nb0:nb1" form when the two outer dims are 1
+            return sprintf(str, "%zu:%zu%s", (size_t) t->nb[0], (size_t) t->nb[1], c);
+        } else {
+            return sprintf(str, "%zu:%zu:%zu:%zu%s", (size_t) t->nb[0], (size_t) t->nb[1], (size_t) t->nb[2], (size_t) t->nb[3], c);
+        }
+    }
+
+    void format_op_strides(char * str, const htp_opnode & node) {  // strides of every input joined by " x ", then " -> " and the dst strides
+        char * p = str;
+        auto inputs = node.get_inputs();
+
+        if (!inputs.empty()) {
+            p += format_tensor_strides(p, inputs[0]);
+
+            for (size_t i = 1; i < inputs.size(); i++) {
+                p += sprintf(p, " x ");
+                p += format_tensor_strides(p, inputs[i]);
+            }
+
+            p += sprintf(p, " -> ");
+        }
+
+        char self[64];  // NOTE(review): four %zu strides plus separators must fit in 64 bytes - confirm for very large tensors
+        format_tensor_strides(self, node.dst());
+        p += sprintf(p, "%s", self);
+    }
+
+    void format_op_types(char * str, const htp_opnode & node) {  // ggml_type_name() of every input joined by " x ", then " -> " and the dst type
+        char * p = str;
+        auto inputs = node.get_inputs();
+
+        if (!inputs.empty()) {
+            p += sprintf(p, "%s", ggml_type_name(inputs[0]->type));
+
+            for (size_t i = 1; i < inputs.size(); i++) {
+                p += sprintf(p, " x ");
+                p += sprintf(p, "%s", ggml_type_name(inputs[i]->type));
+            }
+
+            p += sprintf(p, " -> ");
+        }
+
+        p += sprintf(p, "%s", ggml_type_name(node.dst()->type));
+    }
+
+    const char * tensor_buff_name(const struct ggml_tensor * t) {  // name of the backend buffer of `t`, or "NONE" when it has no buffer
+        if (t->buffer) {
+            return ggml_backend_buffer_name(t->buffer);
+        }
+        return "NONE";
+    }
+
+    void format_op_buffs(char * str, const htp_opnode & node) {  // buffer name of every input joined by " x ", then " -> " and the dst buffer name
+        char * p = str;
+        auto inputs = node.get_inputs();
+
+        if (!inputs.empty()) {
+            p += sprintf(p, "%s", tensor_buff_name(inputs[0]));
+
+            for (size_t i = 1; i < inputs.size(); i++) {
+                p += sprintf(p, " x ");
+                p += sprintf(p, "%s", tensor_buff_name(inputs[i]));
+            }
+
+            p += sprintf(p, " -> ");
+        }
+
+        p += sprintf(p, "%s", tensor_buff_name(node.dst()));
+    }
+
+    void format_op_names(char * str, const htp_opnode & node) {  // tensor name of every input joined by " x ", then " -> " and the dst name
+        char * p = str;
+        auto inputs = node.get_inputs();
+
+        if (!inputs.empty()) {
+            p += sprintf(p, "%s", inputs[0]->name);
+
+            for (size_t i = 1; i < inputs.size(); i++) {
+                p += sprintf(p, " x ");
+                p += sprintf(p, "%s", inputs[i]->name);
+            }
+
+            p += sprintf(p, " -> ");
+        }
+
+        p += sprintf(p, "%s", node.dst()->name);
+    }
+
+    void format(const htp_opnode & node) {  // fills all five text fields from `node`
+        format_op_dims(dims, node);
+        format_op_strides(strides, node);
+        format_op_types(types, node);
+        format_op_buffs(buffs, node);
+        format_op_names(names, node);
+    }
+
+    htp_opformat() {}  // does not fill the text fields; call format() before reading them
+    htp_opformat(const htp_opnode & node) { format(node); }  // formats `node` on construction
+};
+
+#endif // HTP_OPNODE_H
diff --git a/ggml/src/ggml-hexagon/htp/htp-ops.h b/ggml/src/ggml-hexagon/htp/htp-ops.h
index aadc77235ba..fa85bf4ca0c 100644
--- a/ggml/src/ggml-hexagon/htp/htp-ops.h
+++ b/ggml/src/ggml-hexagon/htp/htp-ops.h
@@ -58,6 +58,7 @@ enum htp_op_code {
     HTP_OP_MUL_MAT,
     HTP_OP_MUL_MAT_ID,
     HTP_OP_RMS_NORM,
+    HTP_OP_RMS_NORM_MUL,
     HTP_OP_UNARY_SILU,
     HTP_OP_UNARY_GELU,
     HTP_OP_UNARY_SIGMOID,
diff --git a/ggml/src/ggml-hexagon/htp/main.c b/ggml/src/ggml-hexagon/htp/main.c
index 7dd90ac7d7f..623008be4e2 100644
--- a/ggml/src/ggml-hexagon/htp/main.c
+++ b/ggml/src/ggml-hexagon/htp/main.c
@@ -537,6 +537,7 @@ static int execute_op(struct htp_ops_context * octx) {
 
         case HTP_OP_NORM:
         case HTP_OP_RMS_NORM:
+        case HTP_OP_RMS_NORM_MUL:
         case HTP_OP_SCALE:
         case HTP_OP_SQR:
         case HTP_OP_SQRT:
diff --git a/ggml/src/ggml-hexagon/htp/unary-ops.c b/ggml/src/ggml-hexagon/htp/unary-ops.c
index 7d0431d8ba8..770a6673211 100644
--- a/ggml/src/ggml-hexagon/htp/unary-ops.c
+++ b/ggml/src/ggml-hexagon/htp/unary-ops.c
@@ -23,21 +23,26 @@ struct htp_unary_context {
 
     // Precomputed values
     const uint8_t *           data_src0;
+    const uint8_t *           data_src1;            // weight/scale tensor for RMS_NORM_MUL
     uint8_t *                 data_dst;
 
     size_t                    src0_data_row_size;   // actual data bytes per row
+    size_t                    src1_data_row_size;
     size_t                    dst_data_row_size;    // actual data bytes per row
 
     size_t                    src0_row_size_aligned;
+    size_t                    src1_row_size_aligned;
     size_t                    dst_row_size_aligned;
 
     size_t                    src0_spad_half_size;
+    size_t                    src1_spad_half_size;
     size_t                    dst_spad_half_size;
 
     uint32_t                  block;
     uint32_t                  src0_nrows;
     uint32_t                  src0_nrows_per_thread;
     uint32_t                  nc;
+    bool                      broadcast_weight;
 };
 
 // Convert flat row index to DDR byte offset using the tensor's actual strides.
@@ -158,6 +163,71 @@ static void hvx_fast_rms_norm_f32(const uint8_t * restrict src,
     }
 }
 
+static void hvx_fast_rms_norm_mul_f32(const uint8_t * restrict src,  // Fused RMS norm + weight multiply for one f32 row; src: input row
+                                      const uint8_t * restrict weight,  // per-element multipliers, read at the same element positions as src
+                                      uint8_t * restrict dst,  // output row
+                                      const int num_elems,  // number of f32 elements in the row
+                                      float     epsilon) {  // added to the mean of squares before the reciprocal square root
+    const HVX_Vector * restrict v_src    = (const HVX_Vector *) src;  // NOTE(review): whole-vector indexing presumably needs vector-aligned, row-padded buffers - confirm
+    const HVX_Vector * restrict v_weight = (const HVX_Vector *) weight;
+    HVX_Vector * restrict v_dst          = (HVX_Vector *) dst;
+
+    const int nvec = num_elems / VLEN_FP32;    // number of full vectors
+    const int nloe = num_elems % VLEN_FP32;    // leftover elements
+
+    // Compute sum of squares for full vectors
+    HVX_Vector sum_v = Q6_V_vsplat_R(0x00000000);  // running per-lane sum of squares, accumulated in qf32
+    HVX_Vector epsilon_v = hvx_vec_splat_f32(epsilon);
+
+    #pragma unroll(4)
+    for (int i = 0; i < nvec; i++) {
+        HVX_Vector v1 = v_src[i];
+        HVX_Vector v2 = Q6_Vqf32_vmpy_VsfVsf(v1, v1);  // x * x
+        sum_v = Q6_Vqf32_vadd_Vqf32Vqf32(sum_v, v2);
+    }
+
+    // Handle tail elements using vectorized ops with masking
+    if (nloe > 0) {
+        HVX_VectorPred bmask = Q6_Q_vsetq_R(nloe * 4);  // predicate covering the first nloe floats (4 bytes each)
+        HVX_Vector v1 = Q6_V_vand_QV(bmask, v_src[nvec]);  // zero the lanes past the row end so they add nothing to the sum
+        HVX_Vector v2 = Q6_Vqf32_vmpy_VsfVsf(v1, v1);
+        sum_v = Q6_Vqf32_vadd_Vqf32Vqf32(sum_v, v2);
+    }
+
+    // Reduce HVX sum
+    sum_v = hvx_vec_reduce_sum_f32(Q6_Vsf_equals_Vqf32(sum_v));  // presumably leaves the row total in every lane - confirm against the helper
+
+    HVX_Vector t_v            = hvx_vec_splat_f32((float) num_elems);
+    HVX_Vector denom_v        = hvx_vec_inverse_f32(t_v);  // presumably 1 / num_elems - confirm against the helper
+    HVX_Vector mean_v         = Q6_Vqf32_vmpy_VsfVsf(sum_v, denom_v);  // mean of squares
+    HVX_Vector mean_epsilon_v = Q6_Vqf32_vadd_Vqf32Vsf(mean_v, epsilon_v);
+
+    // Scale and multiply
+    HVX_Vector scale_v = hvx_vec_rsqrt_f32(Q6_Vsf_equals_Vqf32(mean_epsilon_v));  // presumably 1 / sqrt(mean + epsilon) - confirm against the helper
+
+    #pragma unroll(4)
+    for (int i = 0; i < nvec; i++) {
+        HVX_Vector v1 = v_src[i];
+        HVX_Vector v2 = Q6_Vqf32_vmpy_VsfVsf(v1, scale_v);  // normalized value
+        HVX_Vector v3 = Q6_Vsf_equals_Vqf32(v2);  // back to sf so it can feed the next sf * sf multiply
+        HVX_Vector result = Q6_Vqf32_vmpy_VsfVsf(v3, v_weight[i]);  // apply the weight
+        v_dst[i] = Q6_Vsf_equals_Vqf32(result);
+    }
+
+    // Handle tail elements using vectorized ops with masking
+    if (nloe > 0) {
+        HVX_VectorPred bmask = Q6_Q_vsetq_R(nloe * 4);
+        HVX_Vector v1 = Q6_V_vand_QV(bmask, v_src[nvec]);
+        HVX_Vector v2 = Q6_Vqf32_vmpy_VsfVsf(v1, scale_v);
+        HVX_Vector v3 = Q6_Vsf_equals_Vqf32(v2);
+        HVX_Vector result = Q6_Vqf32_vmpy_VsfVsf(v3, v_weight[nvec]);  // weight vector is read whole and unmasked; the src lanes past the row end are already zero
+        HVX_Vector res_v = Q6_Vsf_equals_Vqf32(result);
+
+        // Store with masking to avoid overwriting memory beyond the tensor
+        hvx_vec_store_a(&v_dst[nvec], nloe * 4, res_v);
+    }
+}
+
 static void hvx_fast_norm_f32(const uint8_t * restrict src,
                                   uint8_t * restrict dst,
                                   uint8_t * restrict pad,
@@ -269,6 +339,27 @@ static void rms_norm_f32(const float * restrict src,
     }
 }
 
+static void rms_norm_mul_f32(const float * restrict src,  // Runs the fused RMS-norm * weight kernel on each row of a block; src: first input row
+                             const float * restrict weight,  // first weight row
+                             float * restrict dst,  // first output row
+                             const uint32_t num_rows,  // rows to process
+                             const uint32_t row_elems,  // f32 elements per row
+                             const size_t   row_size,  // byte stride between consecutive rows, applied to both src and dst
+                             const size_t   weight_row_size,  // byte stride between weight rows; unused when broadcast_weight is set
+                             int32_t *      op_params,  // op parameters; the first sizeof(float) bytes are read as epsilon
+                             bool           broadcast_weight) {  // true: every row is multiplied by weight row 0
+    float epsilon = 0.f;
+    memcpy(&epsilon, op_params, sizeof(float));  // memcpy reinterprets the int32 storage as float without a pointer cast
+
+    for (uint32_t ir = 0; ir < num_rows; ir++) {
+        const uint8_t * restrict src_local = (const uint8_t *)src + (ir * row_size);
+        const uint8_t * restrict w_local   = (const uint8_t *)weight + (broadcast_weight ? 0 : ir * weight_row_size);  // broadcast pins the weight at row 0
+        uint8_t * restrict dst_local       = (uint8_t *)dst + (ir * row_size);
+
+        hvx_fast_rms_norm_mul_f32(src_local, w_local, dst_local, row_elems, epsilon);
+    }
+}
+
 static void norm_f32(const float * restrict src,
                          float * restrict dst,
                          uint8_t * restrict spad,
@@ -598,12 +689,15 @@ static void unary_job_f32_per_thread(unsigned int nth, unsigned int ith, void *
     t1 = HAP_perf_get_qtimer_count();
 
     const uint8_t * restrict data_src = uctx->data_src0;
+    const uint8_t * restrict data_src1 = uctx->data_src1;
     uint8_t * restrict       data_dst = uctx->data_dst;
 
     uint8_t * src0_spad_data = octx->src0_spad.data + (ith * octx->src0_spad.size_per_thread);
+    uint8_t * src1_spad_data = octx->src1_spad.data + (ith * octx->src1_spad.size_per_thread);
     uint8_t * dst_spad_data  = octx->dst_spad.data  + (ith * octx->dst_spad.size_per_thread);
 
     size_t src0_spad_half_size = uctx->src0_spad_half_size;
+    size_t src1_spad_half_size = uctx->src1_spad_half_size;
     size_t dst_spad_half_size  = uctx->dst_spad_half_size;
 
     // Non-contiguous tensors have gaps at dim-2/3 boundaries that a single-stride
@@ -624,6 +718,12 @@ static void unary_job_f32_per_thread(unsigned int nth, unsigned int ith, void *
 
     dma_queue * dma_queue = octx->ctx->dma[ith];
 
+    // If weight is broadcasted, load it once per thread at the beginning of execution
+    if (htp_op == HTP_OP_RMS_NORM_MUL && uctx->broadcast_weight) {
+        dma_queue_push(dma_queue, dma_make_ptr(src1_spad_data, data_src1), uctx->src1_row_size_aligned, 0, uctx->src1_data_row_size, 1);
+        dma_queue_flush(dma_queue);
+    }
+
     for (uint32_t ir = src0_start_row, spad_idx = 0; ir < src0_end_row && spad_idx < 2; spad_idx++) {
         const uint32_t block_size = unary_block_size(ir, src0_end_row, BLOCK, src0_contig, dst_contig, ne01, ne1);
 
@@ -636,6 +736,14 @@ static void unary_job_f32_per_thread(unsigned int nth, unsigned int ith, void *
         dma_queue_push(dma_queue,
             dma_make_ptr(src0_spad_data + (spad_idx * src0_spad_half_size), data_src + src0_off),
             src0_row_size_aligned, nb01, src0_data_row_size, block_size);
+
+        if (htp_op == HTP_OP_RMS_NORM_MUL && !uctx->broadcast_weight) {
+            const size_t src1_off = unary_row_offset(ir, ne01, ne02, nb01, nb02, nb03);
+            dma_queue_push(dma_queue,
+                dma_make_ptr(src1_spad_data + (spad_idx * src1_spad_half_size), data_src1 + src1_off),
+                uctx->src1_row_size_aligned, nb01, uctx->src1_data_row_size, block_size);
+        }
+
         ir += block_size;
     }
 
@@ -644,6 +752,10 @@ static void unary_job_f32_per_thread(unsigned int nth, unsigned int ith, void *
 
         float * dst_spad  = (float *) dma_queue_pop(dma_queue).src;
         float * src0_spad = (float *) dma_queue_pop(dma_queue).dst;
+        float * src1_spad = NULL;
+        if (htp_op == HTP_OP_RMS_NORM_MUL && !uctx->broadcast_weight) {
+            src1_spad = (float *) dma_queue_pop(dma_queue).dst;
+        }
 
         // Process block in VTCM
         switch (htp_op) {
@@ -653,6 +765,12 @@ static void unary_job_f32_per_thread(unsigned int nth, unsigned int ith, void *
             case HTP_OP_RMS_NORM:
                 rms_norm_f32(src0_spad, dst_spad, NULL, block_size, ne0, src0_row_size_aligned, op_params);
                 break;
+            case HTP_OP_RMS_NORM_MUL:
+                {
+                    const float * w_ptr = uctx->broadcast_weight ? (const float *) src1_spad_data : src1_spad;
+                    rms_norm_mul_f32(src0_spad, w_ptr, dst_spad, block_size, ne0, src0_row_size_aligned, uctx->src1_row_size_aligned, op_params, uctx->broadcast_weight);
+                }
+                break;
             case HTP_OP_SCALE:
                 scale_f32(src0_spad, dst_spad, NULL, block_size, ne0, src0_row_size_aligned, op_params);
                 break;
@@ -700,9 +818,16 @@ static void unary_job_f32_per_thread(unsigned int nth, unsigned int ith, void *
             if (pref_ir < src0_end_row) {
                 const uint32_t pref_block_size = unary_block_size(pref_ir, src0_end_row, BLOCK, src0_contig, dst_contig, ne01, ne1);
                 const size_t src0_pref_off = unary_row_offset(pref_ir, ne01, ne02, nb01, nb02, nb03);
-            dma_queue_push(dma_queue,
-                dma_make_ptr(src0_spad, data_src + src0_pref_off),
-                src0_row_size_aligned, nb01, src0_data_row_size, pref_block_size);
+                dma_queue_push(dma_queue,
+                    dma_make_ptr(src0_spad, data_src + src0_pref_off),
+                    src0_row_size_aligned, nb01, src0_data_row_size, pref_block_size);
+
+                if (htp_op == HTP_OP_RMS_NORM_MUL && !uctx->broadcast_weight) {
+                    const size_t src1_pref_off = unary_row_offset(pref_ir, ne01, ne02, nb01, nb02, nb03);
+                    dma_queue_push(dma_queue,
+                        dma_make_ptr(src1_spad, data_src1 + src1_pref_off),
+                        uctx->src1_row_size_aligned, nb01, uctx->src1_data_row_size, pref_block_size);
+                }
             }
         }
         ir += block_size;
@@ -732,6 +857,9 @@ static int execute_op_unary_f32(struct htp_ops_context * octx) {
         case HTP_OP_RMS_NORM:
             op_type = "rmsnorm-f32";
             break;
+        case HTP_OP_RMS_NORM_MUL:
+            op_type = "rmsnorm-mul-f32";
+            break;
         case HTP_OP_SCALE:
             op_type = "scale-f32";
             break;
@@ -777,12 +905,44 @@ static int execute_op_unary_f32(struct htp_ops_context * octx) {
     const size_t src0_row_size_aligned = hex_round_up(src0_data_row_size, VLEN);
     const size_t dst_row_size_aligned  = hex_round_up(dst_data_row_size,  VLEN);
 
+    size_t src1_data_row_size = 0;
+    size_t src1_row_size_aligned = 0;
+    bool broadcast_weight = false;
+    const struct htp_tensor * src1 = NULL;
+
+    if (octx->op == HTP_OP_RMS_NORM_MUL) {
+        src1 = octx->src[1];
+        src1_data_row_size = src1->ne[0] * sizeof(float);
+        src1_row_size_aligned = hex_round_up(src1_data_row_size, VLEN);
+        broadcast_weight = (src1->ne[1] * src1->ne[2] * src1->ne[3] == 1);
+    }
+
     // VTCM scratchpads for all tensors
     // N rows per thread, padded to HVX vector size
     // Double buffering requires 2x size per buffer
 
-    size_t spad_size_per_row   = 2 * (src0_row_size_aligned + dst_row_size_aligned);
-    size_t vtcm_row_per_thread = (octx->ctx->vtcm_size)/ (n_threads * spad_size_per_row);
+    size_t spad_size_per_row = 0;
+    size_t vtcm_row_per_thread = 0;
+
+    if (octx->op == HTP_OP_RMS_NORM_MUL) {
+        if (broadcast_weight) {
+            size_t available_vtcm = octx->ctx->vtcm_size;
+            size_t src1_spad_total = n_threads * src1_row_size_aligned;
+            if (available_vtcm > src1_spad_total) {
+                available_vtcm -= src1_spad_total;
+            } else {
+                available_vtcm = 0;
+            }
+            spad_size_per_row = 2 * (src0_row_size_aligned + dst_row_size_aligned);
+            vtcm_row_per_thread = available_vtcm / (n_threads * spad_size_per_row);
+        } else {
+            spad_size_per_row = 2 * (src0_row_size_aligned + dst_row_size_aligned + src1_row_size_aligned);
+            vtcm_row_per_thread = (octx->ctx->vtcm_size) / (n_threads * spad_size_per_row);
+        }
+    } else {
+        spad_size_per_row   = 2 * (src0_row_size_aligned + dst_row_size_aligned);
+        vtcm_row_per_thread = (octx->ctx->vtcm_size)/ (n_threads * spad_size_per_row);
+    }
 
     // Make sure the reserved vtcm size is sufficient
     if (vtcm_row_per_thread == 0) {
@@ -797,8 +957,25 @@ static int execute_op_unary_f32(struct htp_ops_context * octx) {
     octx->src0_spad.size = n_threads * octx->src0_spad.size_per_thread;
     octx->dst_spad.size  = n_threads * octx->dst_spad.size_per_thread;
 
+    if (octx->op == HTP_OP_RMS_NORM_MUL) {
+        if (broadcast_weight) {
+            octx->src1_spad.size_per_thread = src1_row_size_aligned;
+        } else {
+            octx->src1_spad.size_per_thread = src1_row_size_aligned * vtcm_row_per_thread * 2;
+        }
+        octx->src1_spad.size = n_threads * octx->src1_spad.size_per_thread;
+    } else {
+        octx->src1_spad.size = 0;
+        octx->src1_spad.size_per_thread = 0;
+    }
+
     octx->src0_spad.data = octx->ctx->vtcm_base;
-    octx->dst_spad.data  = octx->src0_spad.data + octx->src0_spad.size;
+    if (octx->op == HTP_OP_RMS_NORM_MUL) {
+        octx->src1_spad.data = octx->src0_spad.data + octx->src0_spad.size;
+        octx->dst_spad.data  = octx->src1_spad.data + octx->src1_spad.size;
+    } else {
+        octx->dst_spad.data  = octx->src0_spad.data + octx->src0_spad.size;
+    }
 
     FARF(HIGH, "%s: (%ux%ux%ux%u) -> (%ux%ux%ux%u) : src0-spad-size %u src1-spad-size %u dst-spad-size %u\n", op_type,
          src0->ne[0], src0->ne[1], src0->ne[2], src0->ne[3], dst->ne[0], dst->ne[1], dst->ne[2], dst->ne[3],
@@ -811,19 +988,24 @@ static int execute_op_unary_f32(struct htp_ops_context * octx) {
             .src0_nrows            = src0_nrows,
 
             .data_src0             = (const uint8_t *)src0->data,
+            .data_src1             = (octx->op == HTP_OP_RMS_NORM_MUL) ? (const uint8_t *)src1->data : NULL,
             .data_dst              = (uint8_t *)dst->data,
 
             .src0_data_row_size    = src0_data_row_size,
+            .src1_data_row_size    = src1_data_row_size,
             .dst_data_row_size     = dst_data_row_size,
 
             .src0_row_size_aligned = src0_row_size_aligned,
+            .src1_row_size_aligned = src1_row_size_aligned,
             .dst_row_size_aligned  = dst_row_size_aligned,
 
             .src0_spad_half_size   = octx->src0_spad.size_per_thread / 2,
+            .src1_spad_half_size   = (octx->op == HTP_OP_RMS_NORM_MUL) ? (octx->src1_spad.size_per_thread / (broadcast_weight ? 1 : 2)) : 0,
             .dst_spad_half_size    = octx->dst_spad.size_per_thread / 2,
 
             .block                 = (octx->src0_spad.size_per_thread / 2) / src0_row_size_aligned,
             .nc                    = src0->ne[0],
+            .broadcast_weight      = broadcast_weight,
         };
 
         worker_pool_run_func(octx->ctx->worker_pool, unary_job_f32_per_thread, &uctx, n_threads);
diff --git a/ggml/src/ggml-hexagon/op-desc.h b/ggml/src/ggml-hexagon/op-desc.h
deleted file mode 100644
index a1e8ddd8b97..00000000000
--- a/ggml/src/ggml-hexagon/op-desc.h
+++ /dev/null
@@ -1,153 +0,0 @@
-#ifndef OP_DESC_H
-#define OP_DESC_H
-
-#define GGML_COMMON_IMPL_CPP
-#include "ggml-backend-impl.h"
-#include "ggml-common.h"
-
-#include <string>
-#include <stdio.h>
-
-struct op_desc {
-    char strides[64 * GGML_MAX_SRC];
-    char dims[64 * GGML_MAX_SRC];
-    char types[16 * GGML_MAX_SRC];
-    char buffs[64 * GGML_MAX_SRC];
-    char names[64 * GGML_MAX_SRC];
-
-    int format_tensor_dims(char * str, const struct ggml_tensor * t) {
-        if (t->ne[2] == 1 && t->ne[3] == 1) {
-            return sprintf(str, "%d:%d", (int) t->ne[0], (int) t->ne[1]);
-        } else {
-            return sprintf(str, "%d:%d:%d:%d", (int) t->ne[0], (int) t->ne[1], (int) t->ne[2], (int) t->ne[3]);
-        }
-    }
-
-    void format_op_dims(char * str, const struct ggml_tensor * t) {
-        char * p = str;
-
-        // append src0 and src1 (if any)
-        if (t->src[0]) {
-            p += format_tensor_dims(p, t->src[0]);
-
-            for (int i = 1; i < GGML_MAX_SRC && t->src[i]; i++) {
-                p += sprintf(p, " x ");
-                p += format_tensor_dims(p, t->src[i]);
-            }
-
-            p += sprintf(p, " -> ");
-        }
-
-        // format self dims separately for better visual alignment
-        char self[64];
-        format_tensor_dims(self, t);
-
-        p += sprintf(p, "%s", self);
-    }
-
-    int format_tensor_strides(char * str, const struct ggml_tensor * t) {
-        const char * c = ggml_is_contiguous(t) ? "" : "!";
-
-        if (t->ne[2] == 1 && t->ne[3] == 1) {
-            return sprintf(str, "%zu:%zu%s", (size_t) t->nb[0], (size_t) t->nb[1], c);
-        } else {
-            return sprintf(str, "%zu:%zu:%zu:%zu%s", (size_t) t->nb[0], (size_t) t->nb[1], (size_t) t->nb[2], (size_t) t->nb[3], c);
-        }
-    }
-
-    void format_op_strides(char * str, const struct ggml_tensor * t) {
-        char * p = str;
-
-        // append src0 and src1 (if any)
-        if (t->src[0]) {
-            p += format_tensor_strides(p, t->src[0]);
-
-            for (int i = 1; i < GGML_MAX_SRC && t->src[i]; i++) {
-                p += sprintf(p, " x ");
-                p += format_tensor_strides(p, t->src[i]);
-            }
-
-            p += sprintf(p, " -> ");
-        }
-
-        // format self dims separately for better visual alignment
-        char self[64];
-        format_tensor_strides(self, t);
-
-        p += sprintf(p, "%s", self);
-    }
-
-    void format_op_types(char * str, const struct ggml_tensor * t) {
-        char * p = str;
-
-        // append src0 and src1 (if any)
-        if (t->src[0]) {
-            p += sprintf(p, "%s", ggml_type_name(t->src[0]->type));
-
-            for (int i = 1; i < GGML_MAX_SRC && t->src[i]; i++) {
-                p += sprintf(p, " x ");
-                p += sprintf(p, "%s", ggml_type_name(t->src[i]->type));
-            }
-
-            p += sprintf(p, " -> ");
-        }
-
-        p += sprintf(p, "%s", ggml_type_name(t->type));
-    }
-
-    const char * tensor_buff_name(const struct ggml_tensor * t) {
-        if (t->buffer) {
-            return ggml_backend_buffer_name(t->buffer);
-        }
-        return "NONE";
-    }
-
-    void format_op_buffs(char * str, const struct ggml_tensor * t) {
-        char * p = str;
-
-        // append src0 and src1 (if any)
-        if (t->src[0]) {
-            p += sprintf(p, "%s", tensor_buff_name(t->src[0]));
-
-            for (int i = 1; i < GGML_MAX_SRC && t->src[i]; i++) {
-                p += sprintf(p, " x ");
-                p += sprintf(p, "%s", tensor_buff_name(t->src[i]));
-            }
-
-            p += sprintf(p, " -> ");
-        }
-
-        p += sprintf(p, "%s", tensor_buff_name(t));
-    }
-
-    void format_op_names(char * str, const struct ggml_tensor * t) {
-        char * p = str;
-
-        // append src0 and src1 (if any)
-        if (t->src[0]) {
-            p += sprintf(p, "%s", t->src[0]->name);
-
-            for (int i = 1; i < GGML_MAX_SRC && t->src[i]; i++) {
-                p += sprintf(p, " x ");
-                p += sprintf(p, "%s", t->src[i]->name);
-            }
-
-            p += sprintf(p, " -> ");
-        }
-
-        p += sprintf(p, "%s", t->name);
-    }
-
-    void format(const ggml_tensor * op) {
-        format_op_dims(dims, op);
-        format_op_strides(strides, op);
-        format_op_types(types, op);
-        format_op_buffs(buffs, op);
-        format_op_names(names, op);
-    }
-
-    op_desc() {}
-    op_desc(const ggml_tensor * op) { format(op); }
-};
-
-#endif // OP_DESC_H
diff --git a/scripts/snapdragon/ggml-hexagon-profile.py b/scripts/snapdragon/ggml-hexagon-profile.py
index 3edaacd2749..aa1f20dcc23 100755
--- a/scripts/snapdragon/ggml-hexagon-profile.py
+++ b/scripts/snapdragon/ggml-hexagon-profile.py
@@ -24,7 +24,7 @@
 }
 
 op_pattern = re.compile(
-    r"profile-op\s+(?P<op_name>[A-Z_0-9]+):\s+.*?\s+:\s+(?P<dims>[\d:x\s\->!]+)\s+:\s+(?P<types>[a-z\d_\s\->x]+)\s+:\s+.*?\s+usec\s+(?P<usec>\d+)\s+cycles\s+(?P<cycles>\d+)(?:\s+pmu\s+\[(?P<pmu>[\d,\s]+)\])?"
+    r"profile-op\s+(?P<op_name>[A-Z_0-9+]+):\s+.*?\s+:\s+(?P<dims>[\d:x\s\->!]+)\s+:\s+(?P<types>[a-z\d_\s\->x]+)\s+:\s+.*?\s+usec\s+(?P<usec>\d+)\s+cycles\s+(?P<cycles>\d+)(?:\s+pmu\s+\[(?P<pmu>[\d,\s]+)\])?"
 )
 
 logger = logging.getLogger("ggml-hexagon-profile")

From 33c718db1fbfe834f30eef28cf206f98736fe612 Mon Sep 17 00:00:00 2001
From: Matt Corallo <649246+TheBlueMatt@users.noreply.github.com>
Date: Fri, 29 May 2026 03:30:24 +0000
Subject: [PATCH 19/50] meta : Add missing `buffer` set in allreduce fallback
 !COMPUTE clear (#23480)

Without this at least the vulkan backend will skip the `* 0` for
!COMPUTE tensors, causing corrupt output.
---
 ggml/src/ggml-backend-meta.cpp | 1 +
 1 file changed, 1 insertion(+)

diff --git a/ggml/src/ggml-backend-meta.cpp b/ggml/src/ggml-backend-meta.cpp
index d0d64523b4a..48b2027fac3 100644
--- a/ggml/src/ggml-backend-meta.cpp
+++ b/ggml/src/ggml-backend-meta.cpp
@@ -2076,6 +2076,7 @@ static enum ggml_status ggml_backend_meta_graph_compute(ggml_backend_t backend,
             node_zero->src[0] = node;
             ggml_set_op_params_f32(node_zero, 0, 0.0f);
             node_zero->data = node->data;
+            node_zero->buffer = node->buffer;
             node_zero->flags |= GGML_TENSOR_FLAG_COMPUTE;
 
             step_cgraphs[j] = get_cgraph_aux();

From 241cbd41d21fca058c5ce900502e5d6d8d5b041b Mon Sep 17 00:00:00 2001
From: Andreas Kieslinger <47689530+aendk@users.noreply.github.com>
Date: Fri, 29 May 2026 06:46:10 +0200
Subject: [PATCH 20/50] cuda : disables launch_fattn PDL enrollment due to
 compiler bug (#23825)

---
 ggml/src/ggml-cuda/fattn-common.cuh | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/ggml/src/ggml-cuda/fattn-common.cuh b/ggml/src/ggml-cuda/fattn-common.cuh
index debcb6e5447..d650b5fbd0f 100644
--- a/ggml/src/ggml-cuda/fattn-common.cuh
+++ b/ggml/src/ggml-cuda/fattn-common.cuh
@@ -1153,8 +1153,8 @@ void launch_fattn(
 
     GGML_ASSERT(block_dim.x % warp_size == 0);
 
-    const ggml_cuda_kernel_launch_params launch_params = ggml_cuda_kernel_launch_params(blocks_num, block_dim, nbytes_shared, main_stream);
-    ggml_cuda_kernel_launch(fattn_kernel, launch_params,
+        // disabled PDL enrollment for now due to a compiler bug.
+        fattn_kernel<<<blocks_num, block_dim, nbytes_shared, main_stream>>>(
         (const char *) Q->data,
         K_data,
         V_data,

From 98e480a32ec77bc494df887da3f69e53373122a7 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Adrien=20Gallou=C3=ABt?= <angt@huggingface.co>
Date: Fri, 29 May 2026 07:46:11 +0200
Subject: [PATCH 21/50] app : move licences to llama-app (#23824)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Signed-off-by: Adrien Gallouët <angt@huggingface.co>
---
 CMakeLists.txt             | 13 -------------
 app/CMakeLists.txt         | 11 +++++++++++
 app/llama.cpp              | 12 ++++++++++++
 common/arg.cpp             | 12 ------------
 tools/cli/README.md        |  1 -
 tools/completion/README.md |  1 -
 tools/server/README.md     |  1 -
 7 files changed, 23 insertions(+), 28 deletions(-)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index edd0ea1ded0..9e7b1253c72 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -222,19 +222,6 @@ if (LLAMA_BUILD_APP)
     add_subdirectory(app)
 endif()
 
-# Automatically add all files from the 'licenses' directory
-file(GLOB EXTRA_LICENSES "${CMAKE_SOURCE_DIR}/licenses/LICENSE-*")
-
-foreach(FILE_PATH ${EXTRA_LICENSES})
-    get_filename_component(FILE_NAME "${FILE_PATH}" NAME)
-    string(REGEX REPLACE "^LICENSE-" "" NAME "${FILE_NAME}")
-    license_add_file("${NAME}" "${FILE_PATH}")
-endforeach()
-
-if (LLAMA_BUILD_COMMON)
-    license_generate(llama-common)
-endif()
-
 #
 # install
 #
diff --git a/app/CMakeLists.txt b/app/CMakeLists.txt
index 6c53ce0e4e2..3ce503955b3 100644
--- a/app/CMakeLists.txt
+++ b/app/CMakeLists.txt
@@ -15,6 +15,17 @@ target_link_libraries(${TARGET} PRIVATE
 )
 target_compile_features(${TARGET} PRIVATE cxx_std_17)
 
+# Automatically add all files from the 'licenses' directory
+file(GLOB EXTRA_LICENSES "${CMAKE_SOURCE_DIR}/licenses/LICENSE-*")
+
+foreach(FILE_PATH ${EXTRA_LICENSES})
+    get_filename_component(FILE_NAME "${FILE_PATH}" NAME)
+    string(REGEX REPLACE "^LICENSE-" "" NAME "${FILE_NAME}")
+    license_add_file("${NAME}" "${FILE_PATH}")
+endforeach()
+
+license_generate(${TARGET})
+
 if(LLAMA_TOOLS_INSTALL)
     install(TARGETS ${TARGET} RUNTIME)
 endif()
diff --git a/app/llama.cpp b/app/llama.cpp
index 0e932c35537..d898bfdfb24 100644
--- a/app/llama.cpp
+++ b/app/llama.cpp
@@ -5,6 +5,9 @@
 #include <string>
 #include <vector>
 
+// embedded data generated by cmake
+extern const char * LICENSES[];
+
 // visible
 int llama_server(int argc, char ** argv);
 int llama_cli(int argc, char ** argv);
@@ -21,6 +24,7 @@ static const char * progname;
 
 static int help(int argc, char ** argv);
 static int version(int argc, char ** argv);
+static int licenses(int argc, char ** argv);
 
 struct command {
     const char * name;
@@ -40,6 +44,7 @@ static const command cmds[] = {
     {"quantize",      "Quantize a model",                                   {},           true,  llama_quantize     },
     {"perplexity",    "Compute model perplexity and KL divergence",         {},           true,  llama_perplexity   },
     {"version",       "Show version",                                       {},           false, version            },
+    {"licenses",      "Show third-party licenses",                          {"credits"},  false, licenses           },
     {"help",          "Show available commands",                            {},           false, help               },
 };
 
@@ -48,6 +53,13 @@ static int version(int argc, char ** argv) {
     return 0;
 }
 
+static int licenses(int argc, char ** argv) {
+    for (const char * const * entry = LICENSES; *entry != nullptr; ++entry) { // stop at the first null entry
+        printf("%s\n", *entry); // one embedded (cmake-generated) license text per line group
+    }
+    return 0;
+}
+
 static int help(int argc, char ** argv) {
     const bool show_all = argc >= 2 && std::string(argv[1]) == "all";
 
diff --git a/common/arg.cpp b/common/arg.cpp
index f6fdd4fa63f..51631765fa3 100644
--- a/common/arg.cpp
+++ b/common/arg.cpp
@@ -50,8 +50,6 @@
 
 #define LLAMA_MAX_URL_LENGTH 2084 // Maximum URL Length in Chrome: 2083
 
-extern const char * LICENSES[];
-
 using json = nlohmann::ordered_json;
 using namespace common_arg_utils;
 
@@ -1091,16 +1089,6 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
             exit(0);
         }
     ));
-    add_opt(common_arg(
-        {"--license"},
-        "show source code license and dependencies",
-        [](common_params &) {
-            for (int i = 0; LICENSES[i]; ++i) {
-                printf("%s\n", LICENSES[i]);
-            }
-            exit(0);
-        }
-    ));
     add_opt(common_arg(
         {"-cl", "--cache-list"},
         "show list of models in cache",
diff --git a/tools/cli/README.md b/tools/cli/README.md
index 04aef018870..f34417a835d 100644
--- a/tools/cli/README.md
+++ b/tools/cli/README.md
@@ -12,7 +12,6 @@
 | -------- | ----------- |
 | `-h, --help, --usage` | print usage and exit |
 | `--version` | show version and build info |
-| `--license` | show source code license and dependencies |
 | `-cl, --cache-list` | show list of models in cache |
 | `--completion-bash` | print source-able bash completion script for llama.cpp |
 | `-t, --threads N` | number of CPU threads to use during generation (default: -1)<br/>(env: LLAMA_ARG_THREADS) |
diff --git a/tools/completion/README.md b/tools/completion/README.md
index e8a1287f3a1..bcaae18f376 100644
--- a/tools/completion/README.md
+++ b/tools/completion/README.md
@@ -95,7 +95,6 @@ llama-completion.exe -m models\gemma-1.1-7b-it.Q4_K_M.gguf --ignore-eos -n -1
 | -------- | ----------- |
 | `-h, --help, --usage` | print usage and exit |
 | `--version` | show version and build info |
-| `--license` | show source code license and dependencies |
 | `-cl, --cache-list` | show list of models in cache |
 | `--completion-bash` | print source-able bash completion script for llama.cpp |
 | `-t, --threads N` | number of CPU threads to use during generation (default: -1)<br/>(env: LLAMA_ARG_THREADS) |
diff --git a/tools/server/README.md b/tools/server/README.md
index b975088e4ae..7870e3091ea 100644
--- a/tools/server/README.md
+++ b/tools/server/README.md
@@ -33,7 +33,6 @@ For the full list of features, please refer to [server's changelog](https://gith
 | -------- | ----------- |
 | `-h, --help, --usage` | print usage and exit |
 | `--version` | show version and build info |
-| `--license` | show source code license and dependencies |
 | `-cl, --cache-list` | show list of models in cache |
 | `--completion-bash` | print source-able bash completion script for llama.cpp |
 | `-t, --threads N` | number of CPU threads to use during generation (default: -1)<br/>(env: LLAMA_ARG_THREADS) |

From eef59a764264efc025be974e0452584f584a3c59 Mon Sep 17 00:00:00 2001
From: Aman Gupta <amangupta052@gmail.com>
Date: Fri, 29 May 2026 14:17:32 +0800
Subject: [PATCH 22/50] llama: add llm_graph_input_mtp (#23643)

* llama: add llm_graph_input_mtp

* rename input_mtp -> input_token_embd

* add TODO about mtmd embedding

* cont : clean-up

---------

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
---
 src/llama-graph.cpp      | 33 +++++++++++++++++++++++++++++++++
 src/llama-graph.h        | 17 +++++++++++++++++
 src/models/qwen35.cpp    | 29 +++++++++++++++++++++--------
 src/models/qwen35moe.cpp | 28 ++++++++++++++++++++--------
 4 files changed, 91 insertions(+), 16 deletions(-)

diff --git a/src/llama-graph.cpp b/src/llama-graph.cpp
index fc027de8b39..1a4fa692141 100644
--- a/src/llama-graph.cpp
+++ b/src/llama-graph.cpp
@@ -102,6 +102,39 @@ bool llm_graph_input_embd::can_reuse(const llm_graph_params & params) {
     return res;
 }
 
+// Upload the ubatch data: token ids or raw embeddings into `tokens`/`embd`, and the hidden state into `h`.
+void llm_graph_input_embd_h::set_input(const llama_ubatch * ubatch) {
+    const int64_t n_tokens = ubatch->n_tokens;
+
+    if (ubatch->token) {
+        ggml_backend_tensor_set(tokens, ubatch->token, 0, n_tokens*ggml_element_size(tokens));
+    } else {
+        // note: mtmd embedding input goes through here
+        GGML_ASSERT(ubatch->embd);
+        GGML_ASSERT(n_embd == embd->ne[0]);
+        // the byte count must come from the destination tensor `embd`, not from `h`
+        ggml_backend_tensor_set(embd, ubatch->embd, 0, n_tokens*n_embd*ggml_element_size(embd));
+    }
+
+    // TODO: extend llama_ubatch to differentiate between token embeddings and hidden states
+    //       for now, we assume that the hidden state is always provided as an embedding
+    //       ref: https://github.com/ggml-org/llama.cpp/pull/23643
+    if (ubatch->embd) {
+        GGML_ASSERT(n_embd == h->ne[0]);
+        ggml_backend_tensor_set(h, ubatch->embd, 0, n_tokens*n_embd*ggml_element_size(h));
+    }
+}
+
+// Reuse is possible only if every tensor the new ubatch needs exists and matches its token count.
+bool llm_graph_input_embd_h::can_reuse(const llm_graph_params & params) {
+    const auto & ub = params.ubatch;
+    const auto   n  = ub.n_tokens;
+
+    const bool tokens_ok = !ub.token || (tokens && tokens->ne[0] == n);
+    const bool hidden_ok = !ub.embd  || (embd && embd->ne[1] == n && h && h->ne[1] == n);
+    return tokens_ok && hidden_ok;
+}
+
 void llm_graph_input_pos::set_input(const llama_ubatch * ubatch) {
     if (ubatch->pos && pos) {
         const int64_t n_tokens = ubatch->n_tokens;
diff --git a/src/llama-graph.h b/src/llama-graph.h
index bf6778237e6..e240ade7b0c 100644
--- a/src/llama-graph.h
+++ b/src/llama-graph.h
@@ -121,6 +121,23 @@ class llm_graph_input_embd : public llm_graph_input_i {
     const int64_t n_embd = 0;
 };
 
+// similar to llm_graph_input_embd but with an additional hidden state input `h`
+class llm_graph_input_embd_h : public llm_graph_input_i {
+public:
+    llm_graph_input_embd_h(int64_t n_embd) : n_embd(n_embd) {}
+    virtual ~llm_graph_input_embd_h() = default;
+
+    void set_input(const llama_ubatch * ubatch) override; // uploads tokens or embd, and h when ubatch->embd is set
+
+    bool can_reuse(const llm_graph_params & params) override; // true if the needed tensors match the new ubatch's n_tokens
+
+    ggml_tensor * tokens = nullptr; // I32 [n_batch]
+    ggml_tensor * embd   = nullptr; // F32 [n_embd, n_batch]
+    ggml_tensor * h      = nullptr; // F32 [n_embd, n_batch], hidden state (currently filled from ubatch->embd)
+
+    const int64_t n_embd = 0; // row size checked against embd->ne[0] and h->ne[0] in set_input
+};
+
 class llm_graph_input_pos : public llm_graph_input_i {
 public:
     llm_graph_input_pos(uint32_t n_pos_per_embd) : n_pos_per_embd(n_pos_per_embd) {}
diff --git a/src/models/qwen35.cpp b/src/models/qwen35.cpp
index 04ecc18fcdc..ba63ae441df 100644
--- a/src/models/qwen35.cpp
+++ b/src/models/qwen35.cpp
@@ -508,28 +508,41 @@ llama_model_qwen35::graph_mtp::graph_mtp(const llama_model & model, const llm_gr
     int sections[4];
     std::copy(std::begin(hparams.rope_sections), std::begin(hparams.rope_sections) + 4, sections);
 
-    auto inp = std::make_unique<llm_graph_input_embd>(hparams.n_embd);
+    // TODO: extract in a common llm_graph_context::build_inp_embd_h()
+    auto inp = std::make_unique<llm_graph_input_embd_h>(hparams.n_embd);
 
     inp->tokens = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_tokens);
     ggml_set_input(inp->tokens);
 
-    inp->embd = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, hparams.n_embd, n_tokens);
+    inp->embd = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, hparams.n_embd_inp(), n_tokens);
     ggml_set_input(inp->embd);
-    ggml_set_name(inp->embd, "mtp_h_input");
 
-    ggml_tensor * tok_embd_w = layer.nextn.embed_tokens ? layer.nextn.embed_tokens : model.tok_embd;
+    // TODO: make static using `ggml_build_forward_select()`
+    //       see llm_graph_context::build_inp_embd() for reference
+    ggml_tensor * tok_embd;
+    if (ubatch.token) {
+        ggml_tensor * tok_embd_w = layer.nextn.embed_tokens ? layer.nextn.embed_tokens : model.tok_embd;
 
-    ggml_tensor * h_input  = inp->embd;
-    ggml_tensor * tok_embd = ggml_get_rows(ctx0, tok_embd_w, inp->tokens);
+        tok_embd = ggml_get_rows(ctx0, tok_embd_w, inp->tokens);
+    } else {
+        tok_embd = inp->embd;
+    }
     cb(tok_embd, "mtp_tok_embd", il);
 
+    inp->h = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, hparams.n_embd, n_tokens);
+    ggml_set_input(inp->h);
+    ggml_set_name(inp->h, "mtp_h_input");
+
+    ggml_tensor * h_embd = inp->h;
+
     res->add_input(std::move(inp));
 
     ggml_tensor * inp_pos     = build_inp_pos();
     ggml_tensor * inp_out_ids = build_inp_out_ids();
-    auto * inp_attn           = build_attn_inp_kv();
 
-    ggml_tensor * h_norm = build_norm(h_input, layer.nextn.hnorm, nullptr, LLM_NORM_RMS, il);
+    auto * inp_attn = build_attn_inp_kv();
+
+    ggml_tensor * h_norm = build_norm(h_embd, layer.nextn.hnorm, nullptr, LLM_NORM_RMS, il);
     cb(h_norm, "mtp_hnorm", il);
 
     ggml_tensor * e_norm = build_norm(tok_embd, layer.nextn.enorm, nullptr, LLM_NORM_RMS, il);
diff --git a/src/models/qwen35moe.cpp b/src/models/qwen35moe.cpp
index dc24f6ed537..4f87d55d911 100644
--- a/src/models/qwen35moe.cpp
+++ b/src/models/qwen35moe.cpp
@@ -571,29 +571,41 @@ llama_model_qwen35moe::graph_mtp::graph_mtp(const llama_model & model, const llm
     int sections[4];
     std::copy(std::begin(hparams.rope_sections), std::begin(hparams.rope_sections) + 4, sections);
 
-    auto inp = std::make_unique<llm_graph_input_embd>(hparams.n_embd);
+    // TODO: extract in a common llm_graph_context::build_inp_embd_h()
+    auto inp = std::make_unique<llm_graph_input_embd_h>(hparams.n_embd);
 
     inp->tokens = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_tokens);
     ggml_set_input(inp->tokens);
 
-    inp->embd = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, hparams.n_embd, n_tokens);
+    inp->embd = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, hparams.n_embd_inp(), n_tokens);
     ggml_set_input(inp->embd);
-    ggml_set_name(inp->embd, "mtp_h_input");
 
-    ggml_tensor * tok_embd_w = layer.nextn.embed_tokens ? layer.nextn.embed_tokens : model.tok_embd;
+    // TODO: make static using `ggml_build_forward_select()`
+    //       see llm_graph_context::build_inp_embd() for reference
+    ggml_tensor * tok_embd;
+    if (ubatch.token) {
+        ggml_tensor * tok_embd_w = layer.nextn.embed_tokens ? layer.nextn.embed_tokens : model.tok_embd;
 
-    ggml_tensor * h_input  = inp->embd;
-    ggml_tensor * tok_embd = ggml_get_rows(ctx0, tok_embd_w, inp->tokens);
+        tok_embd = ggml_get_rows(ctx0, tok_embd_w, inp->tokens);
+    } else {
+        tok_embd = inp->embd;
+    }
     cb(tok_embd, "mtp_tok_embd", il);
 
+    inp->h = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, hparams.n_embd, n_tokens);
+    ggml_set_input(inp->h);
+    ggml_set_name(inp->h, "mtp_h_input");
+
+    ggml_tensor * h_embd = inp->h;
+
     res->add_input(std::move(inp));
 
     ggml_tensor * inp_pos     = build_inp_pos();
     ggml_tensor * inp_out_ids = build_inp_out_ids();
-    auto * inp_attn           = build_attn_inp_kv();
 
+    auto * inp_attn = build_attn_inp_kv();
 
-    ggml_tensor * h_norm = build_norm(h_input, layer.nextn.hnorm, nullptr, LLM_NORM_RMS, il);
+    ggml_tensor * h_norm = build_norm(h_embd, layer.nextn.hnorm, nullptr, LLM_NORM_RMS, il);
     cb(h_norm, "mtp_hnorm", il);
 
     ggml_tensor * e_norm = build_norm(tok_embd, layer.nextn.enorm, nullptr, LLM_NORM_RMS, il);

From b000431a0bec02acc4173f5937313ab72fc476c5 Mon Sep 17 00:00:00 2001
From: Omid Azizi <oazizi@gimletlabs.ai>
Date: Thu, 28 May 2026 23:21:37 -0700
Subject: [PATCH 23/50] ngram-mod : Add missing include (#23857)

[no release]

Signed-off-by: Omid Azizi <oazizi@gimletlabs.ai>
---
 common/ngram-mod.cpp | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/common/ngram-mod.cpp b/common/ngram-mod.cpp
index 76f7257f611..1b5a09a5eb6 100644
--- a/common/ngram-mod.cpp
+++ b/common/ngram-mod.cpp
@@ -1,5 +1,7 @@
 #include "ngram-mod.h"
 
+#include <algorithm>
+
 //
 // common_ngram_mod
 //

From ea02bc37f50b73bbc73f2fe41ee91863b390d781 Mon Sep 17 00:00:00 2001
From: Georgi Gerganov <ggerganov@gmail.com>
Date: Fri, 29 May 2026 09:46:12 +0300
Subject: [PATCH 24/50] ggml : bump version to 0.13.1 (ggml/1523)

---
 ggml/CMakeLists.txt | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/ggml/CMakeLists.txt b/ggml/CMakeLists.txt
index f542f18b6d4..dc8899b46ef 100644
--- a/ggml/CMakeLists.txt
+++ b/ggml/CMakeLists.txt
@@ -5,7 +5,7 @@ project("ggml" C CXX ASM)
 ### GGML Version
 set(GGML_VERSION_MAJOR 0)
 set(GGML_VERSION_MINOR 13)
-set(GGML_VERSION_PATCH 0)
+set(GGML_VERSION_PATCH 1)
 set(GGML_VERSION_BASE "${GGML_VERSION_MAJOR}.${GGML_VERSION_MINOR}.${GGML_VERSION_PATCH}")
 
 list(APPEND CMAKE_MODULE_PATH "${CMAKE_CURRENT_SOURCE_DIR}/cmake/")

From fe12e422ad40e6845f9f422fc35c0f9b24441d8b Mon Sep 17 00:00:00 2001
From: Georgi Gerganov <ggerganov@gmail.com>
Date: Fri, 29 May 2026 09:53:41 +0300
Subject: [PATCH 25/50] sync : ggml

---
 scripts/sync-ggml.last | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/scripts/sync-ggml.last b/scripts/sync-ggml.last
index a4f87b2b9ae..538ef80bc7a 100644
--- a/scripts/sync-ggml.last
+++ b/scripts/sync-ggml.last
@@ -1 +1 @@
-e705c5fed490514458bdd2eaddc43bd098fcce9b
+1e33fed33e87c43aa4c4078e2a9c239d4c1f1bd3

From 031ddb2e08962837c899374297da75f91cc0157d Mon Sep 17 00:00:00 2001
From: Aman Gupta <amangupta052@gmail.com>
Date: Fri, 29 May 2026 15:44:43 +0800
Subject: [PATCH 26/50] llama: use f16 mask for FA to save VRAM (#23764)

* llama: use f16 mask for FA

* review: add llama_cast + formatting

* simplify
---
 src/llama-graph.cpp    | 112 +++++++++++++++++++++++------------------
 src/llama-graph.h      |  28 +++++------
 src/llama-impl.h       |  14 ++++++
 src/llama-kv-cache.cpp |  53 +++++++++++--------
 4 files changed, 122 insertions(+), 85 deletions(-)

diff --git a/src/llama-graph.cpp b/src/llama-graph.cpp
index 1a4fa692141..9ce4c4479d5 100644
--- a/src/llama-graph.cpp
+++ b/src/llama-graph.cpp
@@ -29,7 +29,10 @@ static ggml_tensor * build_attn_inp_kq_mask(
     const auto n_tokens = ubatch.n_tokens;
     const auto n_stream = cparams.kv_unified ? 1 : ubatch.n_seqs_unq;
 
-    ggml_tensor * res = ggml_new_tensor_4d(ctx, GGML_TYPE_F32, n_kv, n_tokens/n_stream, 1, n_stream);
+    // flash attention requires an f16 mask
+    const auto type = cparams.flash_attn ? GGML_TYPE_F16 : GGML_TYPE_F32;
+
+    ggml_tensor * res = ggml_new_tensor_4d(ctx, type, n_kv, n_tokens/n_stream, 1, n_stream);
     ggml_set_input(res);
     ggml_set_name(res, "attn_inp_kq_mask");
 
@@ -381,7 +384,8 @@ void llm_graph_input_cross_embd::set_input(const llama_ubatch * ubatch) {
     }
 }
 
-static void print_mask(const float * data, int64_t n_tokens, int64_t n_kv, int64_t n_swa, llama_swa_type swa_type) {
+template <typename T>
+static void print_mask(const T * data, int64_t n_tokens, int64_t n_kv, int64_t n_swa, llama_swa_type swa_type) {
     LLAMA_LOG_DEBUG("%s: === Attention mask ===\n", __func__);
     const char * swa_type_str = "unknown";
 
@@ -405,7 +409,7 @@ static void print_mask(const float * data, int64_t n_tokens, int64_t n_kv, int64
     for (int i = 0; i < std::min((int64_t)20, n_tokens); ++i) {
         LLAMA_LOG_DEBUG(" %2d ", i);
         for (int j = 0; j < std::min((int64_t)20, n_kv); ++j) {
-            float val = data[i * n_kv + j];
+            float val = llama_cast<float>(data[i * n_kv + j]);
             if (val == -INFINITY) {
                 LLAMA_LOG_DEBUG(" ∞");
             } else {
@@ -420,7 +424,10 @@ void llm_graph_input_attn_no_cache::set_input(const llama_ubatch * ubatch) {
     const int64_t n_kv     = ubatch->n_tokens;
     const int64_t n_tokens = ubatch->n_tokens;
 
-    const auto fill_mask = [&](float * data, int n_swa, llama_swa_type swa_type) {
+    const auto fill_mask = [&](auto * data, int64_t ne, int n_swa, llama_swa_type swa_type) {
+        using T = std::remove_reference_t<decltype(*data)>;
+        std::fill(data, data + ne, llama_cast<T>(-INFINITY));
+
         for (int i1 = 0; i1 < n_tokens; ++i1) {
             const llama_seq_id s1 = ubatch->seq_id[i1][0];
             const llama_pos    p1 = ubatch->pos[i1];
@@ -446,38 +453,30 @@ void llm_graph_input_attn_no_cache::set_input(const llama_ubatch * ubatch) {
                     continue;
                 }
 
-                data[idst + i0] = hparams.use_alibi ? -std::abs(p0 - p1) : 0.0f;
+                data[idst + i0] = llama_cast<T>(hparams.use_alibi ? -std::abs(p0 - p1) : 0.0f);
             }
         }
-    };
-
-    {
-        GGML_ASSERT(self_kq_mask);
-        GGML_ASSERT(ggml_backend_buffer_is_host(self_kq_mask->buffer));
-
-        float * data = (float *) self_kq_mask->data;
-
-        std::fill(data, data + ggml_nelements(self_kq_mask), -INFINITY);
-
-        fill_mask(data, 0, LLAMA_SWA_TYPE_NONE);
 
         if (debug) {
-            print_mask(data, n_tokens, n_kv, 0, LLAMA_SWA_TYPE_NONE);
+            print_mask(data, n_tokens, n_kv, n_swa, swa_type);
         }
+    };
+
+    GGML_ASSERT(self_kq_mask);
+    GGML_ASSERT(ggml_backend_buffer_is_host(self_kq_mask->buffer));
+    if (self_kq_mask->type == GGML_TYPE_F16) {
+        fill_mask((ggml_fp16_t *) self_kq_mask->data, ggml_nelements(self_kq_mask), 0, LLAMA_SWA_TYPE_NONE);
+    } else {
+        fill_mask((float       *) self_kq_mask->data, ggml_nelements(self_kq_mask), 0, LLAMA_SWA_TYPE_NONE);
     }
 
     if (hparams.swa_type != LLAMA_SWA_TYPE_NONE) {
         GGML_ASSERT(self_kq_mask_swa);
         GGML_ASSERT(ggml_backend_buffer_is_host(self_kq_mask_swa->buffer));
-
-        float * data = (float *) self_kq_mask_swa->data;
-
-        std::fill(data, data + ggml_nelements(self_kq_mask_swa), -INFINITY);
-
-        fill_mask(data, hparams.n_swa, hparams.swa_type);
-
-        if (debug) {
-            print_mask(data, n_tokens, n_kv, hparams.n_swa, hparams.swa_type);
+        if (self_kq_mask_swa->type == GGML_TYPE_F16) {
+            fill_mask((ggml_fp16_t *) self_kq_mask_swa->data, ggml_nelements(self_kq_mask_swa), hparams.n_swa, hparams.swa_type);
+        } else {
+            fill_mask((float       *) self_kq_mask_swa->data, ggml_nelements(self_kq_mask_swa), hparams.n_swa, hparams.swa_type);
         }
     }
 }
@@ -601,23 +600,30 @@ void llm_graph_input_attn_cross::set_input(const llama_ubatch * ubatch) {
     GGML_ASSERT(ggml_backend_buffer_is_host(cross_kq_mask->buffer));
     GGML_ASSERT(!ubatch->equal_seqs()); // TODO: use ubatch->n_seqs instead of failing
 
-    float * data = (float *) cross_kq_mask->data;
-
-    for (int i = 0; i < n_tokens; ++i) {
-        GGML_ASSERT(!cross->seq_ids_enc.empty() && "llama_encode must be called first");
-        for (int j = 0; j < n_enc; ++j) {
-            float f = -INFINITY;
+    const auto fill_mask = [&](auto * data) {
+        using T = std::remove_reference_t<decltype(*data)>;
+        for (int i = 0; i < n_tokens; ++i) {
+            GGML_ASSERT(!cross->seq_ids_enc.empty() && "llama_encode must be called first");
+            for (int j = 0; j < n_enc; ++j) {
+                float f = -INFINITY;
 
-            for (int s = 0; s < ubatch->n_seq_id[i]; ++s) {
-                const llama_seq_id seq_id = ubatch->seq_id[i][s];
+                for (int s = 0; s < ubatch->n_seq_id[i]; ++s) {
+                    const llama_seq_id seq_id = ubatch->seq_id[i][s];
 
-                if (cross->seq_ids_enc[j].find(seq_id) != cross->seq_ids_enc[j].end()) {
-                    f = 0.0f;
+                    if (cross->seq_ids_enc[j].find(seq_id) != cross->seq_ids_enc[j].end()) {
+                        f = 0.0f;
+                    }
                 }
-            }
 
-            data[i*n_enc + j] = f;
+                data[i*n_enc + j] = llama_cast<T>(f);
+            }
         }
+    };
+
+    if (cross_kq_mask->type == GGML_TYPE_F16) {
+        fill_mask((ggml_fp16_t *) cross_kq_mask->data);
+    } else {
+        fill_mask((float *) cross_kq_mask->data);
     }
 }
 
@@ -2121,17 +2127,20 @@ ggml_tensor * llm_graph_context::build_attn_mha(
 llm_graph_input_attn_no_cache * llm_graph_context::build_attn_inp_no_cache() const {
     auto inp = std::make_unique<llm_graph_input_attn_no_cache>(hparams, cparams);
 
+    // flash attention requires an f16 mask
+    const auto type_mask = cparams.flash_attn ? GGML_TYPE_F16 : GGML_TYPE_F32;
+
     // note: there is no KV cache, so the number of KV values is equal to the number of tokens in the batch
-    inp->self_kq_mask = ggml_new_tensor_4d(ctx0, GGML_TYPE_F32, n_tokens, n_tokens, 1, 1);
+    inp->self_kq_mask = ggml_new_tensor_4d(ctx0, type_mask, n_tokens, n_tokens, 1, 1);
     ggml_set_input(inp->self_kq_mask);
 
-    inp->self_kq_mask_cnv = cparams.flash_attn ? ggml_cast(ctx0, inp->self_kq_mask, GGML_TYPE_F16) : inp->self_kq_mask;
+    inp->self_kq_mask_cnv = inp->self_kq_mask;
 
     if (hparams.swa_type != LLAMA_SWA_TYPE_NONE) {
-        inp->self_kq_mask_swa = ggml_new_tensor_4d(ctx0, GGML_TYPE_F32, n_tokens, n_tokens, 1, 1);
+        inp->self_kq_mask_swa = ggml_new_tensor_4d(ctx0, type_mask, n_tokens, n_tokens, 1, 1);
         ggml_set_input(inp->self_kq_mask_swa);
 
-        inp->self_kq_mask_swa_cnv = cparams.flash_attn ? ggml_cast(ctx0, inp->self_kq_mask_swa, GGML_TYPE_F16) : inp->self_kq_mask_swa;
+        inp->self_kq_mask_swa_cnv = inp->self_kq_mask_swa;
     } else {
         inp->self_kq_mask_swa     = nullptr;
         inp->self_kq_mask_swa_cnv = nullptr;
@@ -2208,7 +2217,7 @@ static std::unique_ptr<llm_graph_input_attn_kv> build_attn_inp_kv_impl(
         inp->self_v_idxs = mctx_cur->build_input_v_idxs(ctx0, ubatch);
 
         inp->self_kq_mask = build_attn_inp_kq_mask(ctx0, mctx_cur, ubatch, cparams);
-        inp->self_kq_mask_cnv = cparams.flash_attn ? ggml_cast(ctx0, inp->self_kq_mask, GGML_TYPE_F16) : inp->self_kq_mask;
+        inp->self_kq_mask_cnv = inp->self_kq_mask;
     }
 
     inp->self_k_rot = mctx_cur->build_input_k_rot(ctx0);
@@ -2315,7 +2324,7 @@ static std::unique_ptr<llm_graph_input_attn_k> build_attn_inp_k_impl(
         inp->self_k_idxs = mctx_cur->build_input_k_idxs(ctx0, ubatch);
 
         inp->self_kq_mask = build_attn_inp_kq_mask(ctx0, mctx_cur, ubatch, cparams);
-        inp->self_kq_mask_cnv = cparams.flash_attn ? ggml_cast(ctx0, inp->self_kq_mask, GGML_TYPE_F16) : inp->self_kq_mask;
+        inp->self_kq_mask_cnv = inp->self_kq_mask;
     }
 
     return inp;
@@ -2479,10 +2488,13 @@ llm_graph_input_attn_cross * llm_graph_context::build_attn_inp_cross() const {
 
     const int32_t n_enc = !cross->v_embd.empty() ? cross->n_enc : hparams.n_ctx_train;
 
-    inp->cross_kq_mask = ggml_new_tensor_4d(ctx0, GGML_TYPE_F32, n_enc, n_tokens, 1, 1);
+    // flash attention requires an f16 mask
+    const auto type_mask = cparams.flash_attn ? GGML_TYPE_F16 : GGML_TYPE_F32;
+
+    inp->cross_kq_mask = ggml_new_tensor_4d(ctx0, type_mask, n_enc, n_tokens, 1, 1);
     ggml_set_input(inp->cross_kq_mask);
 
-    inp->cross_kq_mask_cnv = cparams.flash_attn ? ggml_cast(ctx0, inp->cross_kq_mask, GGML_TYPE_F16) : inp->cross_kq_mask;
+    inp->cross_kq_mask_cnv = inp->cross_kq_mask;
 
     return (llm_graph_input_attn_cross *) res->add_input(std::move(inp));
 }
@@ -2543,7 +2555,7 @@ llm_graph_input_attn_kv_iswa * llm_graph_context::build_attn_inp_kv_iswa() const
         inp->self_v_idxs = mctx_cur->get_base()->build_input_v_idxs(ctx0, ubatch);
 
         inp->self_kq_mask = build_attn_inp_kq_mask(ctx0, mctx_cur->get_base(), ubatch, cparams);
-        inp->self_kq_mask_cnv = cparams.flash_attn ? ggml_cast(ctx0, inp->self_kq_mask, GGML_TYPE_F16) : inp->self_kq_mask;
+        inp->self_kq_mask_cnv = inp->self_kq_mask;
     }
 
     {
@@ -2553,7 +2565,7 @@ llm_graph_input_attn_kv_iswa * llm_graph_context::build_attn_inp_kv_iswa() const
         inp->self_v_idxs_swa = mctx_cur->get_swa()->build_input_v_idxs(ctx0, ubatch);
 
         inp->self_kq_mask_swa = build_attn_inp_kq_mask(ctx0, mctx_cur->get_swa(), ubatch, cparams);
-        inp->self_kq_mask_swa_cnv = cparams.flash_attn ? ggml_cast(ctx0, inp->self_kq_mask_swa, GGML_TYPE_F16) : inp->self_kq_mask_swa;
+        inp->self_kq_mask_swa_cnv = inp->self_kq_mask_swa;
     }
 
     inp->self_k_rot = mctx_cur->get_base()->build_input_k_rot(ctx0);
@@ -2722,7 +2734,7 @@ llm_graph_input_mem_hybrid_iswa * llm_graph_context::build_inp_mem_hybrid_iswa()
         inp_attn->self_v_idxs = attn_ctx->get_base()->build_input_v_idxs(ctx0, ubatch);
 
         inp_attn->self_kq_mask = build_attn_inp_kq_mask(ctx0, attn_ctx->get_base(), ubatch, cparams);
-        inp_attn->self_kq_mask_cnv = cparams.flash_attn ? ggml_cast(ctx0, inp_attn->self_kq_mask, GGML_TYPE_F16) : inp_attn->self_kq_mask;
+        inp_attn->self_kq_mask_cnv = inp_attn->self_kq_mask;
     }
 
     {
@@ -2730,7 +2742,7 @@ llm_graph_input_mem_hybrid_iswa * llm_graph_context::build_inp_mem_hybrid_iswa()
         inp_attn->self_v_idxs_swa = attn_ctx->get_swa()->build_input_v_idxs(ctx0, ubatch);
 
         inp_attn->self_kq_mask_swa = build_attn_inp_kq_mask(ctx0, attn_ctx->get_swa(), ubatch, cparams);
-        inp_attn->self_kq_mask_swa_cnv = cparams.flash_attn ? ggml_cast(ctx0, inp_attn->self_kq_mask_swa, GGML_TYPE_F16) : inp_attn->self_kq_mask_swa;
+        inp_attn->self_kq_mask_swa_cnv = inp_attn->self_kq_mask_swa;
     }
 
     auto inp = std::make_unique<llm_graph_input_mem_hybrid_iswa>(cparams, std::move(inp_attn), std::move(inp_rs), mctx_cur);
diff --git a/src/llama-graph.h b/src/llama-graph.h
index e240ade7b0c..9f4816959fe 100644
--- a/src/llama-graph.h
+++ b/src/llama-graph.h
@@ -291,10 +291,10 @@ class llm_graph_input_attn_no_cache : public llm_graph_input_i {
     ggml_tensor * get_kq_mask_swa() const { return self_kq_mask_swa_cnv; }
 
     // n_tokens == n_batch
-    ggml_tensor * self_kq_mask         = nullptr; // F32 [n_tokens, n_batch/n_stream, 1, n_stream]
-    ggml_tensor * self_kq_mask_cnv     = nullptr; //     [n_tokens, n_batch/n_stream, 1, n_stream]
-    ggml_tensor * self_kq_mask_swa     = nullptr; // F32 [n_tokens, n_batch/n_stream, 1, n_stream]
-    ggml_tensor * self_kq_mask_swa_cnv = nullptr; //     [n_tokens, n_batch/n_stream, 1, n_stream]
+    ggml_tensor * self_kq_mask         = nullptr; // F32/F16 [n_tokens, n_batch/n_stream, 1, n_stream]
+    ggml_tensor * self_kq_mask_cnv     = nullptr; //         [n_tokens, n_batch/n_stream, 1, n_stream]
+    ggml_tensor * self_kq_mask_swa     = nullptr; // F32/F16 [n_tokens, n_batch/n_stream, 1, n_stream]
+    ggml_tensor * self_kq_mask_swa_cnv = nullptr; //         [n_tokens, n_batch/n_stream, 1, n_stream]
 
     const llama_hparams hparams;
     const llama_cparams cparams;
@@ -324,8 +324,8 @@ class llm_graph_input_attn_kv : public llm_graph_input_i {
     ggml_tensor * self_k_idxs = nullptr; // I64 [n_batch]
     ggml_tensor * self_v_idxs = nullptr; // I64 [n_batch] or [n_batch*n_embd_v_gqa]
 
-    ggml_tensor * self_kq_mask     = nullptr; // F32 [n_kv, n_batch/n_stream, 1, n_stream]
-    ggml_tensor * self_kq_mask_cnv = nullptr; //     [n_kv, n_batch/n_stream, 1, n_stream]
+    ggml_tensor * self_kq_mask     = nullptr; // F32/F16 [n_kv, n_batch/n_stream, 1, n_stream]
+    ggml_tensor * self_kq_mask_cnv = nullptr; //         [n_kv, n_batch/n_stream, 1, n_stream]
 
     // note: assumes v_rot^2 == I
     ggml_tensor * self_k_rot = nullptr;
@@ -364,8 +364,8 @@ class llm_graph_input_attn_k : public llm_graph_input_i {
 
     ggml_tensor * self_k_idxs = nullptr; // I64 [n_batch]
 
-    ggml_tensor * self_kq_mask     = nullptr; // F32 [n_kv, n_batch/n_stream, 1, n_stream]
-    ggml_tensor * self_kq_mask_cnv = nullptr; //     [n_kv, n_batch/n_stream, 1, n_stream]
+    ggml_tensor * self_kq_mask     = nullptr; // F32/F16 [n_kv, n_batch/n_stream, 1, n_stream]
+    ggml_tensor * self_kq_mask_cnv = nullptr; //         [n_kv, n_batch/n_stream, 1, n_stream]
 
     const llama_hparams hparams;
     const llama_cparams cparams;
@@ -402,10 +402,10 @@ class llm_graph_input_attn_kv_iswa : public llm_graph_input_i {
     ggml_tensor * self_k_idxs_swa = nullptr; // I64 [n_batch]
     ggml_tensor * self_v_idxs_swa = nullptr; // I64 [n_batch] or [n_batch*n_embd_v_gqa]
 
-    ggml_tensor * self_kq_mask         = nullptr; // F32 [n_kv, n_batch/n_stream, 1, n_stream]
-    ggml_tensor * self_kq_mask_cnv     = nullptr; //     [n_kv, n_batch/n_stream, 1, n_stream]
-    ggml_tensor * self_kq_mask_swa     = nullptr; // F32 [n_kv, n_batch/n_stream, 1, n_stream]
-    ggml_tensor * self_kq_mask_swa_cnv = nullptr; //     [n_kv, n_batch/n_stream, 1, n_stream]
+    ggml_tensor * self_kq_mask         = nullptr; // F32/F16 [n_kv, n_batch/n_stream, 1, n_stream]
+    ggml_tensor * self_kq_mask_cnv     = nullptr; //         [n_kv, n_batch/n_stream, 1, n_stream]
+    ggml_tensor * self_kq_mask_swa     = nullptr; // F32/F16 [n_kv, n_batch/n_stream, 1, n_stream]
+    ggml_tensor * self_kq_mask_swa_cnv = nullptr; //         [n_kv, n_batch/n_stream, 1, n_stream]
 
     ggml_tensor * self_k_rot = nullptr;
     ggml_tensor * self_v_rot = nullptr;
@@ -428,8 +428,8 @@ class llm_graph_input_attn_cross : public llm_graph_input_i {
 
     ggml_tensor * get_kq_mask_cross() const { return cross_kq_mask_cnv; }
 
-    ggml_tensor * cross_kq_mask     = nullptr; // F32 [n_outputs_enc, n_batch, 1, 1]
-    ggml_tensor * cross_kq_mask_cnv = nullptr; // F32 [n_outputs_enc, n_batch, 1, 1]
+    ggml_tensor * cross_kq_mask     = nullptr; // F32/F16 [n_outputs_enc, n_batch, 1, 1]
+    ggml_tensor * cross_kq_mask_cnv = nullptr; // F32/F16 [n_outputs_enc, n_batch, 1, 1]
 
     const llama_cross * cross = nullptr;
 };
diff --git a/src/llama-impl.h b/src/llama-impl.h
index e4f35c8e53d..7923c3f7ed5 100644
--- a/src/llama-impl.h
+++ b/src/llama-impl.h
@@ -3,6 +3,7 @@
 #include "ggml.h" // for ggml_log_level
 
 #include <string>
+#include <type_traits>
 #include <vector>
 
 #ifdef __GNUC__
@@ -40,6 +41,19 @@ struct no_init {
     no_init() = default;
 };
 
+template <typename dst_t, typename src_t>
+static inline dst_t llama_cast(src_t v) {
+    if constexpr (std::is_same_v<src_t, dst_t>) {
+        return v;
+    } else if constexpr (std::is_same_v<src_t, ggml_fp16_t> && std::is_same_v<dst_t, float>) {
+        return ggml_fp16_to_fp32(v);
+    } else if constexpr (std::is_same_v<src_t, float> && std::is_same_v<dst_t, ggml_fp16_t>) {
+        return ggml_fp32_to_fp16(v);
+    } else {
+        static_assert(std::is_same_v<dst_t, void>, "unsupported type combination");
+    }
+}
+
 struct time_meas {
     time_meas(int64_t & t_acc, bool disable = false);
     ~time_meas();
diff --git a/src/llama-kv-cache.cpp b/src/llama-kv-cache.cpp
index a49a055a630..2356d612b1b 100644
--- a/src/llama-kv-cache.cpp
+++ b/src/llama-kv-cache.cpp
@@ -1430,8 +1430,8 @@ struct args_set_input_kq_mask {
     int64_t n_tps;
 };
 
-template<bool causal, bool swa, bool is_2d, bool alibi>
-static void set_input_kq_mask_impl(const args_set_input_kq_mask & args, float * data) {
+template<typename T, bool causal, bool swa, bool is_2d, bool alibi>
+static void set_input_kq_mask_impl(const args_set_input_kq_mask & args, T * data) {
   //const auto & hparams = args.hparams;
     const auto & ubatch  = args.ubatch;
 
@@ -1445,6 +1445,9 @@ static void set_input_kq_mask_impl(const args_set_input_kq_mask & args, float *
     const int64_t n_stream = args.n_stream;
     const int64_t n_tps    = args.n_tps;
 
+    const T mask_keep = llama_cast<T>(0.0f);
+    const T mask_drop = llama_cast<T>(-INFINITY);
+
     // the min position in the batch for each sequence
     llama_pos seq_pos_min[LLAMA_MAX_SEQ];
     std::fill(seq_pos_min, seq_pos_min + LLAMA_MAX_SEQ, INT32_MAX);
@@ -1563,46 +1566,55 @@ static void set_input_kq_mask_impl(const args_set_input_kq_mask & args, float *
                 }
 
                 if (alibi) {
-                    data[idst + j] = -std::abs(p0 - p1);
+                    data[idst + j] = llama_cast<T>(static_cast<float>(-std::abs(p0 - p1)));
                 } else {
-                    data[idst + j] = 0.0f;
+                    data[idst + j] = mask_keep;
                 }
 
                 continue;
 skip:
-                data[idst + j] = -INFINITY;
+                data[idst + j] = mask_drop;
             }
         }
     }
 }
 
-template<bool causal, bool swa, bool is_2d>
-static void set_input_kq_mask_impl(const args_set_input_kq_mask & args, float * data) {
+template<typename T, bool causal, bool swa, bool is_2d>
+static void set_input_kq_mask_impl(const args_set_input_kq_mask & args, T * data) {
     const bool alibi = args.hparams.use_alibi;
     if (alibi) {
-        set_input_kq_mask_impl<causal, swa, is_2d, true> (args, data);
+        set_input_kq_mask_impl<T, causal, swa, is_2d, true> (args, data);
     } else {
-        set_input_kq_mask_impl<causal, swa, is_2d, false>(args, data);
+        set_input_kq_mask_impl<T, causal, swa, is_2d, false>(args, data);
     }
 }
 
-template<bool causal, bool swa>
-static void set_input_kq_mask_impl(const args_set_input_kq_mask & args, float * data) {
+template<typename T, bool causal, bool swa>
+static void set_input_kq_mask_impl(const args_set_input_kq_mask & args, T * data) {
     const bool is_2d = args.ubatch->is_pos_2d();
     if (is_2d) {
-        set_input_kq_mask_impl<causal, swa, true> (args, data);
+        set_input_kq_mask_impl<T, causal, swa, true> (args, data);
     } else {
-        set_input_kq_mask_impl<causal, swa, false>(args, data);
+        set_input_kq_mask_impl<T, causal, swa, false>(args, data);
     }
 }
 
-template<bool causal>
-static void set_input_kq_mask_impl(const args_set_input_kq_mask & args, float * data) {
+template<typename T, bool causal>
+static void set_input_kq_mask_impl(const args_set_input_kq_mask & args, T * data) {
     const bool swa = args.swa_type != LLAMA_SWA_TYPE_NONE;
     if (swa) {
-        set_input_kq_mask_impl<causal, true> (args, data);
+        set_input_kq_mask_impl<T, causal, true> (args, data);
     } else {
-        set_input_kq_mask_impl<causal, false>(args, data);
+        set_input_kq_mask_impl<T, causal, false>(args, data);
+    }
+}
+
+template<typename T>
+static void set_input_kq_mask_impl(const args_set_input_kq_mask & args, T * data, bool causal_attn) {
+    if (causal_attn) {
+        set_input_kq_mask_impl<T, true> (args, data);
+    } else {
+        set_input_kq_mask_impl<T, false>(args, data);
     }
 }
 
@@ -1610,7 +1622,6 @@ void llama_kv_cache::set_input_kq_mask(ggml_tensor * dst, const llama_ubatch * u
     const uint32_t n_tokens = ubatch->n_tokens;
 
     GGML_ASSERT(ggml_backend_buffer_is_host(dst->buffer));
-    float * data = (float *) dst->data;
 
     const int64_t n_kv     = dst->ne[0];
     const int64_t n_stream = dst->ne[3]; // num streams in the current ubatch
@@ -1634,10 +1645,10 @@ void llama_kv_cache::set_input_kq_mask(ggml_tensor * dst, const llama_ubatch * u
         /*.n_tps            =*/ n_tps,
     };
 
-    if (causal_attn) {
-        set_input_kq_mask_impl<true> (args, data);
+    if (dst->type == GGML_TYPE_F16) {
+        set_input_kq_mask_impl<ggml_fp16_t>(args, (ggml_fp16_t *) dst->data, causal_attn);
     } else {
-        set_input_kq_mask_impl<false>(args, data);
+        set_input_kq_mask_impl<float>(args, (float *) dst->data, causal_attn);
     }
 
     //const int64_t t_end = ggml_time_us();

From 1f0aa2a6964091c33827b1daae1e2b74118c6fa7 Mon Sep 17 00:00:00 2001
From: fairydreaming <166155368+fairydreaming@users.noreply.github.com>
Date: Fri, 29 May 2026 10:15:17 +0200
Subject: [PATCH 27/50] model : support for DeepseekV32ForCausalLM with generic
 DeepSeek Sparse Attention (DSA) implementation (#23346)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

* llama : support DeepSeek V3.2 model family (with DSA lightning indexer)

* convert : handle DeepseekV32ForCausalLM architecture

* ggml : support for f16 GGML_OP_FILL

* memory : separate hparams argument in llama_kv_cache constructor

* memory : add llama_kv_cache_dsa memory (KV cache + lightning indexer cache)

* llama : support for LLM_ARCH_DEEPSEEK32

* model : llama_model_deepseek32 implementation

* model : merge two scale operations into one in DSA lightning indexer implementation

* chore : remove unused code

* model : support NVFP4 in DeepSeek V3.2

Co-authored-by: Sigbjørn Skjæret <sigbjorn.skjaeret@scala.com>

* memory : refactoring TODO

Co-authored-by: ggerganov <ggerganov@users.noreply.github.com>

---------

Co-authored-by: Stanisław Szymczyk <sszymczy@gmail.com>
Co-authored-by: Sigbjørn Skjæret <sigbjorn.skjaeret@scala.com>
Co-authored-by: ggerganov <ggerganov@users.noreply.github.com>
---
 conversion/__init__.py      |   1 +
 conversion/base.py          |   2 +
 conversion/deepseek.py      |  29 +++
 ggml/src/ggml-cpu/ops.cpp   |  36 ++-
 ggml/src/ggml.c             |   2 +-
 gguf-py/gguf/constants.py   |  46 ++++
 src/CMakeLists.txt          |   1 +
 src/llama-arch.cpp          |   2 +
 src/llama-arch.h            |   1 +
 src/llama-graph.cpp         | 129 +++++++++
 src/llama-graph.h           |  56 ++++
 src/llama-kv-cache-dsa.cpp  | 261 +++++++++++++++++++
 src/llama-kv-cache-dsa.h    | 138 ++++++++++
 src/llama-kv-cache-iswa.cpp |   4 +-
 src/llama-kv-cache.cpp      |  10 +-
 src/llama-kv-cache.h        |   4 +
 src/llama-memory-hybrid.cpp |   1 +
 src/llama-model.cpp         |  25 +-
 src/llama-model.h           |   1 +
 src/models/deepseek32.cpp   | 503 ++++++++++++++++++++++++++++++++++++
 src/models/models.h         |  13 +
 tests/test-llama-archs.cpp  |   3 +
 22 files changed, 1261 insertions(+), 7 deletions(-)
 create mode 100644 src/llama-kv-cache-dsa.cpp
 create mode 100644 src/llama-kv-cache-dsa.h
 create mode 100644 src/models/deepseek32.cpp

diff --git a/conversion/__init__.py b/conversion/__init__.py
index 2a87bd75b44..89e05d691d4 100644
--- a/conversion/__init__.py
+++ b/conversion/__init__.py
@@ -47,6 +47,7 @@
     "DeepseekForCausalLM": "deepseek",
     "DeepseekV2ForCausalLM": "deepseek",
     "DeepseekV3ForCausalLM": "deepseek",
+    "DeepseekV32ForCausalLM": "deepseek",
     "DistilBertForMaskedLM": "bert",
     "DistilBertForSequenceClassification": "bert",
     "DistilBertModel": "bert",
diff --git a/conversion/base.py b/conversion/base.py
index 9cddd1340f7..f861f8b5296 100644
--- a/conversion/base.py
+++ b/conversion/base.py
@@ -915,6 +915,8 @@ def load():
                             gguf.MODEL_TENSOR.SSM_CONV1D_Q,
                             gguf.MODEL_TENSOR.SSM_CONV1D_K,
                             gguf.MODEL_TENSOR.SSM_CONV1D_V,
+                            # DSA indexer weights should be F32
+                            gguf.MODEL_TENSOR.INDEXER_PROJ,
                         )
                     )
                     or new_name[-7:] not in (".weight", ".lora_a", ".lora_b")
diff --git a/conversion/deepseek.py b/conversion/deepseek.py
index e149fcbf752..af18a25a8b5 100644
--- a/conversion/deepseek.py
+++ b/conversion/deepseek.py
@@ -386,3 +386,32 @@ def prepare_tensors(self):
             experts = [k for d in self._experts for k in d.keys()]
             if len(experts) > 0:
                 raise ValueError(f"Unprocessed experts: {experts}")
+
+
+@ModelBase.register("DeepseekV32ForCausalLM")
+class DeepseekV32Model(DeepseekV2Model):
+    model_arch = gguf.MODEL_ARCH.DEEPSEEK32
+    skip_mtp = False
+
+    def __init__(self, *args, **kwargs):
+        super().__init__(*args, **kwargs)
+        self.block_count = self.hparams["num_hidden_layers"] + self.hparams.get("num_nextn_predict_layers", 0)
+        self.tensor_map = gguf.get_tensor_name_map(self.model_arch, self.block_count)
+
+    def set_vocab(self):
+        from transformers import AutoTokenizer
+        tokenizer = AutoTokenizer.from_pretrained(self.dir_model)
+        assert getattr(tokenizer, "add_bos_token", False), "Change value of add_bos_token to true in tokenizer_config.json file."
+        self._set_vocab_gpt2()
+
+    def set_gguf_parameters(self):
+        super().set_gguf_parameters()
+
+        # NextN/MTP prediction layers
+        if (num_nextn_predict_layers := self.hparams.get("num_nextn_predict_layers")) is not None:
+            self.gguf_writer.add_nextn_predict_layers(num_nextn_predict_layers)
+
+        # DSA indexer parameters
+        self.gguf_writer.add_indexer_head_count(self.hparams["index_n_heads"])
+        self.gguf_writer.add_indexer_key_length(self.hparams["index_head_dim"])
+        self.gguf_writer.add_indexer_top_k(self.hparams["index_topk"])
diff --git a/ggml/src/ggml-cpu/ops.cpp b/ggml/src/ggml-cpu/ops.cpp
index 7485ba4fc86..dc73696ad9f 100644
--- a/ggml/src/ggml-cpu/ops.cpp
+++ b/ggml/src/ggml-cpu/ops.cpp
@@ -2235,8 +2235,42 @@ static void ggml_compute_forward_fill_f32(const ggml_compute_params * params, gg
     }
 }
 
+static void ggml_compute_forward_fill_f16(const ggml_compute_params * params, ggml_tensor * dst) {
+    const ggml_fp16_t c = GGML_CPU_FP32_TO_FP16(ggml_get_op_params_f32(dst, 0));
+
+    GGML_TENSOR_LOCALS(int64_t, ne, dst, ne);
+    GGML_TENSOR_LOCALS(size_t,  nb, dst, nb);
+
+    const auto [ir0, ir1] = get_thread_range(params, dst);
+
+    for (int64_t ir = ir0; ir < ir1; ++ir) {
+        const int64_t i03 = ir/(ne2*ne1);
+        const int64_t i02 = (ir - i03*ne2*ne1)/ne1;
+        const int64_t i01 = (ir - i03*ne2*ne1 - i02*ne1);
+
+        ggml_fp16_t * dst_ptr  = (ggml_fp16_t *) ((char *) dst->data + i03*nb3 + i02*nb2 + i01*nb1);
+
+        ggml_vec_set_f16(ne0, dst_ptr, c);
+    }
+}
+
 void ggml_compute_forward_fill(const ggml_compute_params * params, ggml_tensor * dst) {
-    ggml_compute_forward_fill_f32(params, dst);
+    const ggml_tensor * src0 = dst->src[0];
+
+    switch (src0->type) {
+        case GGML_TYPE_F32:
+            {
+                ggml_compute_forward_fill_f32(params, dst);
+            } break;
+        case GGML_TYPE_F16:
+            {
+                ggml_compute_forward_fill_f16(params, dst);
+            } break;
+        default:
+            {
+                GGML_ABORT("unsupported type for ggml_compute_forward_fill: %s", ggml_type_name(src0->type));
+            }
+    }
 }
 
 // ggml_compute_tri
diff --git a/ggml/src/ggml.c b/ggml/src/ggml.c
index 476c3079795..8815c67d8bc 100644
--- a/ggml/src/ggml.c
+++ b/ggml/src/ggml.c
@@ -5223,7 +5223,7 @@ static struct ggml_tensor * ggml_fill_impl(
     struct ggml_tensor  * a,
     float                 c,
     bool                  inplace) {
-    GGML_ASSERT(a->type == GGML_TYPE_F32);
+    GGML_ASSERT(a->type == GGML_TYPE_F32 || a->type == GGML_TYPE_F16);
     GGML_ASSERT(ggml_is_contiguous(a));
 
     struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a);
diff --git a/gguf-py/gguf/constants.py b/gguf-py/gguf/constants.py
index 0189f6f03c5..92578490cb3 100644
--- a/gguf-py/gguf/constants.py
+++ b/gguf-py/gguf/constants.py
@@ -451,6 +451,7 @@ class MODEL_ARCH(IntEnum):
     DEEPSEEK         = auto()
     DEEPSEEK2        = auto()
     DEEPSEEK2OCR     = auto()
+    DEEPSEEK32       = auto()
     CHATGLM          = auto()
     GLM4             = auto()
     GLM4_MOE         = auto()
@@ -967,6 +968,7 @@ class MODEL_TENSOR(IntEnum):
     MODEL_ARCH.DEEPSEEK:         "deepseek",
     MODEL_ARCH.DEEPSEEK2:        "deepseek2",
     MODEL_ARCH.DEEPSEEK2OCR:     "deepseek2-ocr",
+    MODEL_ARCH.DEEPSEEK32:       "deepseek32",
     MODEL_ARCH.CHATGLM:          "chatglm",
     MODEL_ARCH.GLM4:             "glm4",
     MODEL_ARCH.GLM4_MOE:         "glm4moe",
@@ -2930,6 +2932,46 @@ class MODEL_TENSOR(IntEnum):
         MODEL_TENSOR.FFN_UP_SHEXP,
         MODEL_TENSOR.FFN_EXP_PROBS_B,
     ],
+    MODEL_ARCH.DEEPSEEK32: [
+        MODEL_TENSOR.TOKEN_EMBD,
+        MODEL_TENSOR.OUTPUT_NORM,
+        MODEL_TENSOR.OUTPUT,
+        MODEL_TENSOR.ROPE_FREQS,
+        MODEL_TENSOR.ATTN_NORM,
+        MODEL_TENSOR.ATTN_Q,
+        MODEL_TENSOR.ATTN_Q_A,
+        MODEL_TENSOR.ATTN_Q_B,
+        MODEL_TENSOR.ATTN_KV_A_MQA,
+        MODEL_TENSOR.ATTN_K_B,
+        MODEL_TENSOR.ATTN_V_B,
+        MODEL_TENSOR.ATTN_Q_A_NORM,
+        MODEL_TENSOR.ATTN_KV_A_NORM,
+        MODEL_TENSOR.ATTN_OUT,
+        MODEL_TENSOR.ATTN_ROT_EMBD,
+        MODEL_TENSOR.FFN_GATE_INP,
+        MODEL_TENSOR.FFN_NORM,
+        MODEL_TENSOR.FFN_GATE,
+        MODEL_TENSOR.FFN_DOWN,
+        MODEL_TENSOR.FFN_UP,
+        MODEL_TENSOR.FFN_GATE_EXP,
+        MODEL_TENSOR.FFN_DOWN_EXP,
+        MODEL_TENSOR.FFN_UP_EXP,
+        MODEL_TENSOR.FFN_GATE_SHEXP,
+        MODEL_TENSOR.FFN_DOWN_SHEXP,
+        MODEL_TENSOR.FFN_UP_SHEXP,
+        MODEL_TENSOR.FFN_EXP_PROBS_B,
+        MODEL_TENSOR.INDEXER_K_NORM,
+        MODEL_TENSOR.INDEXER_PROJ,
+        MODEL_TENSOR.INDEXER_ATTN_K,
+        MODEL_TENSOR.INDEXER_ATTN_Q_B,
+        # NextN/MTP tensors - preserved but unused
+        MODEL_TENSOR.NEXTN_EH_PROJ,
+        MODEL_TENSOR.NEXTN_EMBED_TOKENS,
+        MODEL_TENSOR.NEXTN_ENORM,
+        MODEL_TENSOR.NEXTN_HNORM,
+        MODEL_TENSOR.NEXTN_SHARED_HEAD_HEAD,
+        MODEL_TENSOR.NEXTN_SHARED_HEAD_NORM,
+    ],
     MODEL_ARCH.ERNIE4_5_MOE: [
         MODEL_TENSOR.TOKEN_EMBD,
         MODEL_TENSOR.OUTPUT_NORM,
@@ -4077,6 +4119,10 @@ class MODEL_TENSOR(IntEnum):
         MODEL_TENSOR.ROPE_FREQS,
         MODEL_TENSOR.ATTN_ROT_EMBD,
     ],
+    MODEL_ARCH.DEEPSEEK32: [
+        MODEL_TENSOR.ROPE_FREQS,
+        MODEL_TENSOR.ATTN_ROT_EMBD,
+    ],
     MODEL_ARCH.CHATGLM: [
         MODEL_TENSOR.ROPE_FREQS,
     ],
diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt
index 7b1fcfca0ad..d15ccfd99f1 100644
--- a/src/CMakeLists.txt
+++ b/src/CMakeLists.txt
@@ -24,6 +24,7 @@ add_library(llama
             llama-io.cpp
             llama-kv-cache.cpp
             llama-kv-cache-iswa.cpp
+            llama-kv-cache-dsa.cpp
             llama-memory.cpp
             llama-memory-hybrid.cpp
             llama-memory-hybrid-iswa.cpp
diff --git a/src/llama-arch.cpp b/src/llama-arch.cpp
index e95ba6daac1..b485ac02e75 100644
--- a/src/llama-arch.cpp
+++ b/src/llama-arch.cpp
@@ -75,6 +75,7 @@ static const std::map<llm_arch, const char *> LLM_ARCH_NAMES = {
     { LLM_ARCH_DEEPSEEK,         "deepseek"         },
     { LLM_ARCH_DEEPSEEK2,        "deepseek2"        },
     { LLM_ARCH_DEEPSEEK2OCR,     "deepseek2-ocr"    },
+    { LLM_ARCH_DEEPSEEK32,       "deepseek32"       },
     { LLM_ARCH_CHATGLM,          "chatglm"          },
     { LLM_ARCH_GLM4,             "glm4"             },
     { LLM_ARCH_GLM4_MOE,         "glm4moe"          },
@@ -904,6 +905,7 @@ bool llm_arch_supports_sm_tensor(const llm_arch & arch) {
         case LLM_ARCH_OLMO2:
         case LLM_ARCH_OLMOE:
         case LLM_ARCH_DEEPSEEK2:
+        case LLM_ARCH_DEEPSEEK32:
         case LLM_ARCH_GLM_DSA:
         case LLM_ARCH_BITNET:
         case LLM_ARCH_T5:
diff --git a/src/llama-arch.h b/src/llama-arch.h
index 7c1dcc4d6c2..b59043e408f 100644
--- a/src/llama-arch.h
+++ b/src/llama-arch.h
@@ -79,6 +79,7 @@ enum llm_arch {
     LLM_ARCH_DEEPSEEK,
     LLM_ARCH_DEEPSEEK2,
     LLM_ARCH_DEEPSEEK2OCR,
+    LLM_ARCH_DEEPSEEK32,
     LLM_ARCH_CHATGLM,
     LLM_ARCH_GLM4,
     LLM_ARCH_GLM4_MOE,
diff --git a/src/llama-graph.cpp b/src/llama-graph.cpp
index 9ce4c4479d5..5bca8230b9b 100644
--- a/src/llama-graph.cpp
+++ b/src/llama-graph.cpp
@@ -7,6 +7,7 @@
 
 #include "llama-kv-cache.h"
 #include "llama-kv-cache-iswa.h"
+#include "llama-kv-cache-dsa.h"
 #include "llama-memory-hybrid.h"
 #include "llama-memory-hybrid-iswa.h"
 #include "llama-memory-recurrent.h"
@@ -531,6 +532,34 @@ bool llm_graph_input_attn_k::can_reuse(const llm_graph_params & params) {
     return res;
 }
 
+void llm_graph_input_attn_k_dsa::set_input(const llama_ubatch * ubatch) {
+    mctx->get_mla()->set_input_k_idxs(self_k_idxs_mla, ubatch);
+    // mask for the MLA cache, built for this ubatch
+    mctx->get_mla()->set_input_kq_mask(self_kq_mask_mla, ubatch, cparams.causal_attn);
+    // same two inputs again, this time for the lightning indexer (lid) cache
+    mctx->get_lid()->set_input_k_idxs(self_k_idxs_lid, ubatch);
+
+    mctx->get_lid()->set_input_kq_mask(self_kq_mask_lid, ubatch, cparams.causal_attn);
+    // NOTE(review): the rotation input exists only for the lid cache; assumes self_k_rot_lid is non-null - confirm
+    mctx->get_lid()->set_input_k_rot(self_k_rot_lid);
+}
+
+bool llm_graph_input_attn_k_dsa::can_reuse(const llm_graph_params & params) {
+    const auto * mctx = static_cast<const llama_kv_cache_dsa_context *>(params.mctx);
+    // rebind this input to the memory context carried by the new params (done unconditionally)
+    this->mctx = mctx;
+
+    bool res = true;
+    // the index tensors must have one entry per token of the new ubatch
+    res &= self_k_idxs_mla->ne[0] == params.ubatch.n_tokens;
+    res &= self_k_idxs_lid->ne[0] == params.ubatch.n_tokens;
+    // each mask is validated against its own cache context
+    res &= can_reuse_kq_mask(self_kq_mask_mla, mctx->get_mla(), params.ubatch, params.cparams);
+    res &= can_reuse_kq_mask(self_kq_mask_lid, mctx->get_lid(), params.ubatch, params.cparams);
+
+    return res;
+}
+
 void llm_graph_input_attn_kv_iswa::set_input(const llama_ubatch * ubatch) {
     // base tensors may not be allocated if there are no non-SWA attention layers
     if (self_k_idxs && self_k_idxs->buffer) {
@@ -2396,6 +2425,82 @@ ggml_tensor * llm_graph_context::build_attn(
     return cur;
 }
 
+ggml_tensor * llm_graph_context::build_attn(
+        llm_graph_input_attn_k_dsa * inp,
+        ggml_tensor * wo,
+        ggml_tensor * wo_b,
+        ggml_tensor * wo_s,
+        ggml_tensor * q_cur,
+        ggml_tensor * k_cur,
+        ggml_tensor * v_cur, // only v_cur->ne[0] is read below (width of the V view into the cached K)
+        ggml_tensor * kq_b,
+        ggml_tensor * sinks,
+        ggml_tensor * v_mla,
+        ggml_tensor * top_k, // KV indices to keep; every other position is masked out
+            float     kq_scale,
+            int       il) const {
+    // these nodes are added to the graph together so that they are not reordered
+    // by doing so, the number of splits in the graph is reduced
+    // expand k later to enable rope fusion which directly writes into k-v cache
+    ggml_build_forward_expand(gf, q_cur);
+    ggml_build_forward_expand(gf, v_cur);
+    ggml_build_forward_expand(gf, k_cur);
+    // this function only touches the MLA cache; the indexer (lid) cache is not used here
+    const auto * mctx_cur = inp->mctx->get_mla();
+
+    // store to KV cache
+    {
+        const auto & k_idxs = inp->get_k_idxs_mla();
+
+        ggml_build_forward_expand(gf, mctx_cur->cpy_k(ctx0, k_cur, k_idxs, il));
+    }
+
+    const auto & kq_mask = inp->get_kq_mask_mla();
+
+    // prepare new kq mask - starts filled with -INFINITY
+    ggml_tensor * kq_mask_all = ggml_fill(ctx0, kq_mask, -INFINITY);
+
+    // reshape KQ mask into tensor with rows of size 1:
+    // [n_kv, n_batch, 1, n_stream] -> [1, n_kv, n_batch, n_stream]
+    kq_mask_all = ggml_view_4d(ctx0, kq_mask_all, 1, kq_mask_all->ne[0], kq_mask_all->ne[1], kq_mask_all->ne[3], kq_mask_all->nb[0], kq_mask_all->nb[1], kq_mask_all->nb[2], 0);
+
+    // reshape top_k indices: [n_top_k, n_batch, 1, n_stream] -> [n_top_k, n_batch, n_stream, 1]
+    ggml_tensor * top_k_3d = ggml_view_4d(ctx0, top_k, top_k->ne[0], top_k->ne[1], top_k->ne[3], 1, top_k->nb[1], top_k->nb[2], top_k->ne[3]*top_k->nb[3], 0);
+
+    // prepare zero-filled tensor with rows of size 1: [1, n_top_k, n_batch, n_stream]
+    // this will be our source of zero values for unmasking top k mask elements
+    ggml_tensor * zeros = ggml_new_tensor_4d(ctx0, GGML_TYPE_F32, 1, top_k_3d->ne[0], top_k_3d->ne[1], top_k_3d->ne[2]);
+    zeros = ggml_fill(ctx0, zeros, 0.0f);
+
+    // modify KQ mask by unmasking elements that are in top_k indices
+    // ggml_set_rows([1, n_kv, n_batch, n_stream], [1, n_top_k, n_batch, n_stream], [n_top_k, n_batch, n_stream, 1])
+    ggml_tensor * kq_mask_top_k = ggml_set_rows(ctx0, kq_mask_all, zeros, top_k_3d);
+
+    // reshape to restore the original shape of KQ mask:
+    // [1, n_kv, n_batch, n_stream] -> [n_kv, n_batch, 1, n_stream]
+    kq_mask_top_k = ggml_view_4d(ctx0, kq_mask_top_k, kq_mask_top_k->ne[1], kq_mask_top_k->ne[2], 1, kq_mask_top_k->ne[3], kq_mask_top_k->nb[2], kq_mask_top_k->nb[3], kq_mask_top_k->nb[3], 0);
+
+    // combine with the original kq mask (additive: a position must be unmasked in both to stay unmasked)
+    kq_mask_top_k = ggml_add(ctx0, kq_mask_top_k, kq_mask);
+    // V is not cached separately: it is a view over the first v_cur->ne[0] elements of each cached K row
+    ggml_tensor * q = q_cur;
+    ggml_tensor * k = mctx_cur->get_k(ctx0, il);
+    ggml_tensor * v = ggml_view_4d(ctx0, k, v_cur->ne[0], k->ne[1], k->ne[2], k->ne[3], k->nb[1], k->nb[2], k->nb[3], 0);
+
+    ggml_tensor * cur = build_attn_mha(q, k, v, kq_b, kq_mask_top_k, sinks, v_mla, kq_scale, il);
+    cb(cur, "kqv_out", il);
+    // optional output projection (wo, with wo_s) followed by an optional bias (wo_b)
+    if (wo) {
+        cur = build_lora_mm(wo, cur, wo_s);
+    }
+
+    if (wo_b) {
+        cur = ggml_add(ctx0, cur, wo_b);
+    }
+
+    return cur;
+}
+
 ggml_tensor * llm_graph_context::build_attn(
         llm_graph_input_attn_kv_iswa * inp,
         ggml_tensor * wo,
@@ -2542,6 +2647,30 @@ ggml_tensor * llm_graph_context::build_attn(
     return cur;
 }
 
+llm_graph_input_attn_k_dsa * llm_graph_context::build_attn_inp_k_dsa() const {
+    const auto * mctx_cur = static_cast<const llama_kv_cache_dsa_context *>(mctx);
+    // the input object holds one set of tensors per underlying cache (mla and lid)
+    auto inp = std::make_unique<llm_graph_input_attn_k_dsa>(hparams, cparams, mctx_cur);
+
+    {
+        inp->self_k_idxs_mla = mctx_cur->get_mla()->build_input_k_idxs(ctx0, ubatch);
+        // the "_cnv" mask is an F16 cast when flash attention is enabled, otherwise an alias of the F32 mask
+        inp->self_kq_mask_mla = build_attn_inp_kq_mask(ctx0, mctx_cur->get_mla(), ubatch, cparams);
+        inp->self_kq_mask_mla_cnv = cparams.flash_attn ? ggml_cast(ctx0, inp->self_kq_mask_mla, GGML_TYPE_F16) : inp->self_kq_mask_mla;
+    }
+
+    {
+        inp->self_k_idxs_lid = mctx_cur->get_lid()->build_input_k_idxs(ctx0, ubatch);
+
+        inp->self_kq_mask_lid = build_attn_inp_kq_mask(ctx0, mctx_cur->get_lid(), ubatch, cparams);
+        inp->self_kq_mask_lid_cnv = cparams.flash_attn ? ggml_cast(ctx0, inp->self_kq_mask_lid, GGML_TYPE_F16) : inp->self_kq_mask_lid;
+        // rotation input is built for the indexer cache only
+        inp->self_k_rot_lid = mctx_cur->get_lid()->build_input_k_rot(ctx0);
+    }
+    // ownership moves to the graph result; the returned raw pointer is non-owning
+    return (llm_graph_input_attn_k_dsa *) res->add_input(std::move(inp));
+}
+
 // TODO: maybe separate the inner implementation into a separate function
 //       like with the non-sliding window equivalent
 //       once sliding-window hybrid caches are a thing.
diff --git a/src/llama-graph.h b/src/llama-graph.h
index 9f4816959fe..d07a084a8d6 100644
--- a/src/llama-graph.h
+++ b/src/llama-graph.h
@@ -22,6 +22,7 @@ struct llama_layer;
 struct llama_memory_context_i;
 
 class llama_kv_cache_context;
+class llama_kv_cache_dsa_context;
 class llama_kv_cache_iswa_context;
 class llama_memory_recurrent_context;
 class llama_memory_hybrid_context;
@@ -373,6 +374,44 @@ class llm_graph_input_attn_k : public llm_graph_input_i {
     const llama_kv_cache_context * mctx;
 };
 
+class llm_graph_input_attn_k_dsa : public llm_graph_input_i {
+public:
+    llm_graph_input_attn_k_dsa(
+            const llama_hparams & hparams,
+            const llama_cparams & cparams,
+            const llama_kv_cache_dsa_context * mctx) :
+        hparams(hparams),
+        cparams(cparams),
+        mctx(mctx) {
+    }
+    ~llm_graph_input_attn_k_dsa() = default;
+
+    void set_input(const llama_ubatch * ubatch) override;
+
+    bool can_reuse(const llm_graph_params & params) override;
+    // "mla" = tensors for the main key cache, "lid" = tensors for the lightning indexer key cache
+    ggml_tensor * get_k_idxs_mla() const { return self_k_idxs_mla; }
+    ggml_tensor * get_k_idxs_lid() const { return self_k_idxs_lid; }
+    // note the asymmetry: mla returns the converted (_cnv) mask, lid returns the unconverted one
+    ggml_tensor * get_kq_mask_mla() const { return self_kq_mask_mla_cnv; }
+    ggml_tensor * get_kq_mask_lid() const { return self_kq_mask_lid; }
+
+    ggml_tensor * self_k_idxs_mla = nullptr; // I64 [n_batch]
+    ggml_tensor * self_k_idxs_lid = nullptr; // I64 [n_batch]
+
+    ggml_tensor * self_kq_mask_mla     = nullptr; // F32 [n_kv, n_batch/n_stream, 1, n_stream]
+    ggml_tensor * self_kq_mask_mla_cnv = nullptr; //     [n_kv, n_batch/n_stream, 1, n_stream]
+    ggml_tensor * self_kq_mask_lid     = nullptr; // F32 [n_kv, n_batch/n_stream, 1, n_stream]
+    ggml_tensor * self_kq_mask_lid_cnv = nullptr; //     [n_kv, n_batch/n_stream, 1, n_stream]
+
+    ggml_tensor * self_k_rot_lid = nullptr; // rotation input, indexer cache only
+
+    const llama_hparams hparams; // stored by value (copy of the constructor argument)
+    const llama_cparams cparams; // stored by value (copy of the constructor argument)
+
+    const llama_kv_cache_dsa_context * mctx; // non-owning; replaced by can_reuse()
+};
+
 class llm_graph_input_attn_kv_iswa : public llm_graph_input_i {
 public:
     llm_graph_input_attn_kv_iswa(
@@ -973,6 +1012,23 @@ struct llm_graph_context {
                   float   kq_scale,
                     int   il) const;
 
+    llm_graph_input_attn_k_dsa * build_attn_inp_k_dsa() const;
+
+    ggml_tensor * build_attn(
+            llm_graph_input_attn_k_dsa * inp,
+            ggml_tensor * wo,
+            ggml_tensor * wo_b,
+            ggml_tensor * wo_s,
+            ggml_tensor * q_cur, // [n_embd_head_q, n_head_q, n_tokens]
+            ggml_tensor * k_cur, // [n_embd_head_k, n_head_k, n_tokens]
+            ggml_tensor * v_cur, // [n_embd_head_v, n_head_v, n_tokens]
+            ggml_tensor * kq_b,
+            ggml_tensor * sinks, // [n_head_q]
+            ggml_tensor * v_mla, // [n_embd_head_v_mla, n_embd_head_v, n_head_v]
+            ggml_tensor * top_k, // [n_indexer_top_k, n_tokens]
+                  float   kq_scale,
+                    int   il) const;
+
     llm_graph_input_attn_kv_iswa * build_attn_inp_kv_iswa() const;
 
     // note: if k_cur or v_cur are not provided, they will not be stored in the memory
diff --git a/src/llama-kv-cache-dsa.cpp b/src/llama-kv-cache-dsa.cpp
new file mode 100644
index 00000000000..e44004b5586
--- /dev/null
+++ b/src/llama-kv-cache-dsa.cpp
@@ -0,0 +1,261 @@
+#include "llama-kv-cache-dsa.h"
+
+#include "llama-impl.h"
+#include "llama-batch.h"
+#include "llama-model.h"
+
+#include <algorithm>
+#include <cassert>
+
+//
+// llama_kv_cache_dsa
+//
+
+llama_kv_cache_dsa::llama_kv_cache_dsa(
+        const llama_model & model,
+                ggml_type   type_k,
+                ggml_type   type_v,
+                     bool   v_trans,
+                     bool   offload,
+                     bool   unified,
+                 uint32_t   kv_size,
+                 uint32_t   n_seq_max,
+                 uint32_t   n_pad,
+                 uint32_t   n_swa,
+           llama_swa_type   swa_type,
+    const layer_filter_cb & filter,
+    const  layer_reuse_cb & reuse) :
+    hparams_lid(model.hparams), n_stream(unified ? 1 : n_seq_max) {
+    // both caches below receive identical arguments except for the hparams they are built from
+    LLAMA_LOG_INFO("%s: creating main KV cache, size = %u cells\n", __func__, kv_size);
+
+    kv_mla = std::make_unique<llama_kv_cache>(
+            model, model.hparams, type_k, type_v,
+            v_trans, offload, unified, kv_size, n_seq_max, n_pad,
+            n_swa, swa_type, filter, reuse);
+
+    // we use llama_kv_cache for caching indexer keys
+    // by hand-tweaking some hparams we fool it to create
+    // indexer key cache tensors with correct dimensions
+    // https://github.com/ggml-org/llama.cpp/pull/21149#discussion_r3015940823
+
+    // DSA lightning indexer uses MQA with single key head
+    std::fill(hparams_lid.n_head_kv_arr.begin(), hparams_lid.n_head_kv_arr.end(), 1);
+    hparams_lid.n_embd_head_k_full = model.hparams.indexer_head_size;
+    hparams_lid.rope_type          = LLAMA_ROPE_TYPE_NEOX;
+    // hparams_lid is a member, so the reference handed to kv_lid below stays valid for its lifetime
+    LLAMA_LOG_INFO("%s: creating indexer KV cache, size = %u cells\n", __func__, kv_size);
+
+    kv_lid = std::make_unique<llama_kv_cache>(
+            model, hparams_lid, type_k, type_v,
+            v_trans, offload, unified, kv_size, n_seq_max, n_pad,
+            n_swa, swa_type, filter, reuse);
+}
+
+void llama_kv_cache_dsa::clear(bool data) {
+    kv_mla->clear(data); // main key cache
+    kv_lid->clear(data); // lightning indexer key cache
+}
+
+bool llama_kv_cache_dsa::seq_rm(llama_seq_id seq_id, llama_pos p0, llama_pos p1) {
+    bool res = true;
+    // non-short-circuit `&`: the removal is always attempted on both caches
+    res = res & kv_mla->seq_rm(seq_id, p0, p1);
+    res = res & kv_lid->seq_rm(seq_id, p0, p1);
+
+    return res;
+}
+
+void llama_kv_cache_dsa::seq_cp(llama_seq_id seq_id_src, llama_seq_id seq_id_dst, llama_pos p0, llama_pos p1) {
+    kv_mla->seq_cp(seq_id_src, seq_id_dst, p0, p1); // mirrored on both caches
+    kv_lid->seq_cp(seq_id_src, seq_id_dst, p0, p1);
+}
+
+void llama_kv_cache_dsa::seq_keep(llama_seq_id seq_id) {
+    kv_mla->seq_keep(seq_id); // mirrored on both caches
+    kv_lid->seq_keep(seq_id);
+}
+
+void llama_kv_cache_dsa::seq_add(llama_seq_id seq_id, llama_pos p0, llama_pos p1, llama_pos shift) {
+    kv_mla->seq_add(seq_id, p0, p1, shift); // mirrored on both caches
+    kv_lid->seq_add(seq_id, p0, p1, shift);
+}
+
+void llama_kv_cache_dsa::seq_div(llama_seq_id seq_id, llama_pos p0, llama_pos p1, int d) {
+    kv_mla->seq_div(seq_id, p0, p1, d); // mirrored on both caches
+    kv_lid->seq_div(seq_id, p0, p1, d);
+}
+
+llama_pos llama_kv_cache_dsa::seq_pos_min(llama_seq_id seq_id) const {
+    return kv_mla->seq_pos_min(seq_id); // only the main cache is queried; NOTE(review): presumably kv_lid always agrees - confirm
+}
+
+llama_pos llama_kv_cache_dsa::seq_pos_max(llama_seq_id seq_id) const {
+    return kv_mla->seq_pos_max(seq_id); // only the main cache is queried; NOTE(review): presumably kv_lid always agrees - confirm
+}
+
+std::map<ggml_backend_buffer_type_t, size_t> llama_kv_cache_dsa::memory_breakdown() const {
+    std::map<ggml_backend_buffer_type_t, size_t> mb = kv_mla->memory_breakdown();
+    for (const auto & buft_size : kv_lid->memory_breakdown()) {
+        mb[buft_size.first] += buft_size.second; // sum per buffer type; keys missing from the main cache start at 0
+    }
+    return mb;
+}
+
+llama_memory_context_ptr llama_kv_cache_dsa::init_batch(
+            llama_batch_allocr & balloc,
+            uint32_t n_ubatch,
+            bool embd_all) {
+    GGML_UNUSED(embd_all);
+    // single-pass do/while: any `break` falls through to the FAILED_PREPARE return at the bottom
+    do {
+        balloc.split_reset();
+
+        std::vector<llama_ubatch> ubatches;
+        while (true) {
+            auto ubatch = n_stream == 1 ? balloc.split_simple(n_ubatch) : balloc.split_equal(n_ubatch, true);
+
+            if (ubatch.n_tokens == 0) {
+                break;
+            }
+
+            ubatches.push_back(std::move(ubatch)); // NOLINT
+        }
+
+        if (balloc.get_n_used() < balloc.get_n_tokens()) {
+            // failed to find a suitable split
+            break;
+        }
+        // an empty result from prepare() is treated as failure for either cache
+        auto sinfos_mla = kv_mla->prepare(ubatches);
+        if (sinfos_mla.empty()) {
+            break;
+        }
+
+        auto sinfos_lid = kv_lid->prepare(ubatches);
+        if (sinfos_lid.empty()) {
+            break;
+        }
+        // both caches were prepared from the same ubatches, so the results are expected to line up
+        assert(sinfos_mla.size() == sinfos_lid.size());
+
+        return std::make_unique<llama_kv_cache_dsa_context>(
+                this, std::move(sinfos_mla), std::move(sinfos_lid), std::move(ubatches));
+    } while (false);
+
+    return std::make_unique<llama_kv_cache_dsa_context>(LLAMA_MEMORY_STATUS_FAILED_PREPARE);
+}
+
+llama_memory_context_ptr llama_kv_cache_dsa::init_full() {
+    return std::make_unique<llama_kv_cache_dsa_context>(this); // wraps init_full() of both caches
+}
+
+llama_memory_context_ptr llama_kv_cache_dsa::init_update(llama_context * lctx, bool optimize) {
+    return std::make_unique<llama_kv_cache_dsa_context>(this, lctx, optimize); // wraps init_update() of both caches
+}
+
+bool llama_kv_cache_dsa::get_can_shift() const {
+    return kv_mla->get_can_shift() && // both caches must support shifting ...
+           kv_lid->get_can_shift() &&
+           kv_mla->get_size() == kv_lid->get_size(); // ... and have the same number of cells
+}
+
+void llama_kv_cache_dsa::state_write(llama_io_write_i & io, llama_seq_id seq_id, llama_state_seq_flags flags) const {
+    kv_mla->state_write(io, seq_id, flags); // order (mla, then lid) must match state_read()
+    kv_lid->state_write(io, seq_id, flags);
+}
+
+void llama_kv_cache_dsa::state_read(llama_io_read_i & io, llama_seq_id seq_id, llama_state_seq_flags flags) {
+    kv_mla->state_read(io, seq_id, flags); // order (mla, then lid) must match state_write()
+    kv_lid->state_read(io, seq_id, flags);
+}
+
+llama_kv_cache * llama_kv_cache_dsa::get_mla() const {
+    return kv_mla.get(); // non-owning pointer to the main key cache
+}
+
+llama_kv_cache * llama_kv_cache_dsa::get_lid() const {
+    return kv_lid.get(); // non-owning pointer to the lightning indexer key cache
+}
+
+//
+// llama_kv_cache_dsa_context
+//
+
+llama_kv_cache_dsa_context::llama_kv_cache_dsa_context(llama_memory_status status) : status(status) {} // error context: child contexts are left default-constructed
+
+llama_kv_cache_dsa_context::llama_kv_cache_dsa_context(
+        llama_kv_cache_dsa * kv) :
+    ctx_mla(kv->get_mla()->init_full()),
+    ctx_lid(kv->get_lid()->init_full()),
+    status(llama_memory_status_combine(ctx_mla->get_status(), ctx_lid->get_status())) { // combined status of both child contexts
+}
+
+llama_kv_cache_dsa_context::llama_kv_cache_dsa_context(
+        llama_kv_cache_dsa * kv,
+        llama_context * lctx,
+        bool optimize) :
+    ctx_mla(kv->get_mla()->init_update(lctx, optimize)),
+    ctx_lid(kv->get_lid()->init_update(lctx, optimize)),
+    status(llama_memory_status_combine(ctx_mla->get_status(), ctx_lid->get_status())) { // combined status of both child contexts
+}
+
+llama_kv_cache_dsa_context::llama_kv_cache_dsa_context(
+        llama_kv_cache_dsa * kv,
+        slot_info_vec_t sinfos_mla,
+        slot_info_vec_t sinfos_lid,
+        std::vector<llama_ubatch> ubatches) :
+    ubatches(std::move(ubatches)), // initialized first: the child contexts below read this->ubatches
+    // note: here we copy the ubatches. not sure if this is ideal
+    ctx_mla(new llama_kv_cache_context(kv->get_mla(), std::move(sinfos_mla), this->ubatches)),
+    ctx_lid(new llama_kv_cache_context(kv->get_lid(), std::move(sinfos_lid), this->ubatches)),
+    status(llama_memory_status_combine(ctx_mla->get_status(), ctx_lid->get_status())) { // combined status of both child contexts
+}
+
+llama_kv_cache_dsa_context:: ~llama_kv_cache_dsa_context() = default; // defaulted out-of-line, in the .cpp rather than the header
+
+bool llama_kv_cache_dsa_context::next() {
+    assert(status == LLAMA_MEMORY_STATUS_SUCCESS);
+    // advance both child contexts in lockstep; their return values are ignored
+    ctx_mla->next();
+    ctx_lid->next();
+    // this context's own ubatch index decides whether anything is left to process
+    if (++i_next >= ubatches.size()) {
+        return false;
+    }
+
+    return true;
+}
+
+bool llama_kv_cache_dsa_context::apply() {
+    assert(!llama_memory_status_is_fail(status));
+
+    bool res = true;
+    // non-short-circuit `&`: apply() always runs on both child contexts
+    res = res & ctx_mla->apply();
+    res = res & ctx_lid->apply();
+
+    return res;
+}
+
+llama_memory_status llama_kv_cache_dsa_context::get_status() const {
+    return status; // fixed at construction (const member)
+}
+
+const llama_ubatch & llama_kv_cache_dsa_context::get_ubatch() const {
+    assert(status == LLAMA_MEMORY_STATUS_SUCCESS);
+    // current ubatch; i_next is advanced by next()
+    return ubatches[i_next];
+}
+
+const llama_kv_cache_context * llama_kv_cache_dsa_context::get_mla() const {
+    assert(status == LLAMA_MEMORY_STATUS_SUCCESS);
+    // ctx_mla is held as a generic memory context pointer; downcast to the concrete type
+    return static_cast<const llama_kv_cache_context *>(ctx_mla.get());
+}
+
+const llama_kv_cache_context * llama_kv_cache_dsa_context::get_lid()  const {
+    assert(status == LLAMA_MEMORY_STATUS_SUCCESS);
+    // ctx_lid is held as a generic memory context pointer; downcast to the concrete type
+    return static_cast<const llama_kv_cache_context *>(ctx_lid.get());
+}
diff --git a/src/llama-kv-cache-dsa.h b/src/llama-kv-cache-dsa.h
new file mode 100644
index 00000000000..e2b330993b8
--- /dev/null
+++ b/src/llama-kv-cache-dsa.h
@@ -0,0 +1,138 @@
+#pragma once
+
+#include "llama-kv-cache.h"
+
+#include <vector>
+
+//
+// llama_kv_cache_dsa
+//
+
+// utilizes two instances of llama_kv_cache:
+// - the first instance is for caching key tensors of the model,
+// - the second instance is for caching lightning indexer key tensors
+
+class llama_kv_cache_dsa : public llama_memory_i {
+public:
+    llama_kv_cache_dsa(
+            const llama_model & model,
+                    ggml_type   type_k,
+                    ggml_type   type_v,
+                         bool   v_trans,
+                         bool   offload,
+                         bool   unified,
+                     uint32_t   kv_size,
+                     uint32_t   n_seq_max,
+                     uint32_t   n_pad,
+                     uint32_t   n_swa,
+               llama_swa_type   swa_type,
+        const layer_filter_cb & filter,
+        const  layer_reuse_cb & reuse);
+
+    ~llama_kv_cache_dsa() = default;
+
+    //
+    // llama_memory_i
+    //
+
+    llama_memory_context_ptr init_batch(
+            llama_batch_allocr & balloc,
+            uint32_t n_ubatch,
+            bool embd_all) override;
+
+    llama_memory_context_ptr init_full() override;
+
+    llama_memory_context_ptr init_update(llama_context * lctx, bool optimize) override;
+
+    bool get_can_shift() const override;
+
+    void clear(bool data) override;
+    // sequence operations are applied to both underlying caches
+    bool seq_rm  (llama_seq_id seq_id,                              llama_pos p0, llama_pos p1) override;
+    void seq_cp  (llama_seq_id seq_id_src, llama_seq_id seq_id_dst, llama_pos p0, llama_pos p1) override;
+    void seq_keep(llama_seq_id seq_id)                                                          override;
+    void seq_add (llama_seq_id seq_id,                              llama_pos p0, llama_pos p1, llama_pos shift) override;
+    void seq_div (llama_seq_id seq_id,                              llama_pos p0, llama_pos p1, int d) override;
+    // position queries are answered by the main (mla) cache only
+    llama_pos seq_pos_min(llama_seq_id seq_id) const override;
+    llama_pos seq_pos_max(llama_seq_id seq_id) const override;
+
+    std::map<ggml_backend_buffer_type_t, size_t> memory_breakdown() const override;
+
+    // state write/load
+
+    void state_write(llama_io_write_i & io, llama_seq_id seq_id = -1, llama_state_seq_flags flags = 0) const override;
+    void state_read (llama_io_read_i  & io, llama_seq_id seq_id = -1, llama_state_seq_flags flags = 0) override;
+
+    //
+    // llama_kv_cache_dsa specific API
+    //
+
+    llama_kv_cache * get_mla() const; // cache for the key tensors of the model
+    llama_kv_cache * get_lid() const; // cache for the lightning indexer key tensors
+
+private:
+    // we keep indexer KV cache hparams instance here as llama_kv_cache stores only reference to it
+    llama_hparams hparams_lid;
+    const uint32_t n_stream  = 1; // 1 when unified, n_seq_max otherwise (set in the constructor)
+
+    std::unique_ptr<llama_kv_cache> kv_mla;
+    std::unique_ptr<llama_kv_cache> kv_lid;
+};
+
+class llama_kv_cache_dsa_context : public llama_memory_context_i {
+public:
+    using slot_info_vec_t = llama_kv_cache::slot_info_vec_t;
+
+    // used for errors
+    llama_kv_cache_dsa_context(llama_memory_status status);
+
+    // used to create a full-cache context
+    llama_kv_cache_dsa_context(
+            llama_kv_cache_dsa * kv);
+
+    // used to create an update context
+    llama_kv_cache_dsa_context(
+            llama_kv_cache_dsa * kv,
+            llama_context * lctx,
+            bool optimize);
+
+    // used to create a batch processing context from a batch
+    llama_kv_cache_dsa_context(
+            llama_kv_cache_dsa * kv,
+            slot_info_vec_t sinfos_mla,
+            slot_info_vec_t sinfos_lid,
+            std::vector<llama_ubatch> ubatches);
+
+    virtual ~llama_kv_cache_dsa_context();
+
+    //
+    // llama_memory_context_i
+    //
+
+    bool next()  override;
+    bool apply() override;
+
+    llama_memory_status  get_status() const override;
+    const llama_ubatch & get_ubatch() const override;
+
+    //
+    // llama_kv_cache_dsa_context specific API
+    //
+
+    const llama_kv_cache_context * get_mla() const; // context of the main key cache
+    const llama_kv_cache_context * get_lid() const; // context of the lightning indexer key cache
+
+private:
+    //llama_kv_cache_dsa * kv;
+
+    // the index of the next ubatch to process
+    size_t i_next = 0;
+    // note: must stay declared before ctx_mla/ctx_lid - the batch constructor passes it to them
+    std::vector<llama_ubatch> ubatches;
+
+    const llama_memory_context_ptr ctx_mla;
+    const llama_memory_context_ptr ctx_lid;
+
+    const llama_memory_status status; // combined status of the child contexts, or the error passed in
+};
diff --git a/src/llama-kv-cache-iswa.cpp b/src/llama-kv-cache-iswa.cpp
index 26e2cb4270b..9b9f1790363 100644
--- a/src/llama-kv-cache-iswa.cpp
+++ b/src/llama-kv-cache-iswa.cpp
@@ -60,14 +60,14 @@ llama_kv_cache_iswa::llama_kv_cache_iswa(
     LLAMA_LOG_INFO("%s: creating non-SWA KV cache, size = %u cells\n", __func__, size_base);
 
     kv_base = std::make_unique<llama_kv_cache>(
-            model, type_k, type_v,
+            model, hparams, type_k, type_v,
             v_trans, offload, unified, size_base, n_seq_max, n_pad,
             0, LLAMA_SWA_TYPE_NONE, filter_base, reuse);
 
     LLAMA_LOG_INFO("%s: creating     SWA KV cache, size = %u cells\n", __func__, size_swa);
 
     kv_swa = std::make_unique<llama_kv_cache>(
-            model, type_k, type_v,
+            model, hparams, type_k, type_v,
             v_trans, offload, unified, size_swa, n_seq_max, n_pad,
             hparams.n_swa, hparams.swa_type, filter_swa, reuse);
 }
diff --git a/src/llama-kv-cache.cpp b/src/llama-kv-cache.cpp
index 2356d612b1b..ac11f96c22d 100644
--- a/src/llama-kv-cache.cpp
+++ b/src/llama-kv-cache.cpp
@@ -79,6 +79,7 @@ static ggml_tensor * ggml_mul_mat_aux(
 
 llama_kv_cache::llama_kv_cache(
         const llama_model & model,
+        const llama_hparams & hparams,
                 ggml_type   type_k,
                 ggml_type   type_v,
                      bool   v_trans,
@@ -91,7 +92,7 @@ llama_kv_cache::llama_kv_cache(
            llama_swa_type   swa_type,
     const layer_filter_cb & filter,
     const  layer_reuse_cb & reuse) :
-    model(model), hparams(model.hparams), v_trans(v_trans),
+    model(model), hparams(hparams), v_trans(v_trans),
     n_seq_max(n_seq_max), n_stream(unified ? 1 : n_seq_max), n_pad(n_pad), n_swa(n_swa), swa_type(swa_type) {
 
     GGML_ASSERT(kv_size % n_pad == 0);
@@ -253,7 +254,7 @@ llama_kv_cache::llama_kv_cache(
     // allocate tensors and initialize the buffers to avoid NaNs in the padding
     for (auto & [buft, ctx] : ctx_map) {
         ggml_backend_buffer_t buf;
-        if (model.hparams.no_alloc) {
+        if (hparams.no_alloc) {
             buf = ggml_backend_buft_alloc_buffer(buft, /*size =*/ 0); // dummy buffer
             for (ggml_tensor * t = ggml_get_first_tensor(ctx.get()); t != nullptr; t = ggml_get_next_tensor(ctx.get(), t)) {
                 t->buffer = buf; // set dummy buffer for KV cache so that the backend scheduler won't try to allocate it
@@ -293,6 +294,11 @@ llama_kv_cache::llama_kv_cache(
         ggml_is_quantized(type_k) &&
         hparams.n_embd_head_k() % 64 == 0;
 
+    // always create Hadamard rotation tensors for DeepSeek V3.2 DSA lightning indexer
+    if (model.arch == LLM_ARCH_DEEPSEEK32 && hparams.n_embd_head_k_full == hparams.indexer_head_size) {
+        attn_rot_k = true;
+    }
+
     attn_rot_v =
         !attn_rot_disable &&
         n_embd_head_v_all > 0 &&
diff --git a/src/llama-kv-cache.h b/src/llama-kv-cache.h
index 0b62dc7b232..649269af6dd 100644
--- a/src/llama-kv-cache.h
+++ b/src/llama-kv-cache.h
@@ -93,8 +93,12 @@ class llama_kv_cache : public llama_memory_i {
 
     using slot_info_vec_t = std::vector<slot_info>;
 
+    // TODO: refactor the memory instances to not depend on `llama_model`
+    //       instead pass all necessary info (e.g. hparams, dev layers, arch, etc.) directly
+    //       likely through `struct llama_memory_params`
     llama_kv_cache(
             const llama_model & model,
+            const llama_hparams & hparams,
                     ggml_type   type_k,
                     ggml_type   type_v,
                          bool   v_trans,
diff --git a/src/llama-memory-hybrid.cpp b/src/llama-memory-hybrid.cpp
index 33b3b395e0c..6bd2ec18ce3 100644
--- a/src/llama-memory-hybrid.cpp
+++ b/src/llama-memory-hybrid.cpp
@@ -33,6 +33,7 @@ llama_memory_hybrid::llama_memory_hybrid(
     hparams(model.hparams),
     mem_attn(new llama_kv_cache(
         model,
+        model.hparams,
         type_k,
         type_v,
         v_trans,
diff --git a/src/llama-model.cpp b/src/llama-model.cpp
index 0c3e03a61dc..a8323c8fb1e 100644
--- a/src/llama-model.cpp
+++ b/src/llama-model.cpp
@@ -10,6 +10,7 @@
 
 #include "llama-kv-cache.h"
 #include "llama-kv-cache-iswa.h"
+#include "llama-kv-cache-dsa.h"
 #include "llama-memory-hybrid.h"
 #include "llama-memory-hybrid-iswa.h"
 #include "llama-memory-recurrent.h"
@@ -172,6 +173,8 @@ static llama_model * llama_model_mapping(llm_arch arch, const llama_model_params
             return new llama_model_deepseek2(params);
         case LLM_ARCH_DEEPSEEK2OCR:
             return new llama_model_deepseek2ocr(params);
+        case LLM_ARCH_DEEPSEEK32:
+            return new llama_model_deepseek32(params);
         case LLM_ARCH_GLM_DSA:
             return new llama_model_glm_dsa(params);
         case LLM_ARCH_MISTRAL4:
@@ -779,6 +782,7 @@ const char * llm_type_name(llm_type type) {
         case LLM_TYPE_310B_A15B:     return "310B.A15B";
         case LLM_TYPE_355B_A32B:     return "355B.A32B";
         case LLM_TYPE_397B_A17B:     return "397B.A17B";
+        case LLM_TYPE_685B_A37B:     return "685B.A37B";
         case LLM_TYPE_744B_A40B:     return "744B.A40B";
         case LLM_TYPE_E2B:           return "E2B";
         case LLM_TYPE_E4B:           return "E4B";
@@ -1769,7 +1773,7 @@ void llama_model::print_info() const {
             LLAMA_LOG_INFO("%s: expert_weights_scale  = %.1f\n",   __func__, hparams.expert_weights_scale);
         }
 
-        if (arch == LLM_ARCH_DEEPSEEK2 || arch == LLM_ARCH_DEEPSEEK2OCR || arch == LLM_ARCH_GLM_DSA || arch == LLM_ARCH_MISTRAL4) {
+        if (arch == LLM_ARCH_DEEPSEEK2 || arch == LLM_ARCH_DEEPSEEK2OCR || arch == LLM_ARCH_DEEPSEEK32 || arch == LLM_ARCH_GLM_DSA || arch == LLM_ARCH_MISTRAL4) {
             LLAMA_LOG_INFO("%s: n_layer_dense_lead    = %d\n",     __func__, hparams.n_layer_dense_lead);
             LLAMA_LOG_INFO("%s: n_lora_q              = %d\n",     __func__, hparams.n_lora_q);
             LLAMA_LOG_INFO("%s: n_lora_kv             = %d\n",     __func__, hparams.n_lora_kv);
@@ -1957,6 +1961,23 @@ llama_memory_i * llama_model::create_memory(const llama_memory_params & params,
             {
                 res = nullptr;
             } break;
+        case LLM_ARCH_DEEPSEEK32:
+            {
+                res = new llama_kv_cache_dsa(
+                        *this,
+                        params.type_k,
+                        params.type_v,
+                        !cparams.flash_attn,
+                        cparams.offload_kqv,
+                        cparams.kv_unified,
+                        cparams.n_ctx_seq,
+                        cparams.n_seq_max,
+                        1,
+                        hparams.n_swa,
+                        hparams.swa_type,
+                        nullptr,
+                        nullptr);
+            } break;
         // Models that need standard caching should rely on recurrent/hybrid
         // checks
         default:
@@ -2083,6 +2104,7 @@ llama_memory_i * llama_model::create_memory(const llama_memory_params & params,
 
                         res = new llama_kv_cache(
                                 *this,
+                                hparams,
                                 params.type_k,
                                 params.type_v,
                                 !cparams.flash_attn,
@@ -2272,6 +2294,7 @@ llama_rope_type llama_model_rope_type(const llama_model * model) {
         case LLM_ARCH_DEEPSEEK:
         case LLM_ARCH_DEEPSEEK2:
         case LLM_ARCH_DEEPSEEK2OCR:
+        case LLM_ARCH_DEEPSEEK32:
         case LLM_ARCH_PLM:
         case LLM_ARCH_CHATGLM:
         case LLM_ARCH_GRANITE:
diff --git a/src/llama-model.h b/src/llama-model.h
index b797b8966ac..743feb970d9 100644
--- a/src/llama-model.h
+++ b/src/llama-model.h
@@ -137,6 +137,7 @@ enum llm_type {
     LLM_TYPE_310B_A15B, // /MiMo-V2-Flash
     LLM_TYPE_355B_A32B, // GLM-4.5
     LLM_TYPE_397B_A17B, // Qwen3.5
+    LLM_TYPE_685B_A37B, // DeepSeek V3.2
     LLM_TYPE_744B_A40B, // GLM-5
     LLM_TYPE_E2B,
     LLM_TYPE_E4B,
diff --git a/src/models/deepseek32.cpp b/src/models/deepseek32.cpp
new file mode 100644
index 00000000000..c92ab60d166
--- /dev/null
+++ b/src/models/deepseek32.cpp
@@ -0,0 +1,503 @@
+#include "models.h"
+
+#include "llama-kv-cache.h"
+#include "llama-kv-cache-dsa.h"
+
+// Reads the DeepSeek V3.2 hyperparameters (MoE, MLA, DSA indexer, YaRN, NextN) from the model metadata
+// and picks the model type from n_layer (62 layers -> 685B.A37B, anything else -> unknown).
+void llama_model_deepseek32::load_arch_hparams(llama_model_loader & ml) {
+    ml.get_key(LLM_KV_EXPERT_FEED_FORWARD_LENGTH,     hparams.n_ff_exp);
+    ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS,    hparams.f_norm_rms_eps);
+    hparams.f_norm_eps = 1e-6;  // eps for layer norm
+    ml.get_key_or_arr(LLM_KV_ROPE_DIMENSION_SECTIONS, hparams.rope_sections, 4, false);
+
+    // MoE parameters
+    ml.get_key(LLM_KV_EXPERT_COUNT,                hparams.n_expert);
+    ml.get_key(LLM_KV_EXPERT_USED_COUNT,           hparams.n_expert_used);
+    ml.get_key(LLM_KV_EXPERT_SHARED_COUNT,         hparams.n_expert_shared);
+    ml.get_key(LLM_KV_LEADING_DENSE_BLOCK_COUNT,   hparams.n_layer_dense_lead, false);
+    ml.get_key(LLM_KV_EXPERT_WEIGHTS_SCALE,        hparams.expert_weights_scale, false);
+    ml.get_key(LLM_KV_EXPERT_WEIGHTS_NORM,         hparams.expert_weights_norm, false);
+
+    // deepseek MLA parameters
+    ml.get_key(LLM_KV_ATTENTION_Q_LORA_RANK,      hparams.n_lora_q);
+    ml.get_key(LLM_KV_ATTENTION_KV_LORA_RANK,     hparams.n_lora_kv);
+    ml.get_key(LLM_KV_ATTENTION_KEY_LENGTH_MLA,   hparams.n_embd_head_k_mla_impl, false);
+    ml.get_key(LLM_KV_ATTENTION_VALUE_LENGTH_MLA, hparams.n_embd_head_v_mla_impl, false);
+
+    // DSA parameters
+    ml.get_key(LLM_KV_ATTENTION_INDEXER_HEAD_COUNT, hparams.indexer_n_head);
+    ml.get_key(LLM_KV_ATTENTION_INDEXER_KEY_LENGTH, hparams.indexer_head_size);
+    ml.get_key(LLM_KV_ATTENTION_INDEXER_TOP_K,      hparams.indexer_top_k);
+
+    // Expert gating function
+    ml.get_key(LLM_KV_EXPERT_GATING_FUNC,          hparams.expert_gating_func);
+
+    if (ml.get_key(LLM_KV_ROPE_SCALING_YARN_LOG_MUL, hparams.rope_yarn_log_mul, 0.0f)) {
+        // [TAG_DEEPSEEK2_YARN_LOG_MUL_FIX]
+        // cancel the factor from the convert script
+        hparams.rope_yarn_log_mul /= 0.1f;
+    }
+
+    // NextN/MTP parameters
+    ml.get_key(LLM_KV_NEXTN_PREDICT_LAYERS,        hparams.nextn_predict_layers, false);
+    GGML_ASSERT(hparams.nextn_predict_layers < hparams.n_layer && "nextn_predict_layers must be < n_layer");
+
+    // TODO: when MTP is implemented, this should probably be updated if needed
+    hparams.n_layer_kv_from_start = hparams.n_layer - hparams.nextn_predict_layers;
+
+    switch (hparams.n_layer) {
+        case 62: type = LLM_TYPE_685B_A37B; break;
+        default: type = LLM_TYPE_UNKNOWN;
+    }
+}
+
+// Creates the DeepSeek V3.2 weight tensors; every tensor of the trailing NextN layers is created with TENSOR_SKIP.
+void llama_model_deepseek32::load_arch_tensors(llama_model_loader &) {
+    LLAMA_LOAD_LOCALS;
+    if (!hparams.is_mla()) {
+        throw std::runtime_error("DEEPSEEK32 architecture requires MLA");
+    }
+
+    // note: these are the actual head sizes you get when treating as MHA or after "decompression" using wv_b for MLA
+    const int64_t n_embd_head_k_mla = hparams.n_embd_head_k_mla();
+    const int64_t n_embd_head_v_mla = hparams.n_embd_head_v_mla();
+
+    const int64_t n_embd_head_qk_rope = hparams.n_rot();
+    const int64_t n_embd_head_qk_nope = n_embd_head_k_mla - n_embd_head_qk_rope;
+
+    const int64_t q_lora_rank  = hparams.n_lora_q;
+    const int64_t kv_lora_rank = hparams.n_lora_kv;
+
+    const int64_t n_ff_exp        = hparams.n_ff_exp;
+    const int64_t n_expert_shared = hparams.n_expert_shared;
+
+    tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
+
+    // output
+    output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
+    // try to load output.weight, if not found, use token_embd (tied embeddings)
+    output      = create_tensor(tn(LLM_TENSOR_OUTPUT,      "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
+    if (!output) {
+        output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
+    }
+
+    for (int i = 0; i < n_layer; ++i) {
+        int flags = 0;
+        if (hparams.nextn_predict_layers > 0 && static_cast<uint32_t>(i) >= n_layer - hparams.nextn_predict_layers) {
+            // skip all tensors in the NextN layers
+            // TODO @ngxson : TENSOR_NOT_REQUIRED was a hack, need to remove it later
+            flags |= TENSOR_SKIP | TENSOR_NOT_REQUIRED;
+        }
+
+        auto & layer = layers[i];
+
+        layer.attn_norm      = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, flags);
+        layer.attn_q_a_norm  = create_tensor(tn(LLM_TENSOR_ATTN_Q_A_NORM, "weight", i), {q_lora_rank}, flags);
+        layer.attn_kv_a_norm = create_tensor(tn(LLM_TENSOR_ATTN_KV_A_NORM, "weight", i), {kv_lora_rank}, flags);
+
+        layer.wq_a = create_tensor(tn(LLM_TENSOR_ATTN_Q_A, "weight", i), {n_embd, q_lora_rank}, flags);
+        layer.wq_b = create_tensor(tn(LLM_TENSOR_ATTN_Q_B, "weight", i), {q_lora_rank, n_head * n_embd_head_k_mla}, flags);
+
+        layer.wkv_a_mqa = create_tensor(tn(LLM_TENSOR_ATTN_KV_A_MQA, "weight", i), {n_embd, kv_lora_rank + n_embd_head_qk_rope}, flags);
+
+        // note: only the split wk_b/wv_b tensors are loaded here; there is no fallback to an unsplit wkv_b tensor
+        layer.wk_b = create_tensor(tn(LLM_TENSOR_ATTN_K_B, "weight", i), {n_embd_head_qk_nope, kv_lora_rank, n_head}, flags);
+        layer.wv_b = create_tensor(tn(LLM_TENSOR_ATTN_V_B, "weight", i), {kv_lora_rank, n_embd_head_v_mla, n_head}, flags);
+
+        layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_head * n_embd_head_v_mla, n_embd}, flags);
+
+        layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, flags);
+
+        // DSA indexer
+        layer.indexer_k_norm   = create_tensor(tn(LLM_TENSOR_INDEXER_K_NORM,   "weight", i), {hparams.indexer_head_size}, flags);
+        layer.indexer_k_norm_b = create_tensor(tn(LLM_TENSOR_INDEXER_K_NORM,   "bias",   i), {hparams.indexer_head_size}, flags);
+        layer.indexer_proj     = create_tensor(tn(LLM_TENSOR_INDEXER_PROJ,     "weight", i), {n_embd, hparams.indexer_n_head}, flags);
+        layer.indexer_attn_k   = create_tensor(tn(LLM_TENSOR_INDEXER_ATTN_K,   "weight", i), {n_embd, hparams.indexer_head_size}, flags);
+        layer.indexer_attn_q_b = create_tensor(tn(LLM_TENSOR_INDEXER_ATTN_Q_B, "weight", i), {q_lora_rank, hparams.indexer_n_head * hparams.indexer_head_size}, flags);
+        if (i < (int) hparams.n_layer_dense_lead) {
+            layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd,   n_ff}, flags);
+            layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {  n_ff, n_embd}, flags);
+            layer.ffn_up   = create_tensor(tn(LLM_TENSOR_FFN_UP,   "weight", i), {n_embd,   n_ff}, flags);
+        } else {
+            if (n_expert == 0) {
+                throw std::runtime_error("n_expert must be > 0");
+            }
+            if (n_expert_used == 0) {
+                throw std::runtime_error("n_expert_used must be > 0");
+            }
+
+            layer.ffn_gate_inp = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), {n_embd, n_expert}, flags);
+            layer.ffn_exp_probs_b = create_tensor(tn(LLM_TENSOR_FFN_EXP_PROBS_B, "bias", i), {n_expert}, flags | TENSOR_NOT_REQUIRED);
+
+            // MoE branch
+            layer.ffn_gate_exps = create_tensor(tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), {  n_embd, n_ff_exp, n_expert}, flags);
+            layer.ffn_down_exps = create_tensor(tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), {n_ff_exp,   n_embd, n_expert}, flags);
+            layer.ffn_up_exps   = create_tensor(tn(LLM_TENSOR_FFN_UP_EXPS,   "weight", i), {  n_embd, n_ff_exp, n_expert}, flags);
+
+            // Shared expert branch
+            layer.ffn_gate_shexp = create_tensor(tn(LLM_TENSOR_FFN_GATE_SHEXP, "weight", i), {n_embd, n_ff_exp * n_expert_shared}, flags);
+            layer.ffn_down_shexp = create_tensor(tn(LLM_TENSOR_FFN_DOWN_SHEXP, "weight", i), {        n_ff_exp * n_expert_shared, n_embd}, flags);
+            layer.ffn_up_shexp   = create_tensor(tn(LLM_TENSOR_FFN_UP_SHEXP,   "weight", i), {n_embd, n_ff_exp * n_expert_shared}, flags);
+        }
+
+        // NextN/MTP tensors (preserved but unused) - conditionally load for last nextn_predict_layers
+        if (hparams.nextn_predict_layers > 0 && static_cast<uint32_t>(i) >= n_layer - hparams.nextn_predict_layers) {
+            layer.nextn.eh_proj          = create_tensor(tn(LLM_TENSOR_NEXTN_EH_PROJ, "weight", i), { 2 * n_embd, n_embd }, flags);
+            layer.nextn.enorm            = create_tensor(tn(LLM_TENSOR_NEXTN_ENORM, "weight", i), { n_embd }, flags);
+            layer.nextn.hnorm            = create_tensor(tn(LLM_TENSOR_NEXTN_HNORM, "weight", i), { n_embd }, flags);
+
+            // Optional tensors
+            layer.nextn.embed_tokens     = create_tensor(tn(LLM_TENSOR_NEXTN_EMBED_TOKENS, "weight", i), { n_embd, n_vocab }, flags | TENSOR_NOT_REQUIRED);
+            layer.nextn.shared_head_head = create_tensor(tn(LLM_TENSOR_NEXTN_SHARED_HEAD_HEAD, "weight", i), { n_embd, n_vocab }, flags | TENSOR_NOT_REQUIRED);
+            layer.nextn.shared_head_norm = create_tensor(tn(LLM_TENSOR_NEXTN_SHARED_HEAD_NORM, "weight", i), { n_embd }, flags | TENSOR_NOT_REQUIRED);
+        }
+    }
+}
+
+std::unique_ptr<llm_graph_context> llama_model_deepseek32::build_arch_graph(const llm_graph_params & params) const {
+    return std::make_unique<graph>(*this, params); // the graph is fully built inside the graph constructor
+}
+
+llama_model_deepseek32::graph::graph(const llama_model & model, const llm_graph_params & params) :
+    llm_graph_context(params) {
+    // builds the DeepSeek V3.2 graph: per non-NextN layer, a lightning indexer computes top-k indices that are passed to the absorbed MLA attention, followed by a dense or MoE FFN
+    const bool is_mla = hparams.is_mla();
+    GGML_ASSERT(is_mla);
+    // note: these are the actual head sizes you get when treating as MHA or after "decompression" using wv_b for MLA
+    const int64_t n_embd_head_k = hparams.n_embd_head_k_mla();
+    const int64_t n_embd_head_v = hparams.n_embd_head_v_mla();
+    GGML_UNUSED(n_embd_head_v);
+
+    const int64_t n_embd_head_qk_rope = hparams.n_rot();
+    const int64_t n_embd_head_qk_nope = n_embd_head_k - n_embd_head_qk_rope;
+
+    const int64_t n_indexer_head = hparams.indexer_n_head;
+    const int64_t n_embd_indexer_head = hparams.indexer_head_size;
+    const int64_t n_embd_indexer_head_rope = hparams.n_rot();
+    const int64_t n_embd_indexer_head_nope = n_embd_indexer_head - n_embd_indexer_head_rope;
+    const uint32_t n_indexer_top_k = hparams.indexer_top_k;
+
+    const uint32_t kv_lora_rank = hparams.n_lora_kv;
+
+    // We have to pre-scale kq_scale and attn_factor to make the YaRN RoPE work correctly.
+    // See https://github.com/ggml-org/llama.cpp/discussions/7416 for detailed explanation.
+    // And also: https://github.com/ggml-org/llama.cpp/pull/17945 [TAG_DEEPSEEK2_YARN_LOG_MUL_FIX]
+
+    // first cancel the adjustment from llama_hparams::yarn_attn_factor_adjust to get the original attn_factor
+    GGML_ASSERT(ext_factor >= 0.0f);
+    const float attn_factor_org = attn_factor * (1.0f + 0.1f * logf(1.0f / freq_scale));
+
+    // use the original attn_factor to pre-scale the kq_scale
+    const float mscale   = attn_factor_org * (1.0f + 0.1f * hparams.rope_yarn_log_mul * logf(1.0f / freq_scale));
+    const float kq_scale = 1.0f * mscale * mscale / sqrtf(float(n_embd_head_k));
+
+    ggml_tensor * cur;
+    ggml_tensor * inpL;
+
+    // {n_embd, n_tokens}
+    inpL = build_inp_embd(model.tok_embd);
+
+    // inp_pos - contains the positions
+    ggml_tensor * inp_pos = build_inp_pos();
+
+    llm_graph_input_attn_k_dsa * inp_attn_dsa = build_attn_inp_k_dsa();
+
+    ggml_tensor * inp_out_ids = build_inp_out_ids();
+
+    int effective_n_layers = hparams.n_layer - hparams.nextn_predict_layers;
+    for (int il = 0; il < effective_n_layers; ++il) {
+        ggml_tensor * inpSA = inpL;
+
+        // norm
+        cur = build_norm(inpL, model.layers[il].attn_norm, NULL, LLM_NORM_RMS, il);
+        cb(cur, "attn_norm", il);
+
+        // self_attention
+        {
+            ggml_tensor * qr = ggml_mul_mat(ctx0, model.layers[il].wq_a, cur);
+            cb(qr, "qr", il);
+
+            qr = build_norm(qr, model.layers[il].attn_q_a_norm, nullptr, LLM_NORM_RMS, il);
+            cb(qr, "qr", il);
+
+            ggml_tensor * top_k = nullptr;
+
+            // lightning indexer
+            {
+                ggml_tensor * indexer_q = ggml_mul_mat(ctx0, model.layers[il].indexer_attn_q_b, qr);
+                cb(indexer_q, "indexer_q", il);
+
+                // split into {n_embd_indexer_head_rope, n_indexer_head, n_tokens}
+                ggml_tensor * indexer_q_pe =
+                    ggml_view_3d(ctx0, indexer_q, n_embd_indexer_head_rope, n_indexer_head, n_tokens,
+                                 ggml_row_size(indexer_q->type, n_embd_indexer_head),
+                                 ggml_row_size(indexer_q->type, n_embd_indexer_head) * n_indexer_head, 0);
+                cb(indexer_q_pe, "indexer_q_pe", il);
+
+                // and {n_embd_indexer_head_nope, n_indexer_head, n_tokens}, starting right after the rope part of each head
+                ggml_tensor * indexer_q_nope =
+                    ggml_view_3d(ctx0, indexer_q, n_embd_indexer_head_nope, n_indexer_head, n_tokens,
+                                 ggml_row_size(indexer_q->type, n_embd_indexer_head),
+                                 ggml_row_size(indexer_q->type, n_embd_indexer_head) * n_indexer_head,
+                                 ggml_row_size(indexer_q->type, n_embd_indexer_head_rope));
+                cb(indexer_q_nope, "indexer_q_nope", il);
+
+                indexer_q_pe = ggml_rope_ext(ctx0, indexer_q_pe, inp_pos, nullptr, n_rot,
+                                     LLAMA_ROPE_TYPE_NEOX, n_ctx_orig, freq_base, freq_scale,
+                                     ext_factor, attn_factor, beta_fast, beta_slow);
+                cb(indexer_q_pe, "indexer_q_pe", il);
+
+                // {n_embd_indexer_head_rope + n_embd_indexer_head_nope, n_indexer_head, n_tokens}
+                indexer_q = ggml_concat(ctx0, indexer_q_pe, indexer_q_nope, 0);
+                cb(indexer_q, "indexer_q", il);
+
+                ggml_tensor * indexer_k = ggml_mul_mat(ctx0, model.layers[il].indexer_attn_k, cur);
+                cb(indexer_k, "indexer_k", il);
+
+                indexer_k = build_norm(indexer_k, model.layers[il].indexer_k_norm, model.layers[il].indexer_k_norm_b, LLM_NORM, il);
+                cb(indexer_k, "indexer_k", il);
+
+                // split into {n_embd_indexer_head_rope, 1, n_tokens}
+                ggml_tensor * indexer_k_pe =
+                    ggml_view_3d(ctx0, indexer_k, n_embd_indexer_head_rope, 1, n_tokens,
+                                 ggml_row_size(indexer_k->type, n_embd_indexer_head),
+                                 ggml_row_size(indexer_k->type, n_embd_indexer_head) * 1, 0);
+                cb(indexer_k_pe, "indexer_k_pe", il);
+
+                // and {n_embd_indexer_head_nope, 1, n_tokens}, starting right after the rope part
+                ggml_tensor * indexer_k_nope =
+                    ggml_view_3d(ctx0, indexer_k, n_embd_indexer_head_nope, 1, n_tokens,
+                                 ggml_row_size(indexer_k->type, n_embd_indexer_head),
+                                 ggml_row_size(indexer_k->type, n_embd_indexer_head) * 1,
+                                 ggml_row_size(indexer_k->type, n_embd_indexer_head_rope));
+                cb(indexer_k_nope, "indexer_k_nope", il);
+
+                indexer_k_pe = ggml_rope_ext(ctx0, indexer_k_pe, inp_pos, nullptr, n_rot,
+                                     LLAMA_ROPE_TYPE_NEOX, n_ctx_orig, freq_base, freq_scale,
+                                     ext_factor, attn_factor, beta_fast, beta_slow);
+                cb(indexer_k_pe, "indexer_k_pe", il);
+
+                // {n_embd_indexer_head_rope + n_embd_indexer_head_nope, 1, n_tokens}
+                indexer_k = ggml_concat(ctx0, indexer_k_pe, indexer_k_nope, 0);
+                cb(indexer_k, "indexer_k", il);
+
+                // perform Hadamard transform on indexer q and k
+                indexer_q = ggml_mul_mat(ctx0, inp_attn_dsa->self_k_rot_lid, indexer_q);
+                cb(indexer_q, "indexer_q", il);
+                indexer_k = ggml_mul_mat(ctx0, inp_attn_dsa->self_k_rot_lid, indexer_k);
+                cb(indexer_k, "indexer_k", il);
+
+                // store indexer keys to KV cache
+                const auto * mctx_lid = inp_attn_dsa->mctx->get_lid();
+                const auto & k_idxs_lid = inp_attn_dsa->get_k_idxs_lid();
+                ggml_build_forward_expand(gf, mctx_lid->cpy_k(ctx0, indexer_k, k_idxs_lid, il));
+
+                // prepare indexer weights
+                ggml_tensor * indexer_weights = ggml_mul_mat(ctx0, model.layers[il].indexer_proj, cur);
+                cb(indexer_weights, "indexer_weights", il);
+
+                // get cached indexer keys
+                indexer_k = mctx_lid->get_k(ctx0, il);
+
+                // split the batch into streams if needed
+                const auto n_stream = indexer_k->ne[3];
+                indexer_q = ggml_view_4d(ctx0, indexer_q, indexer_q->ne[0], indexer_q->ne[1], indexer_q->ne[2]/n_stream, n_stream, indexer_q->nb[1], indexer_q->nb[2], indexer_q->nb[3]/n_stream, 0);
+                indexer_weights = ggml_view_4d(ctx0, indexer_weights, indexer_weights->ne[0], indexer_weights->ne[1]/n_stream, indexer_weights->ne[2], n_stream, indexer_weights->nb[1], indexer_weights->nb[2]/n_stream, indexer_weights->nb[3]/n_stream, 0);
+
+                // calculate indexer kq
+                indexer_q = ggml_permute(ctx0, indexer_q, 0, 2, 1, 3);
+                cb(indexer_q, "indexer_q", il);
+                indexer_k = ggml_permute(ctx0, indexer_k, 0, 2, 1, 3);
+                cb(indexer_k, "indexer_k", il);
+
+                ggml_tensor * indexer_kq = ggml_mul_mat(ctx0, indexer_k, indexer_q);
+                cb(indexer_kq, "indexer_kq", il);
+
+                // ReLU requires contiguous tensors
+                indexer_kq = ggml_cont(ctx0, ggml_permute(ctx0, indexer_kq, 2, 1, 0, 3));
+                cb(indexer_kq, "indexer_kq", il);
+
+                // apply ReLU
+                ggml_tensor * indexer_score = ggml_relu(ctx0, indexer_kq);
+                cb(indexer_score, "indexer_score", il);
+
+                // pre-scale weights to avoid scaling operations on huge indexer_score tensor
+                indexer_weights = ggml_scale(ctx0, indexer_weights, 1.0f / sqrtf(float(n_embd_indexer_head * n_indexer_head)));
+                cb(indexer_weights, "indexer_weights", il);
+
+                // multiply scores by indexer weights
+                indexer_score = ggml_mul(ctx0, indexer_score, indexer_weights);
+                cb(indexer_score, "indexer_score", il);
+
+                // sum by q n_indexer_head dimension
+                indexer_score = ggml_sum_rows(ctx0, indexer_score);
+                cb(indexer_score, "indexer_score", il);
+
+                // permute result to match KQ mask
+                indexer_score = ggml_cont(ctx0, ggml_permute(ctx0, indexer_score, 2, 1, 0, 3));
+                cb(indexer_score, "indexer_score", il);
+
+                // mask indexer scores
+                ggml_tensor * indexer_kq_mask = inp_attn_dsa->get_kq_mask_lid();
+                indexer_score = ggml_add(ctx0, indexer_score, indexer_kq_mask);
+                cb(indexer_score, "indexer_score", il);
+
+                // get indices of top k indexer scores
+                uint32_t n_top_k = indexer_score->ne[0] < n_indexer_top_k ? indexer_score->ne[0] : n_indexer_top_k;
+                top_k = ggml_cont(ctx0, ggml_top_k(ctx0, indexer_score, n_top_k));
+                cb(top_k, "top_k", il);
+            }
+
+            ggml_tensor * q = ggml_mul_mat(ctx0, model.layers[il].wq_b, qr);
+            cb(q, "q", il);
+
+            // split into {n_embd_head_qk_nope, n_head, n_tokens}
+            ggml_tensor * q_nope =
+                ggml_view_3d(ctx0, q, n_embd_head_qk_nope, n_head, n_tokens, ggml_row_size(q->type, n_embd_head_k),
+                             ggml_row_size(q->type, n_embd_head_k) * n_head, 0);
+            cb(q_nope, "q_nope", il);
+
+            // and {n_embd_head_qk_rope, n_head, n_tokens}
+            ggml_tensor * q_pe = ggml_view_3d(
+                ctx0, q, n_embd_head_qk_rope, n_head, n_tokens, ggml_row_size(q->type, n_embd_head_k),
+                ggml_row_size(q->type, n_embd_head_k) * n_head, ggml_row_size(q->type, n_embd_head_qk_nope));
+            cb(q_pe, "q_pe", il);
+
+            ggml_tensor * kv_cmpr_pe = ggml_mul_mat(ctx0, model.layers[il].wkv_a_mqa, cur);
+            cb(kv_cmpr_pe, "kv_cmpr_pe", il);
+
+            // split into {kv_lora_rank, n_tokens}
+            ggml_tensor * kv_cmpr =
+                ggml_view_2d(ctx0, kv_cmpr_pe, kv_lora_rank, n_tokens,
+                             ggml_row_size(kv_cmpr_pe->type, kv_lora_rank + n_embd_head_qk_rope), 0);
+            cb(kv_cmpr, "kv_cmpr", il);
+
+            // and {n_embd_head_qk_rope, 1, n_tokens}
+            ggml_tensor * k_pe = ggml_view_3d(ctx0, kv_cmpr_pe, n_embd_head_qk_rope, 1, n_tokens,
+                                              ggml_row_size(kv_cmpr_pe->type, kv_lora_rank + n_embd_head_qk_rope),
+                                              ggml_row_size(kv_cmpr_pe->type, kv_lora_rank + n_embd_head_qk_rope),
+                                              ggml_row_size(kv_cmpr_pe->type, kv_lora_rank));
+            cb(k_pe, "k_pe", il);
+
+            q_pe = ggml_rope_ext(ctx0, q_pe, inp_pos, nullptr, n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
+                                 ext_factor, attn_factor, beta_fast, beta_slow);
+            cb(q_pe, "q_pe", il);
+
+            k_pe = ggml_rope_ext(ctx0, k_pe, inp_pos, nullptr, n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
+                                 ext_factor, attn_factor, beta_fast, beta_slow);
+            cb(k_pe, "k_pe", il);
+
+            kv_cmpr = build_norm(kv_cmpr, model.layers[il].attn_kv_a_norm, nullptr, LLM_NORM_RMS, il);
+            cb(kv_cmpr, "kv_cmpr", il);
+
+            // MLA attention
+            {
+                // {n_embd_head_qk_nope, n_tokens, n_head}
+                q_nope = ggml_permute(ctx0, q_nope, 0, 2, 1, 3);
+                cb(q_nope, "q_nope_perm", il);
+
+                // {n_embd_head_qk_nope, kv_lora_rank, n_head} x {n_embd_head_qk_nope, n_tokens, n_head}
+                ggml_tensor * q_nope_absorbed = ggml_mul_mat(ctx0, model.layers[il].wk_b, q_nope);
+                cb(q_nope_absorbed, "q_nope_absorbed", il);
+
+                // {kv_lora_rank, n_head, n_tokens}
+                q_nope_absorbed = ggml_permute(ctx0, q_nope_absorbed, 0, 2, 1, 3);
+                cb(q_nope_absorbed, "q_nope_absorbed_perm", il);
+
+                // {kv_lora_rank + n_embd_head_qk_rope, n_head, n_tokens}
+                // NOTE(review): the non-rope part is concatenated first here; confirm build_rope_shift() expects rope at this offset
+                ggml_tensor * Qcur = ggml_concat(ctx0, q_nope_absorbed, q_pe, 0);
+                cb(Qcur, "Qcur", il);
+
+                kv_cmpr = ggml_reshape_3d(ctx0, kv_cmpr, kv_lora_rank, 1, n_tokens);
+                cb(kv_cmpr, "kv_cmpr_reshape", il);
+
+                // {kv_lora_rank + n_embd_head_qk_rope, 1, n_tokens}
+                ggml_tensor * Kcur = ggml_concat(ctx0, kv_cmpr, k_pe, 0);
+                cb(Kcur, "Kcur", il);
+
+                // {kv_lora_rank, 1, n_tokens}
+                ggml_tensor * Vcur = kv_cmpr;
+                cb(Vcur, "Vcur", il);
+
+                // note: MLA with the absorption optimization converts into MQA (ie: GQA with 1 group)
+                cur = build_attn(inp_attn_dsa,
+                        model.layers[il].wo, NULL, model.layers[il].wo_s,
+                        Qcur, Kcur, Vcur, nullptr, nullptr, model.layers[il].wv_b, top_k, kq_scale, il);
+            }
+        }
+        if (il == effective_n_layers - 1 && inp_out_ids) {
+            cur   = ggml_get_rows(ctx0, cur, inp_out_ids);
+            inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
+        }
+        ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
+        cb(ffn_inp, "ffn_inp", il);
+
+        cur = build_norm(ffn_inp, model.layers[il].ffn_norm, NULL, LLM_NORM_RMS, il);
+        cb(cur, "ffn_norm", il);
+
+        if ((uint32_t) il < hparams.n_layer_dense_lead) {
+            cur = build_ffn(cur,
+                model.layers[il].ffn_up, NULL, model.layers[il].ffn_up_s,
+                model.layers[il].ffn_gate, NULL, model.layers[il].ffn_gate_s,
+                model.layers[il].ffn_down, NULL, model.layers[il].ffn_down_s,
+                NULL, LLM_FFN_SILU, LLM_FFN_PAR, il);
+            cb(cur, "ffn_out", il);
+        } else {
+            // MoE branch
+            ggml_tensor * moe_out = build_moe_ffn(cur,
+                model.layers[il].ffn_gate_inp,
+                model.layers[il].ffn_up_exps,
+                model.layers[il].ffn_gate_exps,
+                model.layers[il].ffn_down_exps,
+                model.layers[il].ffn_exp_probs_b,
+                n_expert, n_expert_used,
+                LLM_FFN_SILU, hparams.expert_weights_norm,
+                hparams.expert_weights_scale,
+                (llama_expert_gating_func_type) hparams.expert_gating_func,
+                il,
+                nullptr,
+                model.layers[il].ffn_gate_up_exps,
+                model.layers[il].ffn_up_exps_s,
+                model.layers[il].ffn_gate_exps_s,
+                model.layers[il].ffn_down_exps_s);
+            cb(moe_out, "ffn_moe_out", il);
+
+            // FFN shared expert
+            {
+                ggml_tensor * ffn_shexp =
+                    build_ffn(cur,
+                        model.layers[il].ffn_up_shexp, NULL, model.layers[il].ffn_up_shexp_s,
+                        model.layers[il].ffn_gate_shexp, NULL, model.layers[il].ffn_gate_shexp_s,
+                        model.layers[il].ffn_down_shexp, NULL, model.layers[il].ffn_down_shexp_s,
+                        NULL, LLM_FFN_SILU, LLM_FFN_PAR, il);
+                cb(ffn_shexp, "ffn_shexp", il);
+
+                cur = ggml_add(ctx0, moe_out, ffn_shexp);
+                cb(cur, "ffn_out", il);
+            }
+        }
+        cur = ggml_add(ctx0, cur, ffn_inp);
+
+        cur = build_cvec(cur, il);
+        cb(cur, "l_out", il);
+
+        // input for next layer
+        inpL = cur;
+    }
+    cur = inpL;
+
+    cur = build_norm(cur, model.output_norm, NULL, LLM_NORM_RMS, -1);
+
+    cb(cur, "result_norm", -1);
+    res->t_embd = cur;
+
+    // lm_head
+    cur = ggml_mul_mat(ctx0, model.output, cur);
+
+    cb(cur, "result_output", -1);
+    res->t_logits = cur;
+
+    ggml_build_forward_expand(gf, cur);
+}
diff --git a/src/models/models.h b/src/models/models.h
index db228865d5d..5251e2d8280 100644
--- a/src/models/models.h
+++ b/src/models/models.h
@@ -1030,6 +1030,19 @@ struct llama_model_deepseek2 : public llama_model_base {
 };
 
 
+struct llama_model_deepseek32 : public llama_model_base { // DeepSeek V3.2 model: hparams/tensor loading and graph construction
+    llama_model_deepseek32(const struct llama_model_params & params) : llama_model_base(params) {}
+    void load_arch_hparams(llama_model_loader & ml) override; // reads the architecture hyperparameters and sets the model type
+    void load_arch_tensors(llama_model_loader & ml) override; // creates the per-layer weight tensors
+
+    struct graph : public llm_graph_context { // compute graph; it is assembled entirely in the constructor
+        graph(const llama_model & model, const llm_graph_params & params);
+    };
+
+    std::unique_ptr<llm_graph_context> build_arch_graph(const llm_graph_params & params) const override; // returns a newly constructed graph
+};
+
+
 struct llama_model_deepseek2ocr : public llama_model_base {
     llama_model_deepseek2ocr(const struct llama_model_params & params) : llama_model_base(params) {}
     void load_arch_hparams(llama_model_loader & ml) override;
diff --git a/tests/test-llama-archs.cpp b/tests/test-llama-archs.cpp
index 25e29638e97..3714eaedb0b 100644
--- a/tests/test-llama-archs.cpp
+++ b/tests/test-llama-archs.cpp
@@ -100,6 +100,7 @@ static gguf_context_ptr get_gguf_ctx(const llm_arch arch, const bool moe) {
         n_ff   = 96;
         n_layer = 22; // hparams.n_layer_kv_from_start = 20 is hardcoded
     } else if (arch == LLM_ARCH_DEEPSEEK2
+            || arch == LLM_ARCH_DEEPSEEK32
             || arch == LLM_ARCH_GLM_DSA
             || arch == LLM_ARCH_KIMI_LINEAR
             || arch == LLM_ARCH_MISTRAL4) {
@@ -156,6 +157,7 @@ static gguf_context_ptr get_gguf_ctx(const llm_arch arch, const bool moe) {
 
     ms.add_kv(LLM_KV_ATTENTION_MAX_ALIBI_BIAS, 8.0f);
     if (arch == LLM_ARCH_DEEPSEEK2
+            || arch == LLM_ARCH_DEEPSEEK32
             || arch == LLM_ARCH_GLM_DSA
             || arch == LLM_ARCH_KIMI_LINEAR
             || arch == LLM_ARCH_MISTRAL4) {
@@ -332,6 +334,7 @@ static bool moe_mandatory(const llm_arch arch) {
         case LLM_ARCH_ARCTIC:
         case LLM_ARCH_DEEPSEEK:
         case LLM_ARCH_DEEPSEEK2:
+        case LLM_ARCH_DEEPSEEK32:
         case LLM_ARCH_GLM4_MOE:
         case LLM_ARCH_GLM_DSA:
         case LLM_ARCH_EXAONE_MOE:

From cb47092b007fcd5122eee2e8bb32ce972cdb23c2 Mon Sep 17 00:00:00 2001
From: Xuan-Son Nguyen <son@huggingface.co>
Date: Fri, 29 May 2026 10:23:17 +0200
Subject: [PATCH 28/50] server: bump timeout to 3600s (#23842)

* server: bump timeout to 3600s

* nits: change wording
---
 common/common.h               | 2 +-
 tools/server/server-queue.cpp | 6 ++++--
 tools/server/server-queue.h   | 2 ++
 3 files changed, 7 insertions(+), 3 deletions(-)

diff --git a/common/common.h b/common/common.h
index 8a0e5eed5ee..9855d3f3694 100644
--- a/common/common.h
+++ b/common/common.h
@@ -587,7 +587,7 @@ struct common_params {
     // server params
     int32_t port                = 8080;          // server listens on this network port
     bool    reuse_port          = false;         // allow multiple sockets to bind to the same port
-    int32_t timeout_read        = 600;           // http read timeout in seconds
+    int32_t timeout_read        = 3600;          // http read timeout in seconds
     int32_t timeout_write       = timeout_read;  // http write timeout in seconds
     int32_t n_threads_http      = -1;    // number of threads to process HTTP requests (TODO: support threadpool)
     int32_t n_cache_reuse       = 0;     // min chunk size to reuse from the cache via KV shifting
diff --git a/tools/server/server-queue.cpp b/tools/server/server-queue.cpp
index d5fceb1b131..588e1a82b18 100644
--- a/tools/server/server-queue.cpp
+++ b/tools/server/server-queue.cpp
@@ -381,8 +381,10 @@ server_task_result_ptr server_response_reader::next(const std::function<bool()>
         if (result == nullptr) {
             // timeout, check stop condition
             if (should_stop()) {
-                SRV_WRN("%s", "stopping wait for next result due to should_stop condition (adjust the --timeout argument if needed)\n");
-                SRV_WRN("%s", "ref: https://github.com/ggml-org/llama.cpp/pull/22907\n");
+                const int64_t time_elapsed_ms = ggml_time_ms() - time_start_ms;
+                if (time_elapsed_ms > 30000) {
+                    SRV_WRN("%s", "request cancelled after 30s, potentially a client-side timeout; please check your client's code\n");
+                }
                 return nullptr;
             }
         } else {
diff --git a/tools/server/server-queue.h b/tools/server/server-queue.h
index 35f010401fc..8ce32c69fb0 100644
--- a/tools/server/server-queue.h
+++ b/tools/server/server-queue.h
@@ -169,6 +169,8 @@ struct server_response_reader {
     bool cancelled = false;
     int polling_interval_seconds;
 
+    const int64_t time_start_ms = ggml_time_ms();
+
     // tracking generation state and partial tool calls
     // only used by streaming completions
     std::vector<task_result_state> states;

From 6ed481eea4cf4ed40777db2fa29e8d08eb712b3b Mon Sep 17 00:00:00 2001
From: Oliver Simons <osimons@nvidia.com>
Date: Fri, 29 May 2026 12:28:18 +0200
Subject: [PATCH 29/50] CUDA: Check PTX version on host side to guard PDL
 dispatch (#23530)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

* CUDA: Check PTX version on host side to guard PDL dispatch

Checking on `__CUDA_ARCH_LIST__` alone is insufficient for JIT, as this
variable doesn't differentiate between compiling for say sm_90, sm_90a
or sm_90f (so forward-jittable PTX vs. arch/family-specific PTX).

Thus, one can have a bug when compiling with
`-DCMAKE_CUDA_ARCHITECTURES="89;90a"`, where current code would wrongly
dispatch to PDL on sm_90/sm_120 in forward-JIT mode.

This PR fixes this issue by checking `cudaFuncAttributes::ptxVersion` of
the incoming kernel at runtime. A check on ptxVersion alone is
sufficient, as device-codes will always be >= ptxVersion (and any
violation of this would be a severe bug in CUDA/nvcc), see:
 https://docs.nvidia.com/cuda/cuda-compiler-driver-nvcc/#gpu-code-code-code

* Implement MurmurHash3 mixer for better hash distribution

Magic constants were taken from boost:
https://github.com/boostorg/container_hash/blob/2698b43803c012601e6bb1a6116e83767b97986c/include/boost/container_hash/detail/hash_mix.hpp#L19-L65

* Update ggml/src/ggml-cuda/common.cuh

Co-authored-by: Johannes Gäßler <johannesg@5d6.de>

* Address review comments, make seed non-zero

* Apply code-formatting

* Replace std::size_t -> size_t for consistency

---------

Co-authored-by: Johannes Gäßler <johannesg@5d6.de>
---
 ggml/src/ggml-cuda/common.cuh | 60 +++++++++++++++++++++++++++++++++--
 1 file changed, 58 insertions(+), 2 deletions(-)

diff --git a/ggml/src/ggml-cuda/common.cuh b/ggml/src/ggml-cuda/common.cuh
index 50d7763dcdd..560fab0b17b 100644
--- a/ggml/src/ggml-cuda/common.cuh
+++ b/ggml/src/ggml-cuda/common.cuh
@@ -7,6 +7,7 @@
 #include <cstdint>
 #include <cstdlib>
 #include <memory>
+#include <mutex>
 
 #if defined(GGML_USE_HIP)
 #define GGML_COMMON_DECL_HIP
@@ -1552,6 +1553,62 @@ struct ggml_cuda_pdl_config {
     ggml_cuda_pdl_config& operator=(ggml_cuda_pdl_config&&) = delete;
 
 };
+
+static bool ggml_cuda_kernel_can_use_pdl(const void * kernel) {
+    const int device = ggml_cuda_get_device();
+
+    struct cache_key {
+        int          device;
+        const void * kernel;
+
+        bool operator==(const cache_key & other) const { return device == other.device && kernel == other.kernel; }
+    };
+
+    struct cache_key_hash {
+        // MurmurHash3 mixing function for better hash distribution (vs. just std::hash which in some implementations simply returns the identity)
+        static size_t hash_mix(size_t x) {
+            std::uint64_t       y = x;
+            const std::uint64_t m = 0xe9846af9b1a615d;
+
+            y ^= y >> 32;
+            y *= m;
+            y ^= y >> 32;
+            y *= m;
+            y ^= y >> 28;
+
+            return static_cast<size_t>(y);
+        }
+
+        size_t operator()(const cache_key & key) const {
+            // Use a nonzero seed to avoid mapping all-zero keys to zero
+            size_t h = 42;
+            h        = hash_mix(h + key.device);
+            h        = hash_mix(h + reinterpret_cast<size_t>(key.kernel));
+            return h;
+        }
+    };
+
+    static std::mutex                                          cache_mutex;
+    static std::unordered_map<cache_key, bool, cache_key_hash> cache;
+
+    const cache_key             key = { device, kernel };
+    std::lock_guard<std::mutex> lock(cache_mutex);
+    const auto                  it = cache.find(key);
+    if (it != cache.end()) {
+        return it->second;
+    }
+
+    cudaFuncAttributes attr = {};
+    CUDA_CHECK(cudaFuncGetAttributes(&attr, kernel));
+
+    // PDL device-side primitives are emitted only for PTX versions >= 90.
+    // We have to guard on a loaded kernel's PTX version so a kernel forward-JIT'ed
+    // from pre-Hopper PTX to a Hopper-or-newer GPU does not opt into PDL.
+    const bool can_use_pdl = attr.ptxVersion >= 90;
+    cache.emplace(key, can_use_pdl);
+    return can_use_pdl;
+}
+
 #endif //defined(GGML_CUDA_USE_PDL)
 
 
@@ -1564,8 +1621,7 @@ static __inline__ void ggml_cuda_kernel_launch(Kernel kernel, const ggml_cuda_ke
         return env == nullptr || std::atoi(env) != 0;
     }();
 
-    const int cc = ggml_cuda_info().devices[ggml_cuda_get_device()].cc;
-    if (env_pdl_enabled && ggml_cuda_highest_compiled_arch(cc) >= GGML_CUDA_CC_HOPPER) {
+    if (env_pdl_enabled && ggml_cuda_kernel_can_use_pdl(reinterpret_cast<const void *>(kernel))) {
         auto pdl_cfg = ggml_cuda_pdl_config(launch_params);
 
         CUDA_CHECK(cudaLaunchKernelEx(&pdl_cfg.cfg, kernel, std::forward<Args>(args)... ));

From da3f990a47ec8c25ff3d2154d3dea46ee3f4f334 Mon Sep 17 00:00:00 2001
From: Saba Fallah <10401143+sfallah@users.noreply.github.com>
Date: Fri, 29 May 2026 16:13:51 +0200
Subject: [PATCH 30/50] mtmd: Add DeepSeekOCR 2 Support (#20975)

* mtmd: DeepSeek-OCR 2 support, with multi-tile dynamic resolution

* introduced clip_image_f32::add_viewsep

* address PR review

- drop redundant ggml_cpy ops in both deepseekocr versions build
- drop no-op ggml_cont in build_sam
- assert num_image_tokens deepseekocr2
- view_seperator as (1, n_embd) at conversion (for both versions)
- drop redundant ggml_reshape_2d

* Update tools/mtmd/models/deepseekocr2.cpp

Co-authored-by: Xuan-Son Nguyen <thichthat@gmail.com>

---------

Co-authored-by: Xuan-Son Nguyen <thichthat@gmail.com>
---
 conversion/__init__.py                |   1 +
 conversion/base.py                    |   2 +-
 conversion/deepseek.py                |  68 +++++++--
 gguf-py/gguf/constants.py             |   7 +
 gguf-py/gguf/tensor_mapping.py        |  18 +++
 tools/mtmd/CMakeLists.txt             |   1 +
 tools/mtmd/clip-impl.h                |   7 +
 tools/mtmd/clip-model.h               |   5 +
 tools/mtmd/clip.cpp                   |  55 +++++++-
 tools/mtmd/models/deepseekocr.cpp     |  17 +--
 tools/mtmd/models/deepseekocr2.cpp    |  81 +++++++++++
 tools/mtmd/models/models.h            |   5 +
 tools/mtmd/mtmd-image.cpp             |  99 +++++++++++++
 tools/mtmd/mtmd-image.h               |  20 +++
 tools/mtmd/mtmd.cpp                   |  14 +-
 tools/mtmd/tests/test-deepseek-ocr.py | 193 ++++++++++++++++++--------
 16 files changed, 504 insertions(+), 89 deletions(-)
 create mode 100644 tools/mtmd/models/deepseekocr2.cpp

diff --git a/conversion/__init__.py b/conversion/__init__.py
index 89e05d691d4..cfaa24ba1a1 100644
--- a/conversion/__init__.py
+++ b/conversion/__init__.py
@@ -237,6 +237,7 @@
 MMPROJ_MODEL_MAP: dict[str, str] = {
     "AudioFlamingo3ForConditionalGeneration": "ultravox",
     "CogVLMForCausalLM": "cogvlm",
+    "DeepseekOCR2ForCausalLM": "deepseek",
     "DeepseekOCRForCausalLM": "deepseek",
     "DotsOCRForCausalLM": "dotsocr",
     "Gemma3ForConditionalGeneration": "gemma",
diff --git a/conversion/base.py b/conversion/base.py
index f861f8b5296..44b2c964f4b 100644
--- a/conversion/base.py
+++ b/conversion/base.py
@@ -1140,7 +1140,7 @@ def filter_tensors(cls, item: tuple[str, Callable[[], Tensor]]) -> tuple[str, Ca
         # Skip multimodal tensors
         if name.startswith(("mlp", "vit.", "vpm.", "siglip2.", "conformer.", "merger.", "resampler.", "sound_encoder.", "sound_projection.", "speech_embeddings.")) \
                 or "visual." in name or "vision." in name or "audio." in name or "talker." in name \
-                or "vision_" in name or "audio_" in name or "sam_model" in name \
+                or "vision_" in name or "audio_" in name \
                 or "token2wav." in name or "code2wav." in name \
                 or "projector." in name or "pre_mm_projector_norm" in name \
                 or "image_newline" in name or "view_seperator" in name \
diff --git a/conversion/deepseek.py b/conversion/deepseek.py
index af18a25a8b5..72520cc9f6a 100644
--- a/conversion/deepseek.py
+++ b/conversion/deepseek.py
@@ -16,10 +16,14 @@
 
 @ModelBase.register("DeepseekOCRForCausalLM")
 class DeepseekOCRVisionModel(MmprojModel):
+    def __init__(self, *args, **kwargs):
+        super().__init__(*args, **kwargs)
+        self.clip_projector_type = gguf.VisionProjectorType.DEEPSEEKOCR
+
     def set_gguf_parameters(self):
         super().set_gguf_parameters()
         hparams = self.hparams
-        self.gguf_writer.add_clip_projector_type(gguf.VisionProjectorType.DEEPSEEKOCR)
+        self.gguf_writer.add_clip_projector_type(self.clip_projector_type)
         # default values below are taken from HF tranformers code
         self.gguf_writer.add_vision_attention_layernorm_eps(hparams.get("layer_norm_eps", 1e-6))
         self.gguf_writer.add_vision_use_gelu(True)
@@ -49,22 +53,27 @@ def get_vision_config(self) -> dict[str, Any]:
             raise ValueError("DeepseekOCR model requires 'vision_config' in the model configuration, but it was not found")
 
         vision_config['sam'] = vision_config['width']['sam_vit_b']
-        vision_config.update(vision_config['width']['clip-l-14-224'])
-        vision_config['hidden_size'] = vision_config['width']
-        vision_config['num_heads'] = vision_config['heads']
-        vision_config['intermediate_size'] = vision_config['heads'] * 4
+        if vision_config['width'].get('clip-l-14-224') is not None:
+            vision_config.update(vision_config['width']['clip-l-14-224'])
+        if isinstance(vision_config['width'], int):
+            vision_config['hidden_size'] = vision_config['width']
+        if vision_config.get('heads') is not None:
+            vision_config['num_heads'] = vision_config['heads']
+            vision_config['intermediate_size'] = vision_config['heads'] * 4
 
         return vision_config
 
     def tensor_force_quant(self, name, new_name, bid, n_dims):
-        if ".embeddings." in name or 'pos_embed' in name:
-            return gguf.GGMLQuantizationType.F32
-        if ".rel_pos_h" in name or '.rel_pos_w' in name:
-            return gguf.GGMLQuantizationType.F32
-        if ".neck." in name or ".net_" in name:
-            return gguf.GGMLQuantizationType.F32
+        for nq_name in ('.embeddings.', 'pos_embed', '.rel_pos_h', '.rel_pos_w', '.neck.', '.net_'):
+            if nq_name in name:
+                return gguf.GGMLQuantizationType.F32
         return super().tensor_force_quant(name, new_name, bid, n_dims)
 
+    def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
+        if name.endswith("view_seperator"):
+            data_torch = data_torch.unsqueeze(0)
+        yield from super().modify_tensors(data_torch, name, bid)
+
     @classmethod
     def filter_tensors(cls, item: tuple[str, Callable[[], Tensor]]) -> tuple[str, Callable[[], Tensor]] | None:
         name, gen = item
@@ -81,6 +90,33 @@ def filter_tensors(cls, item: tuple[str, Callable[[], Tensor]]) -> tuple[str, Ca
         return super().filter_tensors((name, gen))
 
 
+@ModelBase.register("DeepseekOCR2ForCausalLM")
+class DeepseekOCR2VisionModel(DeepseekOCRVisionModel):
+    def __init__(self, *args, **kwargs):
+        super().__init__(*args, **kwargs)
+        self.clip_projector_type = gguf.VisionProjectorType.DEEPSEEKOCR2
+
+    def set_gguf_parameters(self):
+        # the vision tower's qwen2 encoder is built from fixed defaults,
+        # see build_qwen2_decoder_as_encoder() in deepencoderv2.py
+        if self.hparams.get("patch_size") is None:
+            self.hparams["patch_size"] = 16
+        if self.hparams.get("intermediate_size") is None:
+            self.hparams["intermediate_size"] = 4864
+        if self.hparams.get("num_attention_heads") is None:
+            self.hparams["num_attention_heads"] = 14
+        super().set_gguf_parameters()
+        # qwen2 encoder is GQA: 14 Q heads, 2 KV heads
+        self.gguf_writer.add_vision_head_count_kv(2)
+
+    def get_vision_config(self) -> dict[str, Any]:
+        vision_config = super().get_vision_config()
+        vision_config['hidden_size'] = vision_config['width']['qwen2-0-5b']['dim']
+        if vision_config.get('layers') is None:
+            vision_config['layers'] = 24
+        return vision_config
+
+
 @ModelBase.register("DeepseekForCausalLM")
 class DeepseekModel(TextModel):
     model_arch = gguf.MODEL_ARCH.DEEPSEEK
@@ -188,13 +224,21 @@ def __init__(self, *args, **kwargs):
         self.origin_hf_arch = hparams.get('architectures', [None])[0]
 
         # special handling for Deepseek OCR
-        if self.origin_hf_arch == "DeepseekOCRForCausalLM":
+        if self.origin_hf_arch in ("DeepseekOCRForCausalLM", "DeepseekOCR2ForCausalLM"):
             self.model_arch = gguf.MODEL_ARCH.DEEPSEEK2OCR
             self.gguf_writer.arch = gguf.MODEL_ARCH_NAMES[self.model_arch]
             self.gguf_writer.add_architecture()
             # default jinja template
             self.gguf_writer.add_chat_template("{% for m in messages %}{{m['content']}}{% endfor %}")
 
+    @classmethod
+    def filter_tensors(cls, item: tuple[str, Callable[[], Tensor]]) -> tuple[str, Callable[[], Tensor]] | None:
+        name, _ = item
+        # DeepSeek-OCR vision encoder (SAM + DeepSeek-OCR-2 qwen2 tower)
+        if "sam_model" in name or "qwen2_model" in name:
+            return None
+        return super().filter_tensors(item)
+
     def set_vocab(self):
         try:
             self._set_vocab_gpt2()
diff --git a/gguf-py/gguf/constants.py b/gguf-py/gguf/constants.py
index 92578490cb3..5a567e2d159 100644
--- a/gguf-py/gguf/constants.py
+++ b/gguf-py/gguf/constants.py
@@ -812,6 +812,8 @@ class MODEL_TENSOR(IntEnum):
     V_SAM_NET_3          = auto() # Deepseek-OCR
     V_ENC_EMBD_IMGNL     = auto() # Deepseek-OCR
     V_ENC_EMBD_VSEP      = auto() # Deepseek-OCR
+    V_RESMPL_QUERY_768   = auto() # Deepseek-OCR-2
+    V_RESMPL_QUERY_1024  = auto() # Deepseek-OCR-2
 
     # audio (mtmd)
     A_ENC_EMBD_POS        = auto()
@@ -1329,6 +1331,8 @@ class MODEL_TENSOR(IntEnum):
     MODEL_TENSOR.V_SAM_NET_3:               "v.sam.net_3",
     MODEL_TENSOR.V_ENC_EMBD_IMGNL:          "v.image_newline", # Deepseek-OCR
     MODEL_TENSOR.V_ENC_EMBD_VSEP:           "v.view_seperator", # Deepseek-OCR
+    MODEL_TENSOR.V_RESMPL_QUERY_768:        "v.resample_query_768", # Deepseek-OCR-2 qwen2
+    MODEL_TENSOR.V_RESMPL_QUERY_1024:       "v.resample_query_1024", # Deepseek-OCR-2 qwen2
     # audio (mtmd)
     # note: all audio tensor names must use prefix "a." or "mm.a."
     MODEL_TENSOR.A_ENC_EMBD_POS:            "a.position_embd",
@@ -1507,6 +1511,8 @@ class MODEL_TENSOR(IntEnum):
         MODEL_TENSOR.V_SAM_NECK,
         MODEL_TENSOR.V_SAM_NET_2,
         MODEL_TENSOR.V_SAM_NET_3,
+        MODEL_TENSOR.V_RESMPL_QUERY_768,
+        MODEL_TENSOR.V_RESMPL_QUERY_1024,
         # audio
         MODEL_TENSOR.A_ENC_EMBD_POS,
         MODEL_TENSOR.A_ENC_EMBD_NORM,
@@ -4329,6 +4335,7 @@ class VisionProjectorType:
     JANUS_PRO = "janus_pro"
     DOTSOCR = "dots_ocr"
     DEEPSEEKOCR = "deepseekocr"
+    DEEPSEEKOCR2 = "deepseekocr2"
     LFM2A = "lfm2a" # audio
     MUSIC_FLAMINGO = "musicflamingo" # audio
     GLM4V = "glm4v"
diff --git a/gguf-py/gguf/tensor_mapping.py b/gguf-py/gguf/tensor_mapping.py
index ecc3c05f99a..444f0f2855a 100644
--- a/gguf-py/gguf/tensor_mapping.py
+++ b/gguf-py/gguf/tensor_mapping.py
@@ -1485,6 +1485,7 @@ class TensorNameMap:
             "siglip2.vision_model.encoder.layers.{bid}.self_attn.q_proj", # youtuvl
             "model.vision_model.transformer.layers.{bid}.self_attn.q_proj", # Deepseek-OCR CLIP, generated
             "vision_model.model.layers.{bid}.self_attn.q_proj.linear", # gemma4
+            "model.qwen2_model.model.model.layers.{bid}.self_attn.q_proj" # Deepseek-OCR-2 qwen2
         ),
 
         MODEL_TENSOR.V_ENC_ATTN_Q_NORM: (
@@ -1509,6 +1510,7 @@ class TensorNameMap:
             "model.vision_model.transformer.layers.{bid}.self_attn.k_proj", # Deepseek-OCR CLIP, generated
             "siglip2.vision_model.encoder.layers.{bid}.self_attn.k_proj",
             "vision_model.model.layers.{bid}.self_attn.k_proj.linear", # gemma4
+            "model.qwen2_model.model.model.layers.{bid}.self_attn.k_proj" # Deepseek-OCR-2 qwen2
         ),
 
         MODEL_TENSOR.V_ENC_ATTN_K_NORM: (
@@ -1533,6 +1535,7 @@ class TensorNameMap:
             "siglip2.vision_model.encoder.layers.{bid}.self_attn.v_proj",
             "model.vision_model.transformer.layers.{bid}.self_attn.v_proj", # Deepseek-OCR CLIP, generated
             "vision_model.model.layers.{bid}.self_attn.v_proj.linear", # gemma4
+            "model.qwen2_model.model.model.layers.{bid}.self_attn.v_proj" # Deepseek-OCR-2 qwen2
         ),
 
         MODEL_TENSOR.V_ENC_INPUT_NORM: (
@@ -1554,6 +1557,7 @@ class TensorNameMap:
             "vision_model.radio_model.model.blocks.{bid}.norm1", # Nemotron Nano v2 VL
             "vision_tower.blocks.{bid}.norm1", # dots.ocr
             "vision_model.transformer.resblocks.{bid}.ln_1", # Step3-VL
+            "model.qwen2_model.model.model.layers.{bid}.input_layernorm", # Deepseek-OCR-2 qwen2
         ),
 
         MODEL_TENSOR.V_ENC_ATTN_O: (
@@ -1574,6 +1578,7 @@ class TensorNameMap:
             "model.vision_model.transformer.layers.{bid}.self_attn.out_proj", # Deepseek-OCR CLIP
             "siglip2.vision_model.encoder.layers.{bid}.self_attn.out_proj", # youtuvl
             "vision_model.radio_model.model.blocks.{bid}.attn.proj", # Nemotron Nano v2 VL
+            "model.qwen2_model.model.model.layers.{bid}.self_attn.o_proj", # Deepseek-OCR-2 qwen2
             "vision_model.model.layers.{bid}.self_attn.o_proj.linear", # gemma4
             "vision_tower.blocks.{bid}.attn.proj", # dots.ocr
             "vision_model.transformer.resblocks.{bid}.attn.out_proj", # Step3-VL
@@ -1603,6 +1608,7 @@ class TensorNameMap:
             "vision_model.model.layers.{bid}.pre_feedforward_layernorm", # gemma4
             "vision_tower.blocks.{bid}.norm2", # dots.ocr
             "vision_model.transformer.resblocks.{bid}.ln_2", # Step3-VL
+            "model.qwen2_model.model.model.layers.{bid}.post_attention_layernorm", # Deepseek-OCR-2 qwen2
         ),
 
         MODEL_TENSOR.V_ENC_FFN_UP: (
@@ -1625,6 +1631,7 @@ class TensorNameMap:
             "vision_model.radio_model.model.blocks.{bid}.mlp.fc1", # Nemotron Nano v2 VL
             "vision_model.model.layers.{bid}.mlp.up_proj", # gemma4
             "vision_model.transformer.resblocks.{bid}.mlp.c_fc", # Step3-VL
+            "model.qwen2_model.model.model.layers.{bid}.mlp.up_proj", # Deepseek-OCR-2 qwen2
         ),
 
         MODEL_TENSOR.V_ENC_FFN_GATE: (
@@ -1632,6 +1639,7 @@ class TensorNameMap:
             "vision_encoder.transformer.layers.{bid}.feed_forward.w1", # pixtral
             "visual.blocks.{bid}.mlp.gate_proj", # qwen2.5vl
             "vision_model.model.layers.{bid}.mlp.gate_proj", # gemma4
+            "model.qwen2_model.model.model.layers.{bid}.mlp.gate_proj", # Deepseek-OCR-2 qwen2
         ),
 
         MODEL_TENSOR.V_ENC_FFN_DOWN: (
@@ -1652,6 +1660,7 @@ class TensorNameMap:
             "model.vision_model.transformer.layers.{bid}.mlp.fc2", # Deepseek-OCR CLIP
             "siglip2.vision_model.encoder.layers.{bid}.mlp.fc2",
             "vision_model.radio_model.model.blocks.{bid}.mlp.fc2", # Nemotron Nano v2 VL
+            "model.qwen2_model.model.model.layers.{bid}.mlp.down_proj" , # Deepseek-OCR-2 qwen2
             "vision_model.model.layers.{bid}.mlp.down_proj", # gemma4
             "vision_model.transformer.resblocks.{bid}.mlp.c_proj", # Step3-VL
         ),
@@ -1699,6 +1708,7 @@ class TensorNameMap:
             "vision_tower.encoder.final_layernorm", # kimi-vl
             "visual.post_layernorm", # glm4v
             "siglip2.vision_model.post_layernorm",
+            "model.qwen2_model.model.model.norm", # Deepseek-OCR-2 qwen2
         ),
 
         MODEL_TENSOR.V_MM_POST_NORM: (
@@ -1879,6 +1889,14 @@ class TensorNameMap:
             "model.sam_model.net_3",
         ),
 
+        MODEL_TENSOR.V_RESMPL_QUERY_768: (
+            "model.qwen2_model.query_768", # Deepseek-OCR-2 qwen2
+        ),
+
+        MODEL_TENSOR.V_RESMPL_QUERY_1024: (
+            "model.qwen2_model.query_1024", # Deepseek-OCR-2 qwen2
+        ),
+
         MODEL_TENSOR.V_MM_POST_FC_NORM: (
             "model.vision.linear_proj.norm1", # cogvlm
         ),
diff --git a/tools/mtmd/CMakeLists.txt b/tools/mtmd/CMakeLists.txt
index ffd30c7e6a1..14808d4221d 100644
--- a/tools/mtmd/CMakeLists.txt
+++ b/tools/mtmd/CMakeLists.txt
@@ -40,6 +40,7 @@ add_library(mtmd
             models/siglip.cpp
             models/whisper-enc.cpp
             models/deepseekocr.cpp
+            models/deepseekocr2.cpp
             models/mobilenetv5.cpp
             models/youtuvl.cpp
             models/yasa2.cpp
diff --git a/tools/mtmd/clip-impl.h b/tools/mtmd/clip-impl.h
index ef4c342ba86..14398dc4869 100644
--- a/tools/mtmd/clip-impl.h
+++ b/tools/mtmd/clip-impl.h
@@ -188,6 +188,8 @@
 #define TN_SAM_FFN_DOWN   "v.sam.blk.%d.mlp.lin2.%s"
 #define TN_SAM_NECK       "v.sam.neck.%d.%s"
 #define TN_SAM_NET        "v.sam.net_%d.%s"
+// deepseek-ocr-2
+#define TN_RESMPL_QUERY  "v.resample_query_%d.%s"
 // (conformer) lfm2
 #define TN_PRE_ENCODE_OUT  "a.pre_encode.out.%s"
 #define TN_FFN_NORM        "%s.blk.%d.ffn_norm.%s"
@@ -337,6 +339,7 @@ enum projector_type {
     PROJECTOR_TYPE_JANUS_PRO,
     PROJECTOR_TYPE_DOTS_OCR,
     PROJECTOR_TYPE_DEEPSEEKOCR,
+    PROJECTOR_TYPE_DEEPSEEKOCR2,
     PROJECTOR_TYPE_LFM2A,
     PROJECTOR_TYPE_GLM4V,
     PROJECTOR_TYPE_YOUTUVL,
@@ -386,6 +389,7 @@ static std::map<projector_type, std::string> PROJECTOR_TYPE_NAMES = {
     { PROJECTOR_TYPE_JANUS_PRO, "janus_pro"},
     { PROJECTOR_TYPE_DOTS_OCR,  "dots_ocr"},
     { PROJECTOR_TYPE_DEEPSEEKOCR,"deepseekocr"},
+    { PROJECTOR_TYPE_DEEPSEEKOCR2,"deepseekocr2"},
     { PROJECTOR_TYPE_LFM2A,     "lfm2a"},
     { PROJECTOR_TYPE_GLM4V,     "glm4v"},
     { PROJECTOR_TYPE_YOUTUVL,   "youtuvl"},
@@ -424,6 +428,9 @@ struct clip_image_f32 {
     int ny;
 
     std::vector<float> buf;
+
+    // marks the global view (e.g., in DeepSeek-OCR models)
+    bool add_viewsep = false;
 };
 
 //
diff --git a/tools/mtmd/clip-model.h b/tools/mtmd/clip-model.h
index e0de41e0b5b..1f3657a8507 100644
--- a/tools/mtmd/clip-model.h
+++ b/tools/mtmd/clip-model.h
@@ -542,6 +542,11 @@ struct clip_model {
     int32_t n_sam_layers = 12; // used by deepseek-ocr sam encoder
 
     std::vector<clip_layer> sam_layers;
+
+    // deepseek-ocr-2
+    ggml_tensor * resample_query_768 = nullptr;
+    ggml_tensor * resample_query_1024 = nullptr;
+
     // lfm2 audio
     std::array<ggml_tensor *, 7> pre_encode_conv_X_w = {nullptr};
     std::array<ggml_tensor *, 7> pre_encode_conv_X_b = {nullptr};
diff --git a/tools/mtmd/clip.cpp b/tools/mtmd/clip.cpp
index a7aa297c598..7bb702b95c4 100644
--- a/tools/mtmd/clip.cpp
+++ b/tools/mtmd/clip.cpp
@@ -953,6 +953,10 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32
             {
                 builder = std::make_unique<clip_graph_deepseekocr>(ctx, img);
             } break;
+        case PROJECTOR_TYPE_DEEPSEEKOCR2:
+             {
+                builder = std::make_unique<clip_graph_deepseekocr2>(ctx, img);
+            } break;
         case PROJECTOR_TYPE_LFM2A:
             {
                 builder = std::make_unique<clip_graph_conformer>(ctx, img);
@@ -1514,6 +1518,7 @@ struct clip_model_loader {
                         hparams.set_warmup_n_tokens(28*28); // avoid OOM on warmup
                     } break;
                 case PROJECTOR_TYPE_DEEPSEEKOCR:
+                case PROJECTOR_TYPE_DEEPSEEKOCR2:
                     {
                         hparams.patch_size = 16;
                         hparams.image_size = 1024;
@@ -1525,6 +1530,10 @@ struct clip_model_loader {
                         get_u32(KEY_SAM_N_HEAD, hparams.sam_n_head, true);
                         get_u32(KEY_SAM_N_EMBD, hparams.sam_n_embd, true);
                         get_u32(KEY_ATTN_WINDOW_SIZE, hparams.attn_window_size, true);
+                        if (model.proj_type == PROJECTOR_TYPE_DEEPSEEKOCR2) {
+                            // qwen2 encoder is GQA, requires KEY_N_HEAD_KV
+                            get_u32(string_format(KEY_N_HEAD_KV, "vision"), hparams.n_head_kv);
+                        }
                      } break;
                 case PROJECTOR_TYPE_HUNYUANVL:
                     {
@@ -2374,6 +2383,7 @@ struct clip_model_loader {
                     model.mm_2_b = get_tensor(string_format(TN_LLAVA_PROJ, 2, "bias"));
                 } break;
             case PROJECTOR_TYPE_DEEPSEEKOCR:
+            case PROJECTOR_TYPE_DEEPSEEKOCR2:
                 {
                     model.pos_embed          = get_tensor(string_format(TN_SAM_POS_EMBD,   "weight"));
                     model.patch_embed_proj_w = get_tensor(string_format(TN_SAM_PATCH_EMBD, "weight"));
@@ -2404,10 +2414,12 @@ struct clip_model_loader {
                     model.neck_3_w       = get_tensor(string_format(TN_SAM_NECK, 3, "weight"));
                     model.net_2          = get_tensor(string_format(TN_SAM_NET, 2, "weight"));
                     model.net_3          = get_tensor(string_format(TN_SAM_NET, 3, "weight"));
-                    model.image_newline  = get_tensor(TN_IMAGE_NEWLINE);
+                    model.image_newline  = get_tensor(TN_IMAGE_NEWLINE, false);
                     model.view_seperator = get_tensor(TN_IMAGE_SEPERATOR);
                     model.mm_fc_w        = get_tensor(string_format(TN_MM_PROJECTOR, "weight"));
                     model.mm_fc_b        = get_tensor(string_format(TN_MM_PROJECTOR, "bias"));
+                    model.resample_query_768  = get_tensor(string_format(TN_RESMPL_QUERY, 768, "weight"), false);
+                    model.resample_query_1024 = get_tensor(string_format(TN_RESMPL_QUERY, 1024, "weight"), false);
                  } break;
             case PROJECTOR_TYPE_GEMMA4A:
                 {
@@ -3277,7 +3289,7 @@ int clip_n_output_tokens(const struct clip_ctx * ctx, struct clip_image_f32 * im
         case PROJECTOR_TYPE_DEEPSEEKOCR:
         {
             // SAM encoder applies two stride-2 convolutions (net_2 and net_3)
-            // which reduces spatial dimensions by 4x in each direction (16x total)
+            // that reduce spatial dimensions by 4x in each direction (16x total)
             // E.g., 64x64 -> 16x16 patches
             n_patches /= 16;
 
@@ -3293,6 +3305,15 @@ int clip_n_output_tokens(const struct clip_ctx * ctx, struct clip_image_f32 * im
                 int oh = (img->ny / patch_size) / merge;
                 n_patches = (ow + 1) * oh + 2;
             } break;
+        case PROJECTOR_TYPE_DEEPSEEKOCR2:
+        {
+            // 1024 global view -> 256 query tokens + 1 view separator = 257;
+            // 768 local tile   -> 144 query tokens, no separator.
+            n_patches /= 16;
+            if (img->add_viewsep) {
+                n_patches += 1; // view separator, appended only after the global view
+            }
+        } break;
         case PROJECTOR_TYPE_LFM2A:
             {
                 n_patches = ((((img->nx + 1) / 2) + 1) / 2 + 1) / 2;
@@ -3882,6 +3903,7 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima
                 set_input_i32("pos_y", pos_y);
             } break;
         case PROJECTOR_TYPE_DEEPSEEKOCR:
+        case PROJECTOR_TYPE_DEEPSEEKOCR2:
             {
                 GGML_ASSERT(pos_w == pos_h);
 
@@ -3904,6 +3926,34 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima
 
                 set_input_i32("rel_pos_indices_local", rel_pos_indices_local);
                 set_input_i32("rel_pos_indices_global", rel_pos_indices_global);
+
+                if (ctx->proj_type() == PROJECTOR_TYPE_DEEPSEEKOCR2) {
+
+                    // qwen2 encoder attention mask
+
+                    // num_image_tokens = num_patches / 16
+                    //   256 for 1024 global view
+                    //   144 for 768 tile views
+                    const int   num_image_tokens = num_patches / 16;
+                    const int   seq_len          = num_image_tokens * 2;
+                    std::vector qwen2_mask(static_cast<size_t>(seq_len) * seq_len, 0.0f);
+
+                    // attention mask layout
+                    //  +--------------+---------------+
+                    //  |    all 0     |   all -inf    |
+                    //  +--------------+---------------+
+                    //  |    all 0     |  lower tri 0  |
+                    //  +--------------+---------------+
+                    for (int i = 0; i < seq_len; i++) {
+                        for (int j = 0; j < seq_len; j++) {
+                            const bool zero = i < num_image_tokens ?
+                                                     j < num_image_tokens :
+                                                     j < num_image_tokens || j <= i;
+                            qwen2_mask[static_cast<size_t>(i) * seq_len + j] = zero ? 0.0f : -1e9f;
+                        }
+                    }
+                    set_input_f32("qwen2_attn_mask", qwen2_mask);
+                }
             } break;
         case PROJECTOR_TYPE_GEMMA3:
         case PROJECTOR_TYPE_GEMMA3NV:
@@ -4256,6 +4306,7 @@ int clip_n_mmproj_embd(const struct clip_ctx * ctx) {
         case PROJECTOR_TYPE_COGVLM:
             return ctx->model.mm_4h_to_h_w->ne[1];
         case PROJECTOR_TYPE_DEEPSEEKOCR:
+        case PROJECTOR_TYPE_DEEPSEEKOCR2:
             return ctx->model.mm_fc_w->ne[1];
         case PROJECTOR_TYPE_LFM2A:
             return ctx->model.position_embeddings->ne[0];
diff --git a/tools/mtmd/models/deepseekocr.cpp b/tools/mtmd/models/deepseekocr.cpp
index 8419d496a5b..c3c22d0a4ba 100644
--- a/tools/mtmd/models/deepseekocr.cpp
+++ b/tools/mtmd/models/deepseekocr.cpp
@@ -157,7 +157,6 @@ ggml_tensor * clip_graph_deepseekocr::build_sam(ggml_tensor * inp_raw) {
 
             cur = ggml_mul_mat(ctx0, layer.qkv_w, cur);
             cur = ggml_add(ctx0, cur, layer.qkv_b);
-            cur = ggml_cont(ctx0, cur); // Ensure tensor is contiguous before reshape
             cur = ggml_reshape_4d(ctx0, cur, n_embd, 3, W * H, B);
 
             ggml_tensor * Q;
@@ -251,17 +250,17 @@ ggml_cgraph * clip_graph_deepseekocr::build() {
     ggml_tensor * inp_raw = build_inp_raw();
     ggml_tensor * sam_out = build_sam(inp_raw);
 
+    const int clip_n_patches = sam_out->ne[0] * sam_out->ne[1];
+
     ggml_tensor * clip_out;
     // Building DS-OCR CLIP
     {
         ggml_tensor * inp;
 
-        inp = ggml_cpy(ctx0, sam_out, ggml_dup_tensor(ctx0, sam_out));
-        inp = ggml_reshape_2d(ctx0, inp, inp->ne[0] * inp->ne[1], inp->ne[2]);
+        inp = ggml_reshape_2d(ctx0, sam_out, clip_n_patches, sam_out->ne[2]);
         inp = ggml_cont(ctx0, ggml_permute(ctx0, inp, 1, 0, 2, 3));
 
-        ggml_tensor * new_pos_embd =
-            ggml_cpy(ctx0, model.position_embeddings, ggml_dup_tensor(ctx0, model.position_embeddings));
+        ggml_tensor * new_pos_embd = model.position_embeddings;
 
         int        n_pos    = new_pos_embd->ne[1];  // +1 for [CLS]
         const auto tgt_size = static_cast<int>(std::sqrt(inp->ne[1]));
@@ -295,16 +294,12 @@ ggml_cgraph * clip_graph_deepseekocr::build() {
         clip_out = cur;
     }
 
-    const int clip_n_patches = sam_out->ne[0] * sam_out->ne[1];
-
     sam_out  = ggml_cont(ctx0, ggml_permute(ctx0, sam_out, 1, 2, 0, 3));
     sam_out  = ggml_reshape_2d(ctx0, sam_out, sam_out->ne[0], clip_n_patches);
     clip_out = ggml_view_2d(ctx0, clip_out, n_embd, clip_n_patches, clip_out->nb[1], clip_out->nb[1]);
 
     ggml_tensor * cur;
     cur = ggml_concat(ctx0, clip_out, sam_out, 0);
-    cur = ggml_reshape_2d(ctx0, cur, 2 * n_embd, clip_n_patches);
-    cur = ggml_cont(ctx0, cur);
     cur = ggml_mul_mat(ctx0, model.mm_fc_w, cur);
     cur = ggml_add(ctx0, cur, model.mm_fc_b);
 
@@ -313,13 +308,11 @@ ggml_cgraph * clip_graph_deepseekocr::build() {
     const auto n_dim = cur->ne[0];
 
     ggml_tensor * imgnl;
-    ggml_tensor * vs;
 
     imgnl = ggml_repeat_4d(ctx0, model.image_newline, n_dim, 1, h, 1);
-    vs    = ggml_reshape_2d(ctx0, model.view_seperator, n_dim, 1);  // (n_dim, 1)
     cur   = ggml_reshape_3d(ctx0, cur, n_dim, w, h);
     cur   = ggml_reshape_2d(ctx0, ggml_concat(ctx0, cur, imgnl, 1), n_dim, (w + 1) * h);
-    cur   = ggml_concat(ctx0, cur, vs, 1);  // (n_dim, h*(w+1) + 1)
+    cur   = ggml_concat(ctx0, cur, model.view_seperator, 1);  // (n_dim, h*(w+1) + 1)
 
     cb(cur, "dsocr_output", -1);
 
diff --git a/tools/mtmd/models/deepseekocr2.cpp b/tools/mtmd/models/deepseekocr2.cpp
new file mode 100644
index 00000000000..056bb81807f
--- /dev/null
+++ b/tools/mtmd/models/deepseekocr2.cpp
@@ -0,0 +1,81 @@
+#include "models.h"
+
+ggml_cgraph * clip_graph_deepseekocr2::build() { // graph: build_sam() -> Qwen2 encoder over [image tokens, queries] -> fc projection
+    GGML_ASSERT(hparams.n_head_kv > 0);
+    GGML_ASSERT(n_head % hparams.n_head_kv == 0); // n_head must be a multiple of n_head_kv
+
+    // patch embedding
+    ggml_tensor * inp_raw = build_inp_raw();
+
+    ggml_tensor * sam_out = build_sam(inp_raw);
+
+    ggml_tensor * qwen2_out;
+    // Building Qwen2 encoder
+    {
+        ggml_tensor * inp;
+
+        inp = ggml_reshape_2d(ctx0, sam_out, sam_out->ne[0] * sam_out->ne[1], sam_out->ne[2]); // H*W, C
+        inp = ggml_cont(ctx0, ggml_permute(ctx0, inp, 1, 0, 2, 3));
+
+        auto num_image_tokens = inp->ne[1]; // H*W
+        GGML_ASSERT(num_image_tokens == 144 || num_image_tokens == 256); // only these two sizes have a resample_query_* table below
+
+        // query based on numbers of image tokens (in SAM output)
+        // 16x16 -> query_1024 (1024x1024 images)
+        // 12x12 -> query_768 (768x768 images)
+
+        ggml_tensor * query_embed = model.resample_query_1024; // default: the 256-token (16x16) case
+        int           num_queries = 256;
+
+        if (num_image_tokens == 144) {
+            query_embed = model.resample_query_768;
+            num_queries = 144;
+        }
+
+        // (B, num_image_tokens + num_queries, C)
+        inp = ggml_concat(ctx0, inp, ggml_cast(ctx0, query_embed, inp->type), 1); // queries are appended after the image tokens
+
+        auto seq_len = inp->ne[1];
+
+        // qwen2 encoder attention mask
+        ggml_tensor * attn_mask = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, seq_len, seq_len);
+        ggml_set_name(attn_mask, "qwen2_attn_mask"); // graph input; its values are supplied elsewhere under this name
+        ggml_set_input(attn_mask);
+
+        ggml_tensor * inp_pos = ggml_cast(ctx0, ggml_arange(ctx0, 0, seq_len, 1), GGML_TYPE_I32); // positions 0..seq_len-1
+
+        auto add_rope = [&](ggml_tensor * x, const clip_layer &) {
+            return ggml_rope_ext(ctx0, x, inp_pos, nullptr, d_head,
+                                 GGML_ROPE_TYPE_NEOX, 131072, 1000000, 1, 0, 1, 0, 0); // NOTE(review): presumably n_ctx_orig, freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow -- confirm against ggml.h
+        };
+
+        build_vit_opts vit_opts;
+        vit_opts.attn_mask = attn_mask;
+
+        // build_vit applies model.post_ln_w internally; do not re-apply
+        ggml_tensor * cur = build_vit(inp, seq_len, NORM_TYPE_RMS, FFN_SILU,
+                                      /* learned_pos_embd */ nullptr, add_rope, vit_opts);
+
+        cur = ggml_cont(ctx0,
+                        ggml_view_2d(ctx0, cur, cur->ne[0], num_queries, cur->nb[1],
+                                     cur->nb[1] * (cur->ne[1] - num_queries))); // only take query tokens for output
+
+        ggml_build_forward_expand(gf, cur);
+        qwen2_out = cur;
+    }
+
+    ggml_tensor * cur;
+
+    cur = ggml_mul_mat(ctx0, model.mm_fc_w, qwen2_out); // fc projection: mm_fc_w * x + mm_fc_b
+    cur = ggml_add(ctx0, cur, model.mm_fc_b);
+
+    // view_seperator only after the global view
+    if (img.add_viewsep) {
+        cur = ggml_concat(ctx0, cur, model.view_seperator, 1); // (n_dim, 257)
+    }
+
+    cb(cur, "dsocr2_output", -1);
+
+    ggml_build_forward_expand(gf, cur);
+    return gf;
+}
diff --git a/tools/mtmd/models/models.h b/tools/mtmd/models/models.h
index 119c2d541b5..a856882c275 100644
--- a/tools/mtmd/models/models.h
+++ b/tools/mtmd/models/models.h
@@ -121,6 +121,11 @@ struct clip_graph_deepseekocr : clip_graph {
     ggml_tensor * build_sam(ggml_tensor * inp); // build the SAM model
 };
 
+struct clip_graph_deepseekocr2 : clip_graph_deepseekocr { // DeepSeek-OCR-2 graph builder, derived from the DeepSeek-OCR one
+    clip_graph_deepseekocr2(clip_ctx * ctx, const clip_image_f32 & img) : clip_graph_deepseekocr(ctx, img) {}
+    ggml_cgraph * build() override; // reuses build_sam() from base
+};
+
 struct clip_graph_conformer : clip_graph {
     clip_graph_conformer(clip_ctx * ctx, const clip_image_f32 & img) : clip_graph(ctx, img) {}
     ggml_cgraph * build() override;
diff --git a/tools/mtmd/mtmd-image.cpp b/tools/mtmd/mtmd-image.cpp
index 37c271d18a8..caf72d53621 100644
--- a/tools/mtmd/mtmd-image.cpp
+++ b/tools/mtmd/mtmd-image.cpp
@@ -1137,6 +1137,105 @@ bool mtmd_image_preprocessor_deepseekocr::preprocess(const clip_image_u8 & img,
     return true;
 }
 
+//
+// mtmd_image_preprocessor_deepseekocr2
+//
+
+// candidate tile grids (cols, rows) with min_tiles <= cols*rows <= max_tiles,
+// returned in ascending order of tile count (cols*rows)
+std::vector<clip_image_size> mtmd_image_preprocessor_deepseekocr2::get_target_ratios() {
+    std::vector<clip_image_size> ratios;
+    for (int n = min_tiles; n <= max_tiles; n++) { // n only bounds w and h; the tile-count filter below does not use it
+        for (int w = 1; w <= n; w++) {
+            for (int h = 1; h <= n; h++) {
+                if (w * h < min_tiles || w * h > max_tiles) {
+                    continue;
+                }
+                bool found = false; // the same (w, h) is visited for several n, so skip grids already collected
+                for (const auto & r : ratios) {
+                    if (r.width == w && r.height == h) {
+                        found = true;
+                        break;
+                    }
+                }
+                if (!found) {
+                    ratios.push_back({ w, h });
+                }
+            }
+        }
+    }
+    std::sort(ratios.begin(), ratios.end(), [](const clip_image_size & a, const clip_image_size & b) {
+        return a.width * a.height < b.width * b.height;
+    }); // std::sort is not stable: grids with the same tile count may end up in any relative order
+    return ratios;
+}
+
+// pick the candidate grid whose aspect ratio (width / height) is closest to aspect_ratio.
+// on an exact tie, a later candidate replaces the pick only if the image area exceeds half of that grid's pixel area
+clip_image_size mtmd_image_preprocessor_deepseekocr2::find_closest_aspect_ratio(
+    float                                aspect_ratio,
+    const std::vector<clip_image_size> & target_ratios,
+    int                                  width,
+    int                                  height) {
+    float           best_ratio_diff = std::numeric_limits<float>::max();
+    clip_image_size best_ratio      = { 1, 1 }; // returned unchanged when target_ratios is empty
+    const float     area            = static_cast<float>(width) * static_cast<float>(height); // multiply as float: int width * height overflows for huge images
+
+    for (const auto & ratio : target_ratios) {
+        const float target_aspect_ratio = static_cast<float>(ratio.width) / ratio.height;
+        const float ratio_diff          = std::abs(aspect_ratio - target_aspect_ratio);
+        if (ratio_diff < best_ratio_diff) {
+            best_ratio_diff = ratio_diff;
+            best_ratio      = ratio;
+        } else if (ratio_diff == best_ratio_diff) { // NOTE(review): exact float compare looks deliberate (tie-break only) -- confirm
+            const float target_area = static_cast<float>(tile_size * tile_size * ratio.width * ratio.height);
+            if (area > 0.5f * target_area) {
+                best_ratio = ratio;
+            }
+        }
+    }
+    return best_ratio;
+}
+
+bool mtmd_image_preprocessor_deepseekocr2::preprocess(const clip_image_u8 & img, clip_image_f32_batch & output) {
+    // emit 768x768 local tiles when the image is larger than a tile in either
+    // dimension, then always a 1024x1024 global view. order: [tiles..., global].
+
+    if (img.nx > tile_size || img.ny > tile_size) { // tiles are skipped only when both dimensions fit in one tile
+        const float           aspect_ratio  = static_cast<float>(img.nx) / img.ny; // NOTE(review): assumes img.ny != 0 -- confirm callers guarantee it
+        const auto            target_ratios = get_target_ratios(); // rebuilt on every call
+        const clip_image_size grid          = find_closest_aspect_ratio(aspect_ratio, target_ratios, img.nx, img.ny);
+
+        // stretch onto the grid (no aspect preserve), then crop tiles row-major.
+        clip_image_u8 refined;
+        img_tool::resize(img, refined, { tile_size * grid.width, tile_size * grid.height },
+                         RESIZE_ALGO_BICUBIC_PILLOW, PAD_NONE);
+
+        for (int row = 0; row < grid.height; row++) {
+            for (int col = 0; col < grid.width; col++) {
+                clip_image_u8 tile;
+                img_tool::crop(refined, tile, col * tile_size, row * tile_size, tile_size, tile_size);
+                clip_image_f32_ptr res(clip_image_f32_init());
+                img_u8_to_f32(tile, *res, hparams.image_mean, hparams.image_std);
+                output.entries.push_back(std::move(res)); // tiles never set add_viewsep
+            }
+        }
+    }
+
+    // global view: aspect-preserving fit-and-pad to base_size.
+    clip_image_u8 padded;
+    img_tool::resize(img, padded, { base_size, base_size }, RESIZE_ALGO_BICUBIC_PILLOW,
+                     PAD_NEAREST, hparams.image_pad_color);
+    clip_image_f32_ptr global(clip_image_f32_init());
+    img_u8_to_f32(padded, *global, hparams.image_mean, hparams.image_std);
+    global->add_viewsep = true; // marks the only entry that should get the view separator appended
+    output.entries.push_back(std::move(global));
+
+    output.grid_x = 1; // NOTE(review): reported as 1x1 even when tiles were emitted -- presumably intentional, confirm with consumers of grid_x/grid_y
+    output.grid_y = 1;
+    return true;
+}
+
 //
 // mtmd_image_preprocessor_step3vl
 //
diff --git a/tools/mtmd/mtmd-image.h b/tools/mtmd/mtmd-image.h
index 08129a08ed5..91a5bc253ef 100644
--- a/tools/mtmd/mtmd-image.h
+++ b/tools/mtmd/mtmd-image.h
@@ -144,6 +144,26 @@ struct mtmd_image_preprocessor_deepseekocr : mtmd_image_preprocessor {
     bool preprocess(const clip_image_u8 & img, clip_image_f32_batch & output) override;
 };
 
+// DeepSeek-OCR-2: a 1024x1024 global view, plus InternVL-style 768x768 local
+// tiles when the image is larger than a tile in either dimension.
+struct mtmd_image_preprocessor_deepseekocr2 : mtmd_image_preprocessor {
+    static constexpr int base_size = 1024; // global view
+    static constexpr int tile_size = 768;  // local tile
+    static constexpr int min_tiles = 2;    // lower bound on cols*rows of a candidate tile grid
+    static constexpr int max_tiles = 6;    // upper bound on cols*rows of a candidate tile grid
+
+    mtmd_image_preprocessor_deepseekocr2(const clip_ctx * ctx) : mtmd_image_preprocessor(ctx) {}
+    bool preprocess(const clip_image_u8 & img, clip_image_f32_batch & output) override; // appends entries to output: [tiles..., global]
+
+private:
+    static std::vector<clip_image_size> get_target_ratios(); // candidate (cols, rows) grids, sorted by tile count
+    static clip_image_size              find_closest_aspect_ratio( // grid whose cols/rows ratio is closest to aspect_ratio
+        float                                aspect_ratio,
+        const std::vector<clip_image_size> & target_ratios,
+        int                                  width,
+        int                                  height);
+};
+
 // custom image preprocessing for Step3VL
 // ref: https://huggingface.co/stepfun-ai/Step3-VL-10B/blob/main/processing_step3.py
 struct mtmd_image_preprocessor_step3vl : mtmd_image_preprocessor_llava_uhd {
diff --git a/tools/mtmd/mtmd.cpp b/tools/mtmd/mtmd.cpp
index 63b7e4d052a..b3401634fd6 100644
--- a/tools/mtmd/mtmd.cpp
+++ b/tools/mtmd/mtmd.cpp
@@ -493,6 +493,11 @@ struct mtmd_context {
                     img_end = "\n"; // prevent empty batch on llama-server
                     image_preproc = std::make_unique<mtmd_image_preprocessor_deepseekocr>(ctx_v);
                 } break;
+            case PROJECTOR_TYPE_DEEPSEEKOCR2:
+                {
+                    img_end = "\n"; // prevent empty batch on llama-server
+                    image_preproc = std::make_unique<mtmd_image_preprocessor_deepseekocr2>(ctx_v);
+                } break;
             case PROJECTOR_TYPE_HUNYUANVL:
                 {
                     // note: these use fullwidth ｜ (U+FF5C) and ▁ (U+2581) to match the tokenizer vocabulary
@@ -1091,16 +1096,21 @@ int32_t mtmd_encode(mtmd_context * ctx, const mtmd_image_tokens * image_tokens)
     if (clip_is_llava(ctx_clip)
         || proj_type == PROJECTOR_TYPE_MINICPMV
         || proj_type == PROJECTOR_TYPE_GLM_EDGE
-        || proj_type == PROJECTOR_TYPE_INTERNVL) {
+        || proj_type == PROJECTOR_TYPE_INTERNVL
+        || proj_type == PROJECTOR_TYPE_DEEPSEEKOCR2) {
         // TODO @ngxson : llava does not support batched encoding ; this should be fixed inside clip_image_batch_encode()
         const auto & entries = image_tokens->batch_f32.entries;
+        // entries may have different token counts, so track a running output offset
+        // e.g., DeepSeek-OCR-2: 144 per tile view, 257 for the global view
+        size_t offset = 0;
         for (size_t i = 0; i < entries.size(); i++) {
             int n_tokens_per_image = clip_n_output_tokens(ctx_clip, entries[i].get());
             ok = clip_image_encode(
                 ctx_clip,
                 ctx->n_threads,
                 entries[i].get(),
-                ctx->image_embd_v.data() + i*n_mmproj_embd*n_tokens_per_image);
+                ctx->image_embd_v.data() + offset);
+            offset += static_cast<size_t>(n_mmproj_embd) * n_tokens_per_image;
         }
     } else {
         ok = clip_image_batch_encode(
diff --git a/tools/mtmd/tests/test-deepseek-ocr.py b/tools/mtmd/tests/test-deepseek-ocr.py
index 5c1980271b8..5f5fef765a6 100644
--- a/tools/mtmd/tests/test-deepseek-ocr.py
+++ b/tools/mtmd/tests/test-deepseek-ocr.py
@@ -3,7 +3,7 @@
 Evaluates llama.cpp's DeepSeek-OCR by comparing its output for a test
 image to the actual text in part of that image.
 
-Runs the test image through mtmd-cli, calculates CER and chrF for
+Runs each test image through mtmd-cli, calculates CER and chrF for
 its output, and holds them against the HF model's scores.
 """
 
@@ -12,24 +12,81 @@
 import subprocess
 import sys
 import unicodedata
+from dataclasses import dataclass
 from pathlib import Path
 
 logger = logging.getLogger("deepseek-ocr-test")
 
-DEFAULT_IMAGE = "test-1.jpeg"
-DEFAULT_EXPECTED_TEXT = "test-1-ground-truth.txt"
 RUN_TIMEOUT = 300
 
-# DeepSeek-OCR reference scores on the test image.
-# This is the baseline the implementation should keep up with.
-HF_REFERENCE_CER = 0.3030
-HF_REFERENCE_CHRF = 67.52
 
-CER_TOLERANCE = 0.02
-CHRF_TOLERANCE = 2.0
-
-CER_MAX = HF_REFERENCE_CER + CER_TOLERANCE
-CHRF_MIN = HF_REFERENCE_CHRF - CHRF_TOLERANCE
+@dataclass
+class ModelSpec:  # one model under test plus the CLI flags that locate its GGUF files
+    key: str  # key of this entry in MODELS
+    label: str  # human-readable name used in help text and log titles
+    model_arg: str  # CLI flag for the model path, e.g. "--llama-model"
+    mmproj_arg: str  # CLI flag for the mmproj path, e.g. "--mmproj"
+    model_default: str  # default model path (relative to repo root or absolute)
+    mmproj_default: str  # default mmproj path (relative to repo root or absolute)
+
+
+@dataclass
+class TestCase:  # one image/ground-truth pair with its reference scores and tolerances
+    model_key: str  # key into MODELS
+    label: str  # short description shown in the log title
+    image: str  # test image path, resolved against the repo root
+    ground_truth: str  # expected transcript path, resolved against the repo root
+    hf_cer: float  # reference CER for this image (lower is better)
+    hf_chrf: float  # reference chrF on a 0-100 scale (higher is better)
+    cer_tol: float  # how far above hf_cer the measured CER may be
+    chrf_tol: float  # how far below hf_chrf the measured chrF may be
+
+    @property
+    def cer_max(self) -> float:  # highest CER that still passes
+        return self.hf_cer + self.cer_tol
+
+    @property
+    def chrf_min(self) -> float:  # lowest chrF that still passes
+        return self.hf_chrf - self.chrf_tol
+
+
+MODELS = {
+    "v1": ModelSpec(
+        key="v1", label="DeepSeek-OCR",
+        model_arg="--llama-model", mmproj_arg="--mmproj",
+        model_default="gguf_models/deepseek-ai/deepseek-ocr-bf16.gguf",
+        mmproj_default="gguf_models/deepseek-ai/mmproj-deepseek-ocr-bf16.gguf",
+    ),
+    "v2": ModelSpec(
+        key="v2", label="DeepSeek-OCR-2",
+        model_arg="--llama-model-2", mmproj_arg="--mmproj-2",
+        model_default="gguf_models/deepseek-ai/deepseek-ocr-2-bf16.gguf",
+        mmproj_default="gguf_models/deepseek-ai/mmproj-deepseek-ocr-2-bf16.gguf",
+    ),
+}
+
+CASES = [
+    TestCase(
+        model_key="v1", label="single-view scan",
+        image="tools/mtmd/test-1.jpeg",
+        ground_truth="tools/mtmd/tests/test-1-ground-truth.txt",
+        hf_cer=0.3030, hf_chrf=67.52, cer_tol=0.02, chrf_tol=2.0,
+    ),
+    TestCase(
+        model_key="v2", label="single-view scan",
+        image="tools/mtmd/test-1.jpeg",
+        ground_truth="tools/mtmd/tests/test-1-ground-truth.txt",
+        # 640x488 is below the 768 tiling threshold -- single 1024 global view.
+        # hf_cer/hf_chrf are the deepseek-ai repo's own scores (ImageOps.pad);
+        # the transformers HF processor is *not* the reference -- its pad_to_square
+        # is one pixel off and lands at ~0.69 instead.
+        hf_cer=0.7761, hf_chrf=28.70, cer_tol=0.12, chrf_tol=8.0,
+    ),
+]
+
+
+def arg_dest(flag: str) -> str:  # e.g. "--llama-model-2" -> "llama_model_2"
+    return flag.lstrip("-").replace("-", "_")  # matches the attribute name argparse derives for a long option
 
 
 def verdict(ok: bool) -> str:
@@ -84,6 +141,14 @@ def run_mtmd_cli(model_path, mmproj_path, image_path, bin_path) -> str:
         "--temp", "0",
         "--flash-attn", "off",  # match the HF "eager" attention reference
         "--no-warmup",
+        "-n", "512",  # cap loops on hard images (KV would otherwise fill)
+        # HF decodes with no_repeat_ngram_size; llama.cpp's analog is DRY.
+        # Default DRY breakers include "\n", so they are cleared below.
+        "--dry-multiplier", "0.8",
+        "--dry-base", "1.75",
+        "--dry-allowed-length", "2",
+        "--dry-penalty-last-n", "-1",
+        "--dry-sequence-breaker", "none",
     ]
     logger.debug(f"  command: {' '.join(cmd)}")
 
@@ -110,7 +175,7 @@ def read_expected_text(file_path: Path) -> str:
         return f.read().strip()
 
 
-def evaluate(expected: str, ocr_out: str) -> bool:
+def evaluate(case: "TestCase", expected: str, ocr_out: str) -> bool:
     expected = normalize_text(expected)
     ocr_out = normalize_text(ocr_out)
     aligned = locally_align(expected, ocr_out)
@@ -122,16 +187,16 @@ def evaluate(expected: str, ocr_out: str) -> bool:
     cer = compute_cer(expected, aligned)
     chrf = compute_chrf(expected, aligned)
 
-    cer_pass = cer <= CER_MAX
-    chrf_pass = chrf >= CHRF_MIN
+    cer_pass = cer <= case.cer_max
+    chrf_pass = chrf >= case.chrf_min
     passed = cer_pass and chrf_pass
 
     logger.info("")
     logger.info("=" * 60)
     logger.info("Free OCR evaluation:")
     logger.info("=" * 60)
-    logger.info(f"  CER               {cer:>7.4f}    (<= {CER_MAX:>7.4f}  -> {verdict(cer_pass)})")
-    logger.info(f"  chrF (0-100)      {chrf:>7.2f}    (>= {CHRF_MIN:>7.2f}  -> {verdict(chrf_pass)})")
+    logger.info(f"  CER               {cer:>7.4f}    (HF {case.hf_cer:.4f}, <= {case.cer_max:>7.4f}  -> {verdict(cer_pass)})")
+    logger.info(f"  chrF (0-100)      {chrf:>7.2f}    (HF {case.hf_chrf:.2f}, >= {case.chrf_min:>7.2f}  -> {verdict(chrf_pass)})")
     logger.info(f"  Expected chars    {len(expected):>7}")
     logger.info(f"  Aligned chars     {len(aligned):>7} (of {len(ocr_out)} OCR chars)")
     logger.info("")
@@ -142,12 +207,13 @@ def evaluate(expected: str, ocr_out: str) -> bool:
 
 def argument_parser() -> argparse.ArgumentParser:
     ap = argparse.ArgumentParser(description="Compare llama.cpp DeepSeek-OCR output with a ground-truth transcript")
-    ap.add_argument("--llama-model", default="gguf_models/deepseek-ai/deepseek-ocr-bf16.gguf",
-                    help="Path to llama.cpp GGUF model (relative to repo root or absolute)")
-    ap.add_argument("--mmproj", default="gguf_models/deepseek-ai/mmproj-deepseek-ocr-bf16.gguf",
-                    help="Path to mmproj GGUF file (relative to repo root or absolute)")
     ap.add_argument("--llama-bin", default="build/bin/llama-mtmd-cli",
                     help="Path to llama-mtmd-cli binary (relative to repo root or absolute)")
+    for spec in MODELS.values():
+        ap.add_argument(spec.model_arg, default=spec.model_default,
+                        help=f"Path to the {spec.label} GGUF model (relative to repo root or absolute)")
+        ap.add_argument(spec.mmproj_arg, default=spec.mmproj_default,
+                        help=f"Path to the {spec.label} mmproj GGUF file (relative to repo root or absolute)")
     ap.add_argument("--verbose", action="store_true",
                     help="Also log the expected, OCR, and aligned text")
     return ap
@@ -167,53 +233,60 @@ def main() -> int:
     args = argument_parser().parse_args()
     configure_logging(args.verbose)
 
-    tests_dir = Path(__file__).parent  # tools/mtmd/tests
-    mtmd_dir = tests_dir.parent  # tools/mtmd
-    repo_root = mtmd_dir.parent.parent  # repo root
+    repo_root = Path(__file__).resolve().parents[3]  # tests -> mtmd -> tools -> repo root
+    binary = resolve_path(args.llama_bin, repo_root)
 
-    inputs = [
-        ("image", resolve_path(DEFAULT_IMAGE, mtmd_dir)),
-        ("expected-text", resolve_path(DEFAULT_EXPECTED_TEXT, tests_dir)),
-        ("model", resolve_path(args.llama_model, repo_root)),
-        ("mmproj", resolve_path(args.mmproj, repo_root)),
-        ("binary", resolve_path(args.llama_bin, repo_root)),
-    ]
-    for label, path in inputs:
-        if not path.exists():
-            logger.error(f"Error: {label} not found: {path}")
-            return 1
-    paths = dict(inputs)
+    if not binary.exists():
+        logger.error(f"Error: binary not found: {binary}")
+        return 1
 
     logger.info("=" * 60)
-    logger.info("DeepSeek-OCR: llama.cpp vs ground-truth comparison")
+    logger.info("DeepSeek-OCR: llama.cpp vs HF parity check")
     logger.info("=" * 60)
-    logger.info(f"HF baselines: CER {HF_REFERENCE_CER:.4f}, chrF {HF_REFERENCE_CHRF:.2f}")
-    logger.info(f"Test thresholds: CER <= {CER_MAX:.4f}, chrF >= {CHRF_MIN:.2f}")
 
-    logger.debug("")
-    logger.debug("Resolved test inputs:")
-    for label, path in inputs:
-        logger.debug(f"  {label:<14} {path}")
-
-    logger.info("")
-    logger.info("[1/3] Running llama.cpp 'Free OCR'")
-    try:
-        ocr_out = run_mtmd_cli(paths["model"], paths["mmproj"],
-                               paths["image"], paths["binary"])
-    except RuntimeError as e:
-        logger.error(f"Error: {e}")
-        return 1
-
-    logger.info("")
-    logger.info("[2/3] Reading expected output")
-    expected = read_expected_text(paths["expected-text"])
-    logger.info(f"  expected: {len(expected)} chars")
+    results = {}
+    for case in CASES:
+        model_spec = MODELS[case.model_key]
+        title = f"{model_spec.label} -- {case.label}"
+
+        logger.info("")
+        logger.info(f"=== {title} ===")
+
+        model = resolve_path(getattr(args, arg_dest(model_spec.model_arg)), repo_root)
+        mmproj = resolve_path(getattr(args, arg_dest(model_spec.mmproj_arg)), repo_root)
+        image = resolve_path(case.image, repo_root)
+        ground_truth = resolve_path(case.ground_truth, repo_root)
+
+        missing = [(lbl, p) for lbl, p in [("model", model), ("mmproj", mmproj),
+                                           ("image", image), ("ground-truth", ground_truth)]
+                   if not p.exists()]
+        if missing:
+            for lbl, p in missing:
+                logger.error(f"  Error: {lbl} not found: {p}")
+            results[title] = False
+            continue
+
+        expected = read_expected_text(ground_truth)
+        logger.info(f"  Image: {case.image}")
+        logger.info(f"  Expected text: {len(expected)} chars")
+        logger.info("  Running llama.cpp 'Free OCR'")
+        try:
+            ocr_out = run_mtmd_cli(model, mmproj, image, binary)
+        except RuntimeError as e:
+            logger.error(f"  Error: {e}")
+            results[title] = False
+            continue
+
+        results[title] = evaluate(case, expected, ocr_out)
 
     logger.info("")
-    logger.info("[3/3] Computing OCR metrics")
-    ok = evaluate(expected, ocr_out)
+    logger.info("=== Summary ===")
+    for title, ok in results.items():
+        logger.info(f"  {title:<48} {verdict(ok)}")
+    all_passed = all(results.values())
+    logger.info(f"Overall: {verdict(all_passed)}")
 
-    return 0 if ok else 1
+    return 0 if all_passed else 1
 
 
 if __name__ == "__main__":

From 06d26dfdff4097dc51eac20155371a9cfd53e094 Mon Sep 17 00:00:00 2001
From: Xuan-Son Nguyen <son@huggingface.co>
Date: Fri, 29 May 2026 16:30:55 +0200
Subject: [PATCH 31/50] download: add option to skip_download (#23059)

* download: add option to skip_download

* fix

* fix 2

* if file doesn't exist, respect skip_download flag
---
 common/arg.cpp                 | 74 +++++++++++++++++++---------------
 common/arg.h                   |  7 +++-
 common/common.h                |  3 +-
 common/download.cpp            | 26 ++++++++----
 common/download.h              | 13 ++++--
 tools/server/README.md         | 15 +++++--
 tools/server/server-models.cpp | 70 +++++++++++++++++---------------
 tools/server/server-models.h   |  1 +
 8 files changed, 126 insertions(+), 83 deletions(-)

diff --git a/common/arg.cpp b/common/arg.cpp
index 51631765fa3..e0f6c606608 100644
--- a/common/arg.cpp
+++ b/common/arg.cpp
@@ -340,9 +340,7 @@ struct handle_model_result {
 };
 
 static handle_model_result common_params_handle_model(struct common_params_model & model,
-                                                      const std::string          & bearer_token,
-                                                      bool                         offline,
-                                                      bool                         search_mtp = false) {
+                                                      const common_download_opts & opts) {
     handle_model_result result;
 
     if (!model.docker_repo.empty()) {
@@ -354,10 +352,9 @@ static handle_model_result common_params_handle_model(struct common_params_model
             model.hf_file = model.path;
             model.path = "";
         }
-        common_download_opts opts;
-        opts.bearer_token = bearer_token;
-        opts.offline = offline;
-        auto download_result = common_download_model(model, opts, true, search_mtp);
+        common_download_opts hf_opts = opts;
+        hf_opts.download_mmproj = true; // also look for mmproj when downloading hf model
+        auto download_result = common_download_model(model, hf_opts);
 
         if (download_result.model_path.empty()) {
             throw std::runtime_error("failed to download model from Hugging Face");
@@ -382,9 +379,6 @@ static handle_model_result common_params_handle_model(struct common_params_model
             model.path = fs_get_cache_file(string_split<std::string>(f, '/').back());
         }
 
-        common_download_opts opts;
-        opts.bearer_token = bearer_token;
-        opts.offline = offline;
         auto download_result = common_download_model(model, opts);
         if (download_result.model_path.empty()) {
             throw std::runtime_error("failed to download model from " + model.url);
@@ -441,35 +435,49 @@ static bool parse_bool_value(const std::string & value) {
 // CLI argument parsing functions
 //
 
-void common_params_handle_models(common_params & params, llama_example curr_ex) {
+bool common_params_handle_models(common_params & params, llama_example curr_ex) {
     const bool spec_type_draft_mtp = std::find(params.speculative.types.begin(),
                                          params.speculative.types.end(),
                                          COMMON_SPECULATIVE_TYPE_DRAFT_MTP) != params.speculative.types.end();
 
-    auto res = common_params_handle_model(params.model, params.hf_token, params.offline, spec_type_draft_mtp);
-    if (params.no_mmproj) {
-        params.mmproj = {};
-    } else if (res.found_mmproj && params.mmproj.path.empty() && params.mmproj.url.empty()) {
-        // optionally, handle mmproj model when -hf is specified
-        params.mmproj = res.mmproj;
-    }
-    // only download mmproj if the current example is using it
-    for (const auto & ex : mmproj_examples) {
-        if (curr_ex == ex) {
-            common_params_handle_model(params.mmproj,    params.hf_token, params.offline);
-            break;
+    common_download_opts opts;
+    opts.bearer_token  = params.hf_token;
+    opts.offline       = params.offline;
+    opts.skip_download = params.skip_download;
+    opts.download_mtp  = spec_type_draft_mtp;
+
+    try {
+        auto res = common_params_handle_model(params.model, opts);
+        if (params.no_mmproj) {
+            params.mmproj = {};
+        } else if (res.found_mmproj && params.mmproj.path.empty() && params.mmproj.url.empty()) {
+            // optionally, handle mmproj model when -hf is specified
+            params.mmproj = res.mmproj;
+        }
+        // only download mmproj if the current example is using it
+        for (const auto & ex : mmproj_examples) {
+            if (curr_ex == ex) {
+                common_params_handle_model(params.mmproj, opts);
+                break;
+            }
         }
+
+        // when --spec-type mtp is set and no draft model was provided explicitly,
+        // fall back to the MTP head discovered alongside the -hf model
+        if (spec_type_draft_mtp && res.found_mtp &&
+            params.speculative.draft.mparams.path.empty() &&
+            params.speculative.draft.mparams.hf_repo.empty() &&
+            params.speculative.draft.mparams.url.empty()) {
+            params.speculative.draft.mparams.path = res.mtp.path;
+        }
+        common_params_handle_model(params.speculative.draft.mparams, opts);
+        common_params_handle_model(params.vocoder.model,             opts);
+        return true;
+    } catch (const common_skip_download_exception &) {
+        return false;
+    } catch (const std::exception &) {
+        throw;
     }
-    // when --spec-type mtp is set and no draft model was provided explicitly,
-    // fall back to the MTP head discovered alongside the -hf model
-    if (spec_type_draft_mtp && res.found_mtp &&
-        params.speculative.draft.mparams.path.empty() &&
-        params.speculative.draft.mparams.hf_repo.empty() &&
-        params.speculative.draft.mparams.url.empty()) {
-        params.speculative.draft.mparams.path = res.mtp.path;
-    }
-    common_params_handle_model(params.speculative.draft.mparams, params.hf_token, params.offline);
-    common_params_handle_model(params.vocoder.model,             params.hf_token, params.offline);
 }
 
 static bool common_params_parse_ex(int argc, char ** argv, common_params_context & ctx_arg) {
diff --git a/common/arg.h b/common/arg.h
index 2a85f09f3eb..0010f2a9ac9 100644
--- a/common/arg.h
+++ b/common/arg.h
@@ -129,8 +129,11 @@ bool common_params_to_map(int argc, char ** argv, llama_example ex, std::map<com
 // see: https://github.com/ggml-org/llama.cpp/issues/18163
 void common_params_add_preset_options(std::vector<common_arg> & args);
 
-// Populate model paths (main model, mmproj, etc) from -hf if necessary
-void common_params_handle_models(common_params & params, llama_example curr_ex);
+// populate model paths (main model, mmproj, etc) from -hf if necessary
+// return true if the model is ready to use
+// throw an exception if there is an error that prevents the model from being used (e.g. network error, model not found, etc)
+// if params.skip_download is true, no downloads will be attempted. return false if the model is invalid or missing (e.g. ETag check failed)
+bool common_params_handle_models(common_params & params, llama_example curr_ex);
 
 // initialize argument parser context - used by test-arg-parser and preset
 common_params_context common_params_parser_init(common_params & params, llama_example ex, void(*print_usage)(int, char **) = nullptr);
diff --git a/common/common.h b/common/common.h
index 9855d3f3694..99898800d1d 100644
--- a/common/common.h
+++ b/common/common.h
@@ -479,7 +479,7 @@ struct common_params {
 
     std::set<std::string> model_alias;     // model aliases                                                 // NOLINT
     std::set<std::string> model_tags;      // model tags (informational, not used for routing)              // NOLINT
-    std::string hf_token             = ""; // HF token                                                      // NOLINT
+    std::string hf_token             = ""; // HF token (aka bearer token)                                   // NOLINT
     std::string prompt               = "";                                                                  // NOLINT
     std::string system_prompt        = "";                                                                  // NOLINT
     std::string prompt_file          = ""; // store the external prompt file name                           // NOLINT
@@ -507,6 +507,7 @@ struct common_params {
     int32_t control_vector_layer_start = -1; // layer range for control vector
     int32_t control_vector_layer_end   = -1; // layer range for control vector
     bool    offline                    = false;
+    bool    skip_download              = false; // skip model file downloading
 
     int32_t ppl_stride      = 0;     // stride for perplexity calculations. If left at 0, the pre-existing approach will be used.
     int32_t ppl_output_type = 0;     // = 0 -> ppl output is as usual, = 1 -> ppl output is num_tokens, ppl, one per line
diff --git a/common/download.cpp b/common/download.cpp
index 103bc408faf..40f6eb780f4 100644
--- a/common/download.cpp
+++ b/common/download.cpp
@@ -292,6 +292,10 @@ static int common_download_file_single_online(const std::string & url,
 
     const bool file_exists = std::filesystem::exists(path);
 
+    if (!file_exists && opts.skip_download) {
+        return -2; // file is missing and download is disabled
+    }
+
     if (file_exists && skip_etag) {
         LOG_DBG("%s: using cached file: %s\n", __func__, path.c_str());
         return 304; // 304 Not Modified - fake cached response
@@ -357,6 +361,10 @@ static int common_download_file_single_online(const std::string & url,
             LOG_DBG("%s: using cached file (same etag): %s\n", __func__, path.c_str());
             return 304; // 304 Not Modified - fake cached response
         }
+        // past this point, the file exists but is different from the server version, so we need to redownload it
+        if (opts.skip_download) {
+            return -2; // special code to indicate that the download was skipped due to etag mismatch
+        }
         if (remove(path.c_str()) != 0) {
             LOG_ERR("%s: unable to delete file: %s\n", __func__, path.c_str());
             return -1;
@@ -775,13 +783,13 @@ static std::vector<download_task> get_url_tasks(const common_params_model & mode
 }
 
 common_download_model_result common_download_model(const common_params_model  & model,
-                                                   const common_download_opts & opts,
-                                                   bool download_mmproj,
-                                                   bool download_mtp) {
+                                                   const common_download_opts & opts) {
     common_download_model_result result;
     std::vector<download_task> tasks;
     hf_plan hf;
 
+    bool download_mmproj = opts.download_mmproj;
+    bool download_mtp = opts.download_mtp;
     bool is_hf = !model.hf_repo.empty();
 
     if (is_hf) {
@@ -806,18 +814,22 @@ common_download_model_result common_download_model(const common_params_model  &
         return result;
     }
 
-    std::vector<std::future<bool>> futures;
+    std::vector<std::future<int>> futures;
     for (const auto & task : tasks) {
         futures.push_back(std::async(std::launch::async,
             [&task, &opts, is_hf]() {
-                int status = common_download_file_single(task.url, task.path, opts, is_hf);
-                return is_http_status_ok(status);
+                return common_download_file_single(task.url, task.path, opts, is_hf);
             }
         ));
     }
 
     for (auto & f : futures) {
-        if (!f.get()) {
+        int status = f.get();
+        if (status == -2 && opts.skip_download) {
+            throw common_skip_download_exception();
+        }
+        bool is_ok = is_http_status_ok(status);
+        if (!is_ok) {
             return {};
         }
     }
diff --git a/common/download.h b/common/download.h
index 4a169ef7796..ebeedd6058c 100644
--- a/common/download.h
+++ b/common/download.h
@@ -52,6 +52,9 @@ struct common_download_opts {
     std::string bearer_token;
     common_header_list headers;
     bool offline = false;
+    bool skip_download = false; // if true, only validation is performed, common_skip_download_exception may be thrown if the file is missing or invalid
+    bool download_mmproj = false;
+    bool download_mtp = false;
     common_download_callback * callback = nullptr;
 };
 
@@ -62,6 +65,11 @@ struct common_download_model_result {
     std::string mtp_path;
 };
 
+// thrown when skip_download is set and the file is missing or invalid (e.g. ETag check failed)
+struct common_skip_download_exception : public std::runtime_error {
+    common_skip_download_exception() : std::runtime_error("skip download") {}
+};
+
 // Download model from HuggingFace repo or URL
 //
 // input (via model struct):
@@ -89,9 +97,7 @@ struct common_download_model_result {
 // returns result with model_path, mmproj_path and mtp_path (empty when not found / on failure)
 common_download_model_result common_download_model(
     const common_params_model & model,
-    const common_download_opts & opts = {},
-    bool download_mmproj = false,
-    bool download_mtp    = false
+    const common_download_opts & opts = {}
 );
 
 // returns list of cached models
@@ -99,6 +105,7 @@ std::vector<common_cached_model_info> common_list_cached_models();
 
 // download single file from url to local path
 // returns status code or -1 on error
+// returns -2 if skip_download=true and the download was skipped because the file is missing or outdated (ETag mismatch)
 // skip_etag: if true, don't read/write .etag files (for HF cache where filename is the hash)
 int common_download_file_single(const std::string & url,
                                 const std::string & path,
diff --git a/tools/server/README.md b/tools/server/README.md
index 7870e3091ea..87600d9be2f 100644
--- a/tools/server/README.md
+++ b/tools/server/README.md
@@ -1661,23 +1661,30 @@ Listing all models in cache. The model metadata will also include a field to ind
 {
   "data": [{
     "id": "ggml-org/gemma-3-4b-it-GGUF:Q4_K_M",
-    "in_cache": true,
     "path": "/Users/REDACTED/Library/Caches/llama.cpp/ggml-org_gemma-3-4b-it-GGUF_gemma-3-4b-it-Q4_K_M.gguf",
     "status": {
       "value": "loaded",
       "args": ["llama-server", "-ctx", "4096"]
     },
+    "architecture": {
+      "input_modalities": [
+        "text",
+        "image"
+      ],
+      "output_modalities": [
+        "text"
+      ]
+    },
     ...
   }]
 }
 ```
 
 Note:
-1. For a local GGUF (stored offline in a custom directory), the model object will have `"in_cache": false`.
-2. Adding `?reload=1` to the query params will refresh the list of models. The behavior is as follow:
+1. Adding `?reload=1` to the query params will refresh the list of models. The behavior is as follows:
     - If a model is running but updated or removed from the source, it will be unloaded
     - If a model is not running, it will be added or updated according to the source
-3. When the model is loaded, the info from `/v1/models` is forwarded to router's `/v1/models`. This includes metadata about the model and the runtime instance.
+2. When the model is loaded, the info from `/v1/models` is forwarded to router's `/v1/models`. This includes metadata about the model and the runtime instance.
 
 The `status` object can be:
 
diff --git a/tools/server/server-models.cpp b/tools/server/server-models.cpp
index 47b6c2a4ec0..49b0e423f46 100644
--- a/tools/server/server-models.cpp
+++ b/tools/server/server-models.cpp
@@ -180,7 +180,8 @@ void server_model_meta::update_caps() {
             "LLAMA_ARG_HF_REPO",
             "LLAMA_ARG_HF_REPO_FILE",
         });
-        params.offline = true; // avoid any unwanted network call during capability detection
+        params.offline = true;
+        // params.skip_download = true; // TODO: ideally, we should validate the model here, but it takes too much time
         common_params_handle_models(params, LLAMA_EXAMPLE_SERVER);
         if (params.mmproj.path.empty()) {
             multimodal = { false, false };
@@ -371,18 +372,19 @@ void server_models::load_models() {
         // FIRST LOAD: add all models, then unlock for autoloading
         for (const auto & [name, preset] : final_presets) {
             server_model_meta meta{
-                /* preset       */ preset,
-                /* name         */ name,
-                /* aliases      */ {},
-                /* tags         */ {},
-                /* port         */ 0,
-                /* status       */ SERVER_MODEL_STATUS_UNLOADED,
-                /* last_used    */ 0,
-                /* args         */ std::vector<std::string>(),
-                /* loaded_info  */ {},
-                /* exit_code    */ 0,
-                /* stop_timeout */ DEFAULT_STOP_TIMEOUT,
-                /* multimodal   */ mtmd_caps{false, false},
+                /* preset        */ preset,
+                /* name          */ name,
+                /* aliases       */ {},
+                /* tags          */ {},
+                /* port          */ 0,
+                /* status        */ SERVER_MODEL_STATUS_UNLOADED,
+                /* last_used     */ 0,
+                /* args          */ std::vector<std::string>(),
+                /* loaded_info   */ {},
+                /* exit_code     */ 0,
+                /* stop_timeout  */ DEFAULT_STOP_TIMEOUT,
+                /* multimodal    */ mtmd_caps{false, false},
+                /* need_download */ false,
             };
             add_model(std::move(meta));
         }
@@ -524,18 +526,19 @@ void server_models::load_models() {
         for (const auto & [name, preset] : final_presets) {
             if (mapping.find(name) == mapping.end()) {
                 server_model_meta meta{
-                    /* preset       */ preset,
-                    /* name         */ name,
-                    /* aliases      */ {},
-                    /* tags         */ {},
-                    /* port         */ 0,
-                    /* status       */ SERVER_MODEL_STATUS_UNLOADED,
-                    /* last_used    */ 0,
-                    /* args         */ std::vector<std::string>(),
-                    /* loaded_info  */ {},
-                    /* exit_code    */ 0,
-                    /* stop_timeout */ DEFAULT_STOP_TIMEOUT,
-                    /* multimodal   */ mtmd_caps{false, false},
+                    /* preset        */ preset,
+                    /* name          */ name,
+                    /* aliases       */ {},
+                    /* tags          */ {},
+                    /* port          */ 0,
+                    /* status        */ SERVER_MODEL_STATUS_UNLOADED,
+                    /* last_used     */ 0,
+                    /* args          */ std::vector<std::string>(),
+                    /* loaded_info   */ {},
+                    /* exit_code     */ 0,
+                    /* stop_timeout  */ DEFAULT_STOP_TIMEOUT,
+                    /* multimodal    */ mtmd_caps{false, false},
+                    /* need_download */ false,
                 };
                 add_model(std::move(meta));
                 newly_added.push_back(name);
@@ -1263,14 +1266,15 @@ void server_models_routes::init_routes() {
             };
 
             json model_info = json {
-                {"id",           meta.name},
-                {"aliases",      meta.aliases},
-                {"tags",         meta.tags},
-                {"object",       "model"},    // for OAI-compat
-                {"owned_by",     "llamacpp"}, // for OAI-compat
-                {"created",      t},          // for OAI-compat
-                {"status",       status},
-                {"architecture", architecture},
+                {"id",            meta.name},
+                {"aliases",       meta.aliases},
+                {"tags",          meta.tags},
+                {"object",        "model"},    // for OAI-compat
+                {"owned_by",      "llamacpp"}, // for OAI-compat
+                {"created",       t},          // for OAI-compat
+                {"status",        status},
+                {"architecture",  architecture},
+                {"need_download", meta.need_download},
                 // TODO: add other fields, may require reading GGUF metadata
             };
 
diff --git a/tools/server/server-models.h b/tools/server/server-models.h
index e96d76c9169..2198589a7aa 100644
--- a/tools/server/server-models.h
+++ b/tools/server/server-models.h
@@ -67,6 +67,7 @@ struct server_model_meta {
     int exit_code = 0; // exit code of the model instance process (only valid if status == FAILED)
     int stop_timeout = 0; // seconds to wait before force-killing the model instance during shutdown
     mtmd_caps multimodal; // multimodal capabilities
+    bool need_download = false; // whether the model needs to be downloaded before loading
 
     bool is_ready() const {
         return status == SERVER_MODEL_STATUS_LOADED;

From dc71236b6c8947c34860f2be43ac0e8821e87d9c Mon Sep 17 00:00:00 2001
From: Georgi Gerganov <ggerganov@gmail.com>
Date: Fri, 29 May 2026 20:41:57 +0300
Subject: [PATCH 32/50] ci : update macos release to use macos-26 runner
 (#23878)

---
 .github/workflows/release.yml | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/.github/workflows/release.yml b/.github/workflows/release.yml
index c3a018425e2..08f02af63ec 100644
--- a/.github/workflows/release.yml
+++ b/.github/workflows/release.yml
@@ -66,7 +66,7 @@ jobs:
         include:
           - build: 'arm64'
             arch: 'arm64'
-            os: macos-14
+            os: macos-26
             defines: "-DGGML_METAL_USE_BF16=ON -DGGML_METAL_EMBED_LIBRARY=ON"
           # TODO: this build is disabled to save Github Actions resources (https://github.com/ggml-org/llama.cpp/pull/23780)
           #       in order to enable it again, we have to provision dedicated runners  to run it
@@ -1134,7 +1134,7 @@ jobs:
   ios-xcode-build:
     needs: [check_release]
     if: ${{ needs.check_release.outputs.should_release == 'true' }}
-    runs-on: macos-15
+    runs-on: macos-26
 
     steps:
       - name: Checkout code

From b5f52280fb781cf63e7c3fb79f8bb8de215293e3 Mon Sep 17 00:00:00 2001
From: Xuan-Son Nguyen <son@huggingface.co>
Date: Fri, 29 May 2026 19:47:30 +0200
Subject: [PATCH 33/50] server: remove obsolete scripts (#23870)

---
 tools/server/chat-llama2.sh | 109 ------------------------------
 tools/server/chat.mjs       | 131 ------------------------------------
 tools/server/chat.sh        |  80 ----------------------
 3 files changed, 320 deletions(-)
 delete mode 100755 tools/server/chat-llama2.sh
 delete mode 100644 tools/server/chat.mjs
 delete mode 100755 tools/server/chat.sh

diff --git a/tools/server/chat-llama2.sh b/tools/server/chat-llama2.sh
deleted file mode 100755
index 450445f17e3..00000000000
--- a/tools/server/chat-llama2.sh
+++ /dev/null
@@ -1,109 +0,0 @@
-#!/usr/bin/env bash
-
-API_URL="${API_URL:-http://127.0.0.1:8080}"
-
-CHAT=(
-    "Hello, Assistant."
-    "Hello. How may I help you today?"
-)
-
-INSTRUCTION="A chat between a curious human and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the human's questions."
-
-trim() {
-    shopt -s extglob
-    set -- "${1##+([[:space:]])}"
-    printf "%s" "${1%%+([[:space:]])}"
-}
-
-trim_trailing() {
-    shopt -s extglob
-    printf "%s" "${1%%+([[:space:]])}"
-}
-
-format_prompt() {
-    if [[ "${#CHAT[@]}" -eq 0 ]]; then
-        echo -n "[INST] <<SYS>>\n${INSTRUCTION}\n<</SYS>>"
-    else
-        LAST_INDEX=$(( ${#CHAT[@]} - 1 ))
-        echo -n "${CHAT[$LAST_INDEX]}\n[INST] $1 [/INST]"
-    fi
-}
-
-tokenize() {
-    curl \
-        --silent \
-        --request POST \
-        --url "${API_URL}/tokenize" \
-        --header "Content-Type: application/json" \
-        --data-raw "$(jq -ns --arg content "$1" '{content:$content}')" \
-    | jq '.tokens[]'
-}
-
-N_KEEP=$(tokenize "[INST] <<SYS>>\n${INSTRUCTION}\n<</SYS>>" | wc -l)
-
-chat_completion() {
-    PROMPT="$(trim_trailing "$(format_prompt "$1")")"
-    DATA="$(echo -n "$PROMPT" | jq -Rs --argjson n_keep $N_KEEP '{
-        prompt: .,
-        temperature: 0.2,
-        top_k: 40,
-        top_p: 0.9,
-        n_keep: $n_keep,
-        n_predict: 1024,
-        stop: ["[INST]"],
-        stream: true
-    }')"
-
-    # Create a temporary file to hold the Python output
-    TEMPFILE=$(mktemp)
-
-    exec 3< <(curl \
-        --silent \
-        --no-buffer \
-        --request POST \
-        --url "${API_URL}/completion" \
-        --header "Content-Type: application/json" \
-        --data-raw "${DATA}")
-
-    python -c "
-import json
-import sys
-
-answer = ''
-while True:
-    line = sys.stdin.readline()
-    if not line:
-        break
-    if line.startswith('data: '):
-        json_content = line[6:].strip()
-        content = json.loads(json_content)['content']
-        sys.stdout.write(content)
-        sys.stdout.flush()
-        answer += content
-
-answer = answer.rstrip('\n')
-
-# Write the answer to the temporary file
-with open('$TEMPFILE', 'w') as f:
-    f.write(answer)
-    " <&3
-
-    exec 3<&-
-
-    # Read the answer from the temporary file
-    ANSWER=$(cat $TEMPFILE)
-
-    # Clean up the temporary file
-    rm $TEMPFILE
-
-    printf "\n"
-
-    CHAT+=("$1" "$(trim "$ANSWER")")
-}
-
-while true; do
-    echo -en "\033[0;32m"  # Green color
-    read -r -e -p "> " QUESTION
-    echo -en "\033[0m"  # Reset color
-    chat_completion "${QUESTION}"
-done
diff --git a/tools/server/chat.mjs b/tools/server/chat.mjs
deleted file mode 100644
index 4fef5655a89..00000000000
--- a/tools/server/chat.mjs
+++ /dev/null
@@ -1,131 +0,0 @@
-import * as readline from 'node:readline'
-import { stdin, stdout } from 'node:process'
-import { readFileSync } from 'node:fs'
-import { SchemaConverter }  from './public_legacy/json-schema-to-grammar.mjs'
-
-const args = process.argv.slice(2);
-const grammarJsonSchemaFile = args.find(
-    (_, index) => args[index - 1] === "--grammar-json-schema"
-);
-
-const no_cached_prompt = args.find(
-    (_, index) => args[index - 1] === "--no-cache-prompt"
-) ?? "false";
-
-const grammarFile = args.find((_, index) => args[index - 1] === "--grammar");
-
-// Example usage: function,arguments
-const grammarJsonSchemaPropOrder = args.find(
-    (_, index) => args[index - 1] === "--grammar-json-schema-prop-order"
-);
-const propOrder = grammarJsonSchemaPropOrder
-    ? grammarJsonSchemaPropOrder
-          .split(",")
-          .reduce((acc, cur, index) => ({ ...acc, [cur]: index }), {})
-    : {};
-
-let grammar = null
-if (grammarJsonSchemaFile) {
-    let schema = JSON.parse(readFileSync(grammarJsonSchemaFile, 'utf-8'))
-    const converter = new SchemaConverter({prop_order: propOrder, allow_fetch: true})
-    schema = await converter.resolveRefs(schema, grammarJsonSchemaFile)
-    converter.visit(schema, '')
-    grammar = converter.formatGrammar()
-}
-if (grammarFile) {
-    grammar = readFileSync(grammarFile, 'utf-8')
-}
-
-// for cached prompt
-let slot_id = -1;
-
-const API_URL = 'http://127.0.0.1:8080'
-
-const chat = [
-    {
-        human: "Hello, Assistant.",
-        assistant: "Hello. How may I help you today?"
-    },
-    {
-        human: "Please tell me the largest city in Europe.",
-        assistant: "Sure. The largest city in Europe is Moscow, the capital of Russia."
-    },
-]
-
-const instruction = `A chat between a curious human and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the human's questions.`
-
-function format_prompt(question) {
-    return `${instruction}\n${
-        chat.map(m =>`### Human: ${m.human}\n### Assistant: ${m.assistant}`).join("\n")
-    }\n### Human: ${question}\n### Assistant:`
-}
-
-async function tokenize(content) {
-    const result = await fetch(`${API_URL}/tokenize`, {
-        method: 'POST',
-        body: JSON.stringify({ content })
-    })
-
-    if (!result.ok) {
-        return []
-    }
-
-    return await result.json().tokens
-}
-
-const n_keep = await tokenize(instruction).length
-
-async function chat_completion(question) {
-    const result = await fetch(`${API_URL}/completion`, {
-        method: 'POST',
-        body: JSON.stringify({
-            prompt: format_prompt(question),
-            temperature: 0.2,
-            top_k: 40,
-            top_p: 0.9,
-            n_keep: n_keep,
-            n_predict: 256,
-            cache_prompt: no_cached_prompt === "false",
-            slot_id: slot_id,
-            stop: ["\n### Human:"], // stop completion after generating this
-            grammar,
-            stream: true,
-        })
-    })
-
-    if (!result.ok) {
-        return
-    }
-
-    let answer = ''
-
-    for await (var chunk of result.body) {
-        const t = Buffer.from(chunk).toString('utf8')
-        if (t.startsWith('data: ')) {
-            const message = JSON.parse(t.substring(6))
-            slot_id = message.slot_id
-            answer += message.content
-            process.stdout.write(message.content)
-            if (message.stop) {
-                if (message.truncated) {
-                    chat.shift()
-                }
-                break
-            }
-        }
-    }
-
-    process.stdout.write('\n')
-    chat.push({ human: question, assistant: answer.trimStart() })
-}
-
-const rl = readline.createInterface({ input: stdin, output: stdout });
-
-const readlineQuestion = (rl, query, options) => new Promise((resolve, reject) => {
-    rl.question(query, options, resolve)
-});
-
-while(true) {
-    const question = await readlineQuestion(rl, '> ')
-    await chat_completion(question)
-}
diff --git a/tools/server/chat.sh b/tools/server/chat.sh
deleted file mode 100755
index 84cea2d56a0..00000000000
--- a/tools/server/chat.sh
+++ /dev/null
@@ -1,80 +0,0 @@
-#!/usr/bin/env bash
-
-API_URL="${API_URL:-http://127.0.0.1:8080}"
-
-CHAT=(
-    "Hello, Assistant."
-    "Hello. How may I help you today?"
-    "Please tell me the largest city in Europe."
-    "Sure. The largest city in Europe is Moscow, the capital of Russia."
-)
-
-INSTRUCTION="A chat between a curious human and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the human's questions."
-
-trim() {
-    shopt -s extglob
-    set -- "${1##+([[:space:]])}"
-    printf "%s" "${1%%+([[:space:]])}"
-}
-
-trim_trailing() {
-    shopt -s extglob
-    printf "%s" "${1%%+([[:space:]])}"
-}
-
-format_prompt() {
-    echo -n "${INSTRUCTION}"
-    printf "\n### Human: %s\n### Assistant: %s" "${CHAT[@]}" "$1"
-}
-
-tokenize() {
-    curl \
-        --silent \
-        --request POST \
-        --url "${API_URL}/tokenize" \
-        --header "Content-Type: application/json" \
-        --data-raw "$(jq -ns --arg content "$1" '{content:$content}')" \
-    | jq '.tokens[]'
-}
-
-N_KEEP=$(tokenize "${INSTRUCTION}" | wc -l)
-
-chat_completion() {
-    PROMPT="$(trim_trailing "$(format_prompt "$1")")"
-    DATA="$(echo -n "$PROMPT" | jq -Rs --argjson n_keep $N_KEEP '{
-        prompt: .,
-        temperature: 0.2,
-        top_k: 40,
-        top_p: 0.9,
-        n_keep: $n_keep,
-        n_predict: 256,
-        cache_prompt: true,
-        stop: ["\n### Human:"],
-        stream: true
-    }')"
-
-    ANSWER=''
-
-    while IFS= read -r LINE; do
-        if [[ $LINE = data:* ]]; then
-            CONTENT="$(echo "${LINE:5}" | jq -r '.content')"
-            printf "%s" "${CONTENT}"
-            ANSWER+="${CONTENT}"
-        fi
-    done < <(curl \
-        --silent \
-        --no-buffer \
-        --request POST \
-        --url "${API_URL}/completion" \
-        --header "Content-Type: application/json" \
-        --data-raw "${DATA}")
-
-    printf "\n"
-
-    CHAT+=("$1" "$(trim "$ANSWER")")
-}
-
-while true; do
-    read -r -e -p "> " QUESTION
-    chat_completion "${QUESTION}"
-done

From 764f1e64a1a5c8fadbe5c2c5cb167dbe9ccd0464 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Sigbj=C3=B8rn=20Skj=C3=A6ret?= <sigbjorn.skjaeret@scala.com>
Date: Fri, 29 May 2026 19:55:14 +0200
Subject: [PATCH 34/50] graph : ensure DS32 kq_mask_lid is F32 (#23864)

---
 src/llama-graph.cpp | 10 +++++++---
 src/llama-graph.h   |  8 ++++----
 2 files changed, 11 insertions(+), 7 deletions(-)

diff --git a/src/llama-graph.cpp b/src/llama-graph.cpp
index 5bca8230b9b..e6ec3054daf 100644
--- a/src/llama-graph.cpp
+++ b/src/llama-graph.cpp
@@ -2656,14 +2656,18 @@ llm_graph_input_attn_k_dsa * llm_graph_context::build_attn_inp_k_dsa() const {
         inp->self_k_idxs_mla = mctx_cur->get_mla()->build_input_k_idxs(ctx0, ubatch);
 
         inp->self_kq_mask_mla = build_attn_inp_kq_mask(ctx0, mctx_cur->get_mla(), ubatch, cparams);
-        inp->self_kq_mask_mla_cnv = cparams.flash_attn ? ggml_cast(ctx0, inp->self_kq_mask_mla, GGML_TYPE_F16) : inp->self_kq_mask_mla;
+        inp->self_kq_mask_mla_cnv = inp->self_kq_mask_mla;
     }
 
     {
         inp->self_k_idxs_lid = mctx_cur->get_lid()->build_input_k_idxs(ctx0, ubatch);
 
-        inp->self_kq_mask_lid = build_attn_inp_kq_mask(ctx0, mctx_cur->get_lid(), ubatch, cparams);
-        inp->self_kq_mask_lid_cnv = cparams.flash_attn ? ggml_cast(ctx0, inp->self_kq_mask_lid, GGML_TYPE_F16) : inp->self_kq_mask_lid;
+        // ensure F32 mask
+        auto cparams_copy = cparams;
+        cparams_copy.flash_attn = false;
+
+        inp->self_kq_mask_lid = build_attn_inp_kq_mask(ctx0, mctx_cur->get_lid(), ubatch, cparams_copy);
+        inp->self_kq_mask_lid_cnv = inp->self_kq_mask_lid;
 
         inp->self_k_rot_lid = mctx_cur->get_lid()->build_input_k_rot(ctx0);
     }
diff --git a/src/llama-graph.h b/src/llama-graph.h
index d07a084a8d6..eab82bd0d70 100644
--- a/src/llama-graph.h
+++ b/src/llama-graph.h
@@ -399,10 +399,10 @@ class llm_graph_input_attn_k_dsa : public llm_graph_input_i {
     ggml_tensor * self_k_idxs_mla = nullptr; // I64 [n_batch]
     ggml_tensor * self_k_idxs_lid = nullptr; // I64 [n_batch]
 
-    ggml_tensor * self_kq_mask_mla     = nullptr; // F32 [n_kv, n_batch/n_stream, 1, n_stream]
-    ggml_tensor * self_kq_mask_mla_cnv = nullptr; //     [n_kv, n_batch/n_stream, 1, n_stream]
-    ggml_tensor * self_kq_mask_lid     = nullptr; // F32 [n_kv, n_batch/n_stream, 1, n_stream]
-    ggml_tensor * self_kq_mask_lid_cnv = nullptr; //     [n_kv, n_batch/n_stream, 1, n_stream]
+    ggml_tensor * self_kq_mask_mla     = nullptr; // F32/F16 [n_kv, n_batch/n_stream, 1, n_stream]
+    ggml_tensor * self_kq_mask_mla_cnv = nullptr; //         [n_kv, n_batch/n_stream, 1, n_stream]
+    ggml_tensor * self_kq_mask_lid     = nullptr; // F32     [n_kv, n_batch/n_stream, 1, n_stream]
+    ggml_tensor * self_kq_mask_lid_cnv = nullptr; //         [n_kv, n_batch/n_stream, 1, n_stream]
 
     ggml_tensor * self_k_rot_lid = nullptr;
 

From 2084434e666c5b08cd5e2a2f256e583a0f85a44c Mon Sep 17 00:00:00 2001
From: Tarek Dakhran <tarek@liquid.ai>
Date: Fri, 29 May 2026 20:25:43 +0200
Subject: [PATCH 35/50] vocab : support tokenizer for LFM2.5-8B-A1B (#23826)

* vocab: Support tokenizer for LFM2.5-8B-A1B

* Keep liquid6 tokenizer in models
---
 conversion/base.py           | 5 ++++-
 convert_hf_to_gguf_update.py | 4 +++-
 2 files changed, 7 insertions(+), 2 deletions(-)

diff --git a/conversion/base.py b/conversion/base.py
index 44b2c964f4b..866625a8045 100644
--- a/conversion/base.py
+++ b/conversion/base.py
@@ -1447,6 +1447,9 @@ def get_vocab_base_pre(self, tokenizer) -> str:
         if chkhsh == "0fe1cf6eda062318a1af7270f3331a85c539a01778ff948e24388e949c5282f4":
             # ref: https://huggingface.co/evilfreelancer/ruGPT3XL
             res = "gpt-2"
+        if chkhsh == "9e454714343b69b99b71795c1d27a68c2a1d15dab111f4d353109f966af29da7":
+            # ref: https://huggingface.co/LiquidAI/LFM2.5-8B-A1B
+            res = "lfm2"
         if chkhsh == "0ef9807a4087ebef797fc749390439009c3b9eda9ad1a097abbe738f486c01e5":
             # ref: https://huggingface.co/meta-llama/Meta-Llama-3-8B
             res = "llama-bpe"
@@ -1598,7 +1601,7 @@ def get_vocab_base_pre(self, tokenizer) -> str:
             # ref: https://huggingface.co/K-intelligence/Midm-2.0-Base-Instruct
             res = "midm-2.0"
         if chkhsh == "169bf0296a13c4d9b7672313f749eb36501d931022de052aad6e36f2bf34dd51":
-            # ref: https://huggingface.co/LiquidAI/LFM2-Tokenizer
+            # ref: https://huggingface.co/LiquidAI/LFM2.5-350M
             res = "lfm2"
         if chkhsh == "2085e1638f6c377a0aa4ead21b27bb4cb941bf800df86ed391011769c1758dfb":
             # ref: https://huggingface.co/LGAI-EXAONE/EXAONE-4.0-32B
diff --git a/convert_hf_to_gguf_update.py b/convert_hf_to_gguf_update.py
index 66aa1cb2fc0..827af277b92 100755
--- a/convert_hf_to_gguf_update.py
+++ b/convert_hf_to_gguf_update.py
@@ -139,7 +139,7 @@ class TOKENIZER_TYPE(IntEnum):
     {"name": "seed-coder",       "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/ByteDance-Seed/Seed-Coder-8B-Base", },
     {"name": "a.x-4.0",          "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/skt/A.X-4.0", },
     {"name": "midm-2.0",         "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/K-intelligence/Midm-2.0-Base-Instruct", },
-    {"name": "lfm2",             "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/LiquidAI/LFM2-Tokenizer"},
+    {"name": "lfm2",             "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/LiquidAI/LFM2.5-350M", },
     {"name": "exaone4",          "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/LGAI-EXAONE/EXAONE-4.0-32B", },
     {"name": "mellum",           "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/JetBrains/Mellum-4b-base", },
     {"name": "modern-bert",      "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/answerdotai/ModernBERT-base", },
@@ -183,6 +183,8 @@ class TOKENIZER_TYPE(IntEnum):
     # jina-v2-de variants
     {"name": "jina-v2-de", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/aari1995/German_Semantic_V3", "chkhsh": "b3d1dd861f1d4c5c0d2569ce36baf3f90fe8a102db3de50dd71ff860d91be3df"},
     {"name": "gpt-2", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/evilfreelancer/ruGPT3XL", "chkhsh": "0fe1cf6eda062318a1af7270f3331a85c539a01778ff948e24388e949c5282f4"},
+    # lfm2 variants
+    {"name": "lfm2", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/LiquidAI/LFM2.5-8B-A1B", "chkhsh": "9e454714343b69b99b71795c1d27a68c2a1d15dab111f4d353109f966af29da7"},
 ]
 
 

From 22d66b567eef11cf2e9832f04db64ee0323a0fd0 Mon Sep 17 00:00:00 2001
From: ValdikSS <iam@valdikss.org.ru>
Date: Fri, 29 May 2026 22:41:35 +0300
Subject: [PATCH 36/50] ui: handle audio/vnd.wave as audio WAV file (#23754)

Firefox on Linux uses this MIME type
---
 tools/ui/src/lib/enums/files.enums.ts     | 1 +
 tools/ui/src/lib/services/chat.service.ts | 1 +
 tools/ui/src/lib/utils/file-type.ts       | 1 +
 3 files changed, 3 insertions(+)

diff --git a/tools/ui/src/lib/enums/files.enums.ts b/tools/ui/src/lib/enums/files.enums.ts
index 2f583d52eae..8008a1040b2 100644
--- a/tools/ui/src/lib/enums/files.enums.ts
+++ b/tools/ui/src/lib/enums/files.enums.ts
@@ -186,6 +186,7 @@ export enum MimeTypeAudio {
 	WAVE = 'audio/wave',
 	X_WAV = 'audio/x-wav',
 	X_WAVE = 'audio/x-wave',
+	VND_WAVE = 'audio/vnd.wave',
 	X_PN_WAV = 'audio/x-pn-wav',
 	WEBM = 'audio/webm',
 	WEBM_OPUS = 'audio/webm;codecs=opus'
diff --git a/tools/ui/src/lib/services/chat.service.ts b/tools/ui/src/lib/services/chat.service.ts
index 3c9ca74796d..d6c7e36d70e 100644
--- a/tools/ui/src/lib/services/chat.service.ts
+++ b/tools/ui/src/lib/services/chat.service.ts
@@ -40,6 +40,7 @@ function getAudioInputFormat(mimeType: string): AudioInputFormat {
 		normalizedMimeType === MimeTypeAudio.WAVE ||
 		normalizedMimeType === MimeTypeAudio.X_WAV ||
 		normalizedMimeType === MimeTypeAudio.X_WAVE ||
+		normalizedMimeType === MimeTypeAudio.VND_WAVE ||
 		normalizedMimeType === MimeTypeAudio.X_PN_WAV
 	) {
 		return FileTypeAudio.WAV;
diff --git a/tools/ui/src/lib/utils/file-type.ts b/tools/ui/src/lib/utils/file-type.ts
index 7495163d15d..d14efbc3505 100644
--- a/tools/ui/src/lib/utils/file-type.ts
+++ b/tools/ui/src/lib/utils/file-type.ts
@@ -40,6 +40,7 @@ export function getFileTypeCategory(mimeType: string): FileTypeCategory | null {
 		case MimeTypeAudio.WAVE:
 		case MimeTypeAudio.X_WAV:
 		case MimeTypeAudio.X_WAVE:
+		case MimeTypeAudio.VND_WAVE:
 		case MimeTypeAudio.X_PN_WAV:
 		case MimeTypeAudio.WEBM:
 		case MimeTypeAudio.WEBM_OPUS:

From 5a46b46acd9c7adde910f4639dedd9a7d4484c63 Mon Sep 17 00:00:00 2001
From: Pascal <admin@serveurperso.com>
Date: Fri, 29 May 2026 23:02:40 +0200
Subject: [PATCH 37/50] app: add llama update self updater (#23865)

* wip: llama update POC

* cleaning: llama update

* llama-gen-docs

* app: delegate llama update to the install script

* app: spawn the installer detached so llama update can replace a running binary

* cleaning: inline llama update into llama.cpp, drop app-update.{cpp,h}

* app: make llama_update static

Address review from @angt
---
 app/llama.cpp              | 14 ++++++++++++++
 tools/cli/README.md        |  4 ++--
 tools/completion/README.md |  4 ++--
 tools/server/README.md     | 10 ++++++----
 4 files changed, 24 insertions(+), 8 deletions(-)

diff --git a/app/llama.cpp b/app/llama.cpp
index d898bfdfb24..30b09f9ef7e 100644
--- a/app/llama.cpp
+++ b/app/llama.cpp
@@ -20,6 +20,18 @@ int llama_fit_params(int argc, char ** argv);
 int llama_quantize(int argc, char ** argv);
 int llama_perplexity(int argc, char ** argv);
 
+// hands the update over to the install script, which downloads and swaps the binary.
+// returns 0 only if the command succeeded: a raw POSIX wait status (e.g. 256) would truncate to exit code 0
+static int llama_update(int argc, char ** argv) {
+    (void) argc;
+    (void) argv;
+#if defined(_WIN32)
+    return system("powershell -NoProfile -ExecutionPolicy Bypass -Command \"irm https://llama.app/install.ps1 | iex\"") == 0 ? 0 : 1;
+#else
+    return system("curl -fsSL https://llama.app/install.sh | sh") == 0 ? 0 : 1;
+#endif
+}
+
 static const char * progname;
 
 static int help(int argc, char ** argv);
@@ -37,6 +49,7 @@ struct command {
 static const command cmds[] = {
     {"serve",         "HTTP API server",                                    {"server"},   false, llama_server       },
     {"cli",           "Command-line interactive interface",                 {"client"},   false, llama_cli          },
+    {"update",        "Update llama to the latest release",                 {},           false, llama_update       },
     {"completion",    "Text completion",                                    {"complete"}, true,  llama_completion   },
     {"bench",         "Benchmark prompt processing and text generation",    {},           true,  llama_bench        },
     {"batched-bench", "Benchmark batched decoding performance",             {},           true,  llama_batched_bench},
@@ -94,6 +107,7 @@ static bool matches(const std::string & arg, const command & cmd) {
 
 int main(int argc, char ** argv) {
     progname = argv[0];
+
     const std::string arg = argc >= 2 ? argv[1] : "help";
 
     for (const auto & cmd : cmds) {
diff --git a/tools/cli/README.md b/tools/cli/README.md
index f34417a835d..b11aa45ce95 100644
--- a/tools/cli/README.md
+++ b/tools/cli/README.md
@@ -170,8 +170,8 @@
 | `-rea, --reasoning [on\|off\|auto]` | Use reasoning/thinking in the chat ('on', 'off', or 'auto', default: 'auto' (detect from template))<br/>(env: LLAMA_ARG_REASONING) |
 | `--reasoning-budget N` | token budget for thinking: -1 for unrestricted, 0 for immediate end, N>0 for token budget (default: -1)<br/>(env: LLAMA_ARG_THINK_BUDGET) |
 | `--reasoning-budget-message MESSAGE` | message injected before the end-of-thinking tag when reasoning budget is exhausted (default: none)<br/>(env: LLAMA_ARG_THINK_BUDGET_MESSAGE) |
-| `--chat-template JINJA_TEMPLATE` | set custom jinja chat template (default: template taken from model's metadata)<br/>if suffix/prefix are specified, template will be disabled<br/>only commonly used templates are accepted (unless --jinja is set before this flag):<br/>list of built-in templates:<br/>bailing, bailing-think, bailing2, chatglm3, chatglm4, chatml, command-r, deepseek, deepseek-ocr, deepseek2, deepseek3, exaone-moe, exaone3, exaone4, falcon3, gemma, gigachat, glmedge, gpt-oss, granite, granite-4.0, grok-2, hunyuan-dense, hunyuan-moe, hunyuan-vl, kimi-k2, llama2, llama2-sys, llama2-sys-bos, llama2-sys-strip, llama3, llama4, megrez, minicpm, mistral-v1, mistral-v3, mistral-v3-tekken, mistral-v7, mistral-v7-tekken, monarch, openchat, orion, pangu-embedded, phi3, phi4, rwkv-world, seed_oss, smolvlm, solar-open, vicuna, vicuna-orca, yandex, zephyr<br/>(env: LLAMA_ARG_CHAT_TEMPLATE) |
-| `--chat-template-file JINJA_TEMPLATE_FILE` | set custom jinja chat template file (default: template taken from model's metadata)<br/>if suffix/prefix are specified, template will be disabled<br/>only commonly used templates are accepted (unless --jinja is set before this flag):<br/>list of built-in templates:<br/>bailing, bailing-think, bailing2, chatglm3, chatglm4, chatml, command-r, deepseek, deepseek-ocr, deepseek2, deepseek3, exaone-moe, exaone3, exaone4, falcon3, gemma, gigachat, glmedge, gpt-oss, granite, granite-4.0, grok-2, hunyuan-dense, hunyuan-moe, hunyuan-vl, kimi-k2, llama2, llama2-sys, llama2-sys-bos, llama2-sys-strip, llama3, llama4, megrez, minicpm, mistral-v1, mistral-v3, mistral-v3-tekken, mistral-v7, mistral-v7-tekken, monarch, openchat, orion, pangu-embedded, phi3, phi4, rwkv-world, seed_oss, smolvlm, solar-open, vicuna, vicuna-orca, yandex, zephyr<br/>(env: LLAMA_ARG_CHAT_TEMPLATE_FILE) |
+| `--chat-template JINJA_TEMPLATE` | set custom jinja chat template (default: template taken from model's metadata)<br/>if suffix/prefix are specified, template will be disabled<br/>only commonly used templates are accepted (unless --jinja is set before this flag):<br/>list of built-in templates:<br/>bailing, bailing-think, bailing2, chatglm3, chatglm4, chatml, command-r, deepseek, deepseek-ocr, deepseek2, deepseek3, exaone-moe, exaone3, exaone4, falcon3, gemma, gigachat, glmedge, gpt-oss, granite, granite-4.0, granite-4.1, grok-2, hunyuan-dense, hunyuan-moe, hunyuan-vl, kimi-k2, llama2, llama2-sys, llama2-sys-bos, llama2-sys-strip, llama3, llama4, megrez, minicpm, mistral-v1, mistral-v3, mistral-v3-tekken, mistral-v7, mistral-v7-tekken, monarch, openchat, orion, pangu-embedded, phi3, phi4, rwkv-world, seed_oss, smolvlm, solar-open, vicuna, vicuna-orca, yandex, zephyr<br/>(env: LLAMA_ARG_CHAT_TEMPLATE) |
+| `--chat-template-file JINJA_TEMPLATE_FILE` | set custom jinja chat template file (default: template taken from model's metadata)<br/>if suffix/prefix are specified, template will be disabled<br/>only commonly used templates are accepted (unless --jinja is set before this flag):<br/>list of built-in templates:<br/>bailing, bailing-think, bailing2, chatglm3, chatglm4, chatml, command-r, deepseek, deepseek-ocr, deepseek2, deepseek3, exaone-moe, exaone3, exaone4, falcon3, gemma, gigachat, glmedge, gpt-oss, granite, granite-4.0, granite-4.1, grok-2, hunyuan-dense, hunyuan-moe, hunyuan-vl, kimi-k2, llama2, llama2-sys, llama2-sys-bos, llama2-sys-strip, llama3, llama4, megrez, minicpm, mistral-v1, mistral-v3, mistral-v3-tekken, mistral-v7, mistral-v7-tekken, monarch, openchat, orion, pangu-embedded, phi3, phi4, rwkv-world, seed_oss, smolvlm, solar-open, vicuna, vicuna-orca, yandex, zephyr<br/>(env: LLAMA_ARG_CHAT_TEMPLATE_FILE) |
 | `--skip-chat-parsing, --no-skip-chat-parsing` | force a pure content parser, even if a Jinja template is specified; model will output everything in the content section, including any reasoning and/or tool calls (default: disabled)<br/>(env: LLAMA_ARG_SKIP_CHAT_PARSING) |
 | `--simple-io` | use basic IO for better compatibility in subprocesses and limited consoles |
 | `--spec-draft-hf, -hfd, -hfrd, --hf-repo-draft <user>/<model>[:quant]` | Same as --hf-repo, but for the draft model (default: unused)<br/>(env: LLAMA_ARG_SPEC_DRAFT_HF_REPO) |
diff --git a/tools/completion/README.md b/tools/completion/README.md
index bcaae18f376..d90f8174866 100644
--- a/tools/completion/README.md
+++ b/tools/completion/README.md
@@ -253,8 +253,8 @@ llama-completion.exe -m models\gemma-1.1-7b-it.Q4_K_M.gguf --ignore-eos -n -1
 | `-rea, --reasoning [on\|off\|auto]` | Use reasoning/thinking in the chat ('on', 'off', or 'auto', default: 'auto' (detect from template))<br/>(env: LLAMA_ARG_REASONING) |
 | `--reasoning-budget N` | token budget for thinking: -1 for unrestricted, 0 for immediate end, N>0 for token budget (default: -1)<br/>(env: LLAMA_ARG_THINK_BUDGET) |
 | `--reasoning-budget-message MESSAGE` | message injected before the end-of-thinking tag when reasoning budget is exhausted (default: none)<br/>(env: LLAMA_ARG_THINK_BUDGET_MESSAGE) |
-| `--chat-template JINJA_TEMPLATE` | set custom jinja chat template (default: template taken from model's metadata)<br/>if suffix/prefix are specified, template will be disabled<br/>only commonly used templates are accepted (unless --jinja is set before this flag):<br/>list of built-in templates:<br/>bailing, bailing-think, bailing2, chatglm3, chatglm4, chatml, command-r, deepseek, deepseek-ocr, deepseek2, deepseek3, exaone-moe, exaone3, exaone4, falcon3, gemma, gigachat, glmedge, gpt-oss, granite, granite-4.0, grok-2, hunyuan-dense, hunyuan-moe, hunyuan-vl, kimi-k2, llama2, llama2-sys, llama2-sys-bos, llama2-sys-strip, llama3, llama4, megrez, minicpm, mistral-v1, mistral-v3, mistral-v3-tekken, mistral-v7, mistral-v7-tekken, monarch, openchat, orion, pangu-embedded, phi3, phi4, rwkv-world, seed_oss, smolvlm, solar-open, vicuna, vicuna-orca, yandex, zephyr<br/>(env: LLAMA_ARG_CHAT_TEMPLATE) |
-| `--chat-template-file JINJA_TEMPLATE_FILE` | set custom jinja chat template file (default: template taken from model's metadata)<br/>if suffix/prefix are specified, template will be disabled<br/>only commonly used templates are accepted (unless --jinja is set before this flag):<br/>list of built-in templates:<br/>bailing, bailing-think, bailing2, chatglm3, chatglm4, chatml, command-r, deepseek, deepseek-ocr, deepseek2, deepseek3, exaone-moe, exaone3, exaone4, falcon3, gemma, gigachat, glmedge, gpt-oss, granite, granite-4.0, grok-2, hunyuan-dense, hunyuan-moe, hunyuan-vl, kimi-k2, llama2, llama2-sys, llama2-sys-bos, llama2-sys-strip, llama3, llama4, megrez, minicpm, mistral-v1, mistral-v3, mistral-v3-tekken, mistral-v7, mistral-v7-tekken, monarch, openchat, orion, pangu-embedded, phi3, phi4, rwkv-world, seed_oss, smolvlm, solar-open, vicuna, vicuna-orca, yandex, zephyr<br/>(env: LLAMA_ARG_CHAT_TEMPLATE_FILE) |
+| `--chat-template JINJA_TEMPLATE` | set custom jinja chat template (default: template taken from model's metadata)<br/>if suffix/prefix are specified, template will be disabled<br/>only commonly used templates are accepted (unless --jinja is set before this flag):<br/>list of built-in templates:<br/>bailing, bailing-think, bailing2, chatglm3, chatglm4, chatml, command-r, deepseek, deepseek-ocr, deepseek2, deepseek3, exaone-moe, exaone3, exaone4, falcon3, gemma, gigachat, glmedge, gpt-oss, granite, granite-4.0, granite-4.1, grok-2, hunyuan-dense, hunyuan-moe, hunyuan-vl, kimi-k2, llama2, llama2-sys, llama2-sys-bos, llama2-sys-strip, llama3, llama4, megrez, minicpm, mistral-v1, mistral-v3, mistral-v3-tekken, mistral-v7, mistral-v7-tekken, monarch, openchat, orion, pangu-embedded, phi3, phi4, rwkv-world, seed_oss, smolvlm, solar-open, vicuna, vicuna-orca, yandex, zephyr<br/>(env: LLAMA_ARG_CHAT_TEMPLATE) |
+| `--chat-template-file JINJA_TEMPLATE_FILE` | set custom jinja chat template file (default: template taken from model's metadata)<br/>if suffix/prefix are specified, template will be disabled<br/>only commonly used templates are accepted (unless --jinja is set before this flag):<br/>list of built-in templates:<br/>bailing, bailing-think, bailing2, chatglm3, chatglm4, chatml, command-r, deepseek, deepseek-ocr, deepseek2, deepseek3, exaone-moe, exaone3, exaone4, falcon3, gemma, gigachat, glmedge, gpt-oss, granite, granite-4.0, granite-4.1, grok-2, hunyuan-dense, hunyuan-moe, hunyuan-vl, kimi-k2, llama2, llama2-sys, llama2-sys-bos, llama2-sys-strip, llama3, llama4, megrez, minicpm, mistral-v1, mistral-v3, mistral-v3-tekken, mistral-v7, mistral-v7-tekken, monarch, openchat, orion, pangu-embedded, phi3, phi4, rwkv-world, seed_oss, smolvlm, solar-open, vicuna, vicuna-orca, yandex, zephyr<br/>(env: LLAMA_ARG_CHAT_TEMPLATE_FILE) |
 | `--skip-chat-parsing, --no-skip-chat-parsing` | force a pure content parser, even if a Jinja template is specified; model will output everything in the content section, including any reasoning and/or tool calls (default: disabled)<br/>(env: LLAMA_ARG_SKIP_CHAT_PARSING) |
 | `--simple-io` | use basic IO for better compatibility in subprocesses and limited consoles |
 
diff --git a/tools/server/README.md b/tools/server/README.md
index 87600d9be2f..df30ca64649 100644
--- a/tools/server/README.md
+++ b/tools/server/README.md
@@ -175,6 +175,8 @@ For the full list of features, please refer to [server's changelog](https://gith
 | `-np, --parallel N` | number of server slots (default: -1, -1 = auto)<br/>(env: LLAMA_ARG_N_PARALLEL) |
 | `-cb, --cont-batching, -nocb, --no-cont-batching` | whether to enable continuous batching (a.k.a dynamic batching) (default: enabled)<br/>(env: LLAMA_ARG_CONT_BATCHING) |
 | `-mm, --mmproj FILE` | path to a multimodal projector file. see tools/mtmd/README.md<br/>note: if -hf is used, this argument can be omitted<br/>(env: LLAMA_ARG_MMPROJ) |
+| `-tk, --talker-model FILE` | path to the qwen3-omni talker gguf, enables the /v1/audio/speech endpoint<br/>(env: LLAMA_ARG_TALKER_MODEL) |
+| `-c2w, --code2wav-model FILE` | path to the qwen3-omni code2wav gguf, the talker code detokenizer<br/>(env: LLAMA_ARG_CODE2WAV_MODEL) |
 | `-mmu, --mmproj-url URL` | URL to a multimodal projector file. see tools/mtmd/README.md<br/>(env: LLAMA_ARG_MMPROJ_URL) |
 | `--mmproj-auto, --no-mmproj, --no-mmproj-auto` | whether to use multimodal projector file (if available), useful when using -hf (default: enabled)<br/>(env: LLAMA_ARG_MMPROJ_AUTO) |
 | `--mmproj-offload, --no-mmproj-offload` | whether to enable GPU offloading for multimodal projector (default: enabled)<br/>(env: LLAMA_ARG_MMPROJ_OFFLOAD) |
@@ -200,11 +202,11 @@ For the full list of features, please refer to [server's changelog](https://gith
 | `--embedding, --embeddings` | restrict to only support embedding use case; use only with dedicated embedding models (default: disabled)<br/>(env: LLAMA_ARG_EMBEDDINGS) |
 | `--rerank, --reranking` | enable reranking endpoint on server (default: disabled)<br/>(env: LLAMA_ARG_RERANKING) |
 | `--api-key KEY` | API key to use for authentication, multiple keys can be provided as a comma-separated list (default: none)<br/>(env: LLAMA_API_KEY) |
-| `--api-key-file FNAME` | path to file containing API keys, one per line (default: none)<br/>(env: LLAMA_ARG_API_KEY_FILE) |
+| `--api-key-file FNAME` | path to file containing API keys (default: none)<br/>(env: LLAMA_ARG_API_KEY_FILE) |
 | `--ssl-key-file FNAME` | path to file a PEM-encoded SSL private key<br/>(env: LLAMA_ARG_SSL_KEY_FILE) |
 | `--ssl-cert-file FNAME` | path to file a PEM-encoded SSL certificate<br/>(env: LLAMA_ARG_SSL_CERT_FILE) |
 | `--chat-template-kwargs STRING` | sets additional params for the json template parser, must be a valid json object string, e.g. '{"key1":"value1","key2":"value2"}'<br/>(env: LLAMA_ARG_CHAT_TEMPLATE_KWARGS) |
-| `-to, --timeout N` | server read/write timeout in seconds (default: 600)<br/>(env: LLAMA_ARG_TIMEOUT) |
+| `-to, --timeout N` | server read/write timeout in seconds (default: 3600)<br/>(env: LLAMA_ARG_TIMEOUT) |
 | `--threads-http N` | number of threads used to process HTTP requests (default: -1)<br/>(env: LLAMA_ARG_THREADS_HTTP) |
 | `--cache-prompt, --no-cache-prompt` | whether to enable prompt caching (default: enabled)<br/>(env: LLAMA_ARG_CACHE_PROMPT) |
 | `--cache-reuse N` | min chunk size to attempt reusing from the cache via KV shifting, requires prompt caching to be enabled (default: 0)<br/>[(card)](https://ggml.ai/f0.png)<br/>(env: LLAMA_ARG_CACHE_REUSE) |
@@ -222,8 +224,8 @@ For the full list of features, please refer to [server's changelog](https://gith
 | `-rea, --reasoning [on\|off\|auto]` | Use reasoning/thinking in the chat ('on', 'off', or 'auto', default: 'auto' (detect from template))<br/>(env: LLAMA_ARG_REASONING) |
 | `--reasoning-budget N` | token budget for thinking: -1 for unrestricted, 0 for immediate end, N>0 for token budget (default: -1)<br/>(env: LLAMA_ARG_THINK_BUDGET) |
 | `--reasoning-budget-message MESSAGE` | message injected before the end-of-thinking tag when reasoning budget is exhausted (default: none)<br/>(env: LLAMA_ARG_THINK_BUDGET_MESSAGE) |
-| `--chat-template JINJA_TEMPLATE` | set custom jinja chat template (default: template taken from model's metadata)<br/>if suffix/prefix are specified, template will be disabled<br/>only commonly used templates are accepted (unless --jinja is set before this flag):<br/>list of built-in templates:<br/>bailing, bailing-think, bailing2, chatglm3, chatglm4, chatml, command-r, deepseek, deepseek-ocr, deepseek2, deepseek3, exaone-moe, exaone3, exaone4, falcon3, gemma, gigachat, glmedge, gpt-oss, granite, granite-4.0, grok-2, hunyuan-dense, hunyuan-moe, hunyuan-vl, kimi-k2, llama2, llama2-sys, llama2-sys-bos, llama2-sys-strip, llama3, llama4, megrez, minicpm, mistral-v1, mistral-v3, mistral-v3-tekken, mistral-v7, mistral-v7-tekken, monarch, openchat, orion, pangu-embedded, phi3, phi4, rwkv-world, seed_oss, smolvlm, solar-open, vicuna, vicuna-orca, yandex, zephyr<br/>(env: LLAMA_ARG_CHAT_TEMPLATE) |
-| `--chat-template-file JINJA_TEMPLATE_FILE` | set custom jinja chat template file (default: template taken from model's metadata)<br/>if suffix/prefix are specified, template will be disabled<br/>only commonly used templates are accepted (unless --jinja is set before this flag):<br/>list of built-in templates:<br/>bailing, bailing-think, bailing2, chatglm3, chatglm4, chatml, command-r, deepseek, deepseek-ocr, deepseek2, deepseek3, exaone-moe, exaone3, exaone4, falcon3, gemma, gigachat, glmedge, gpt-oss, granite, granite-4.0, grok-2, hunyuan-dense, hunyuan-moe, hunyuan-vl, kimi-k2, llama2, llama2-sys, llama2-sys-bos, llama2-sys-strip, llama3, llama4, megrez, minicpm, mistral-v1, mistral-v3, mistral-v3-tekken, mistral-v7, mistral-v7-tekken, monarch, openchat, orion, pangu-embedded, phi3, phi4, rwkv-world, seed_oss, smolvlm, solar-open, vicuna, vicuna-orca, yandex, zephyr<br/>(env: LLAMA_ARG_CHAT_TEMPLATE_FILE) |
+| `--chat-template JINJA_TEMPLATE` | set custom jinja chat template (default: template taken from model's metadata)<br/>if suffix/prefix are specified, template will be disabled<br/>only commonly used templates are accepted (unless --jinja is set before this flag):<br/>list of built-in templates:<br/>bailing, bailing-think, bailing2, chatglm3, chatglm4, chatml, command-r, deepseek, deepseek-ocr, deepseek2, deepseek3, exaone-moe, exaone3, exaone4, falcon3, gemma, gigachat, glmedge, gpt-oss, granite, granite-4.0, granite-4.1, grok-2, hunyuan-dense, hunyuan-moe, hunyuan-vl, kimi-k2, llama2, llama2-sys, llama2-sys-bos, llama2-sys-strip, llama3, llama4, megrez, minicpm, mistral-v1, mistral-v3, mistral-v3-tekken, mistral-v7, mistral-v7-tekken, monarch, openchat, orion, pangu-embedded, phi3, phi4, rwkv-world, seed_oss, smolvlm, solar-open, vicuna, vicuna-orca, yandex, zephyr<br/>(env: LLAMA_ARG_CHAT_TEMPLATE) |
+| `--chat-template-file JINJA_TEMPLATE_FILE` | set custom jinja chat template file (default: template taken from model's metadata)<br/>if suffix/prefix are specified, template will be disabled<br/>only commonly used templates are accepted (unless --jinja is set before this flag):<br/>list of built-in templates:<br/>bailing, bailing-think, bailing2, chatglm3, chatglm4, chatml, command-r, deepseek, deepseek-ocr, deepseek2, deepseek3, exaone-moe, exaone3, exaone4, falcon3, gemma, gigachat, glmedge, gpt-oss, granite, granite-4.0, granite-4.1, grok-2, hunyuan-dense, hunyuan-moe, hunyuan-vl, kimi-k2, llama2, llama2-sys, llama2-sys-bos, llama2-sys-strip, llama3, llama4, megrez, minicpm, mistral-v1, mistral-v3, mistral-v3-tekken, mistral-v7, mistral-v7-tekken, monarch, openchat, orion, pangu-embedded, phi3, phi4, rwkv-world, seed_oss, smolvlm, solar-open, vicuna, vicuna-orca, yandex, zephyr<br/>(env: LLAMA_ARG_CHAT_TEMPLATE_FILE) |
 | `--skip-chat-parsing, --no-skip-chat-parsing` | force a pure content parser, even if a Jinja template is specified; model will output everything in the content section, including any reasoning and/or tool calls (default: disabled)<br/>(env: LLAMA_ARG_SKIP_CHAT_PARSING) |
 | `--prefill-assistant, --no-prefill-assistant` | whether to prefill the assistant's response if the last message is an assistant message (default: prefill enabled)<br/>when this flag is set, if the last message is an assistant message then it will be treated as a full message and not prefilled<br/><br/>(env: LLAMA_ARG_PREFILL_ASSISTANT) |
 | `-sps, --slot-prompt-similarity SIMILARITY` | how much the prompt of a request must match the prompt of a slot in order to use that slot (default: 0.10, 0.0 = disabled) |

From 689a9a470e5d96a853731b2accd463475e5e9a19 Mon Sep 17 00:00:00 2001
From: Ruixiang Wang <wangruixiang07@outlook.com>
Date: Fri, 29 May 2026 23:09:47 +0200
Subject: [PATCH 38/50] server-bench : add speed-bench for speculative decoding
 benchmarking (#23869)

* spec: add speed-bench support for benchmarking

* speed-bench : add trailing newline to requirements.txt

* speed-bench : bump datasets to 4.8.0 to fix ty check

* server-bench : remove now-unused type: ignore after datasets bump
---
 docs/speculative.md                           |   5 +
 requirements/requirements-server-bench.txt    |   2 +-
 scripts/server-bench.py                       |   2 +-
 tools/server/bench/speed-bench/README.md      | 117 +++++
 .../server/bench/speed-bench/requirements.txt |   3 +
 tools/server/bench/speed-bench/speed_bench.py | 432 ++++++++++++++++++
 .../bench/speed-bench/speed_bench_compare.py  |  84 ++++
 7 files changed, 643 insertions(+), 2 deletions(-)
 create mode 100644 tools/server/bench/speed-bench/README.md
 create mode 100644 tools/server/bench/speed-bench/requirements.txt
 create mode 100644 tools/server/bench/speed-bench/speed_bench.py
 create mode 100644 tools/server/bench/speed-bench/speed_bench_compare.py

diff --git a/docs/speculative.md b/docs/speculative.md
index 041ff58038d..43d18185891 100644
--- a/docs/speculative.md
+++ b/docs/speculative.md
@@ -323,3 +323,8 @@ statistics ngram_map_k: #calls(b,g,a) = 6 1690 26, #gen drafts = 26, #acc drafts
 - `#gen tokens`: number of tokens generated by this implementation (including rejected tokens)
 - `#acc tokens`: number of tokens accepted by the main model
 - `dur(b,g,a): durations of begin (new prompt), generation and accumulation (process acceptance).
+
+## Benchmarking
+
+To measure the end-to-end effect of speculative decoding (throughput, latency, and draft acceptance) across diverse prompts, see the SPEED-Bench client in [tools/server/bench/speed-bench](../tools/server/bench/speed-bench/README.md).
+It runs against a running `llama-server` and can compare a baseline run against a speculative-decoding run.
diff --git a/requirements/requirements-server-bench.txt b/requirements/requirements-server-bench.txt
index ea5849fa104..fb3b0d2664b 100644
--- a/requirements/requirements-server-bench.txt
+++ b/requirements/requirements-server-bench.txt
@@ -1,4 +1,4 @@
-datasets~=3.2.0
+datasets~=4.8.0
 matplotlib~=3.10.0
 numpy~=1.26.4
 requests~=2.32.3
diff --git a/scripts/server-bench.py b/scripts/server-bench.py
index 1b557a495a5..2eabb3bce85 100755
--- a/scripts/server-bench.py
+++ b/scripts/server-bench.py
@@ -25,7 +25,7 @@ def get_prompts_text(dataset_name: str, n_prompts: int) -> Optional[list[str]]:
     ret = []
     if dataset_name.lower() == "mmlu":
         logger.info("Loading MMLU dataset...")
-        ret = datasets.load_dataset("cais/mmlu", "all")["test"]["question"]  # type: ignore
+        ret = datasets.load_dataset("cais/mmlu", "all")["test"]["question"]
     else:
         return None
     if n_prompts >= 0:
diff --git a/tools/server/bench/speed-bench/README.md b/tools/server/bench/speed-bench/README.md
new file mode 100644
index 00000000000..8d3fcd804c4
--- /dev/null
+++ b/tools/server/bench/speed-bench/README.md
@@ -0,0 +1,117 @@
+# SPEED-Bench server benchmark
+
+A lightweight [SPEED-Bench](https://huggingface.co/datasets/nvidia/SPEED-Bench) client for benchmarking an already-running `llama-server` through its OpenAI-compatible API. It is primarily meant to evaluate speculative decoding (draft model, n-gram, MTP, EAGLE3, ...) by reporting per-category throughput, latency, and draft acceptance.
+
+The dataset handling follows the [aiperf SPEED-Bench tutorial](https://github.com/ai-dynamo/aiperf/blob/main/docs/tutorials/speed-bench.md), which also documents the dataset layout in more detail.
+
+## Install
+
+```bash
+pip install -r tools/server/bench/speed-bench/requirements.txt
+```
+
+## Start a server
+
+The client does not launch the server, so start `llama-server` yourself first. If you care about throughput numbers, set the client `--concurrency` to the server's slot count (`-np`):
+
+```bash
+llama-server \
+  -m target.gguf \
+  -c 8192 \
+  --port 8080 \
+  -ngl 99 -fa on \
+  -np 1 \
+  --jinja
+```
+
+For speculative decoding, start the server with the appropriate flags for your setup (e.g. a draft model with `-md`, or `--spec-type ngram-mod`). See the [speculative decoding doc](../../../../docs/speculative.md) for details.
+
+## Run
+
+```bash
+python tools/server/bench/speed-bench/speed_bench.py \
+  --url localhost:8080 \
+  --bench qualitative \
+  --category coding \
+  --osl 1024 \
+  --concurrency 1
+```
+
+## Options
+
+| Option | Default | Description |
+| --- | --- | --- |
+| `--url` | `localhost:8080` | Server URL. The scheme and `/v1` are optional and a trailing slash is fine, so `localhost:8080` and `http://localhost:8080/v1/` both work. |
+| `--model` | none | Optional `model` field sent in each request. |
+| `--bench` | `qualitative` | SPEED-Bench config, e.g. `qualitative`, `throughput_1k`. See [available dataset variants](https://github.com/ai-dynamo/aiperf/blob/main/docs/tutorials/speed-bench.md#available-dataset-variants). |
+| `--category` | `all` | Category filter within the bench; comma-separated list or `all`. For `qualitative` the categories are `coding`, `humanities`, `math`, `multilingual`, `qa`, `rag`, `reasoning`, `roleplay`, `stem`, `summarization`, `writing`. For the `throughput_{ISL}` splits they are `high_entropy`, `low_entropy`, `mixed`. |
+| `--osl` | `1024` | Output sequence length, mapped to `max_tokens`. |
+| `--extra-inputs` | `{"temperature":0}` | Extra request fields as a JSON object. |
+| `--concurrency` | `1` | Concurrent client requests; usually match the server's `-np`. |
+| `--limit` | none | Max samples per category (handy for smoke tests). |
+| `--timeout` | `600` | Per-request timeout in seconds. |
+| `--output` | none | Save raw per-request results and the summary to JSON. |
+
+A few common ones:
+
+- `--category all` runs every category in the bench.
+- `--category coding,math` runs just those two.
+- `--bench throughput_8k` runs a fixed-input-length throughput split.
+- `--limit 8` keeps at most 8 samples per category, which is enough for a quick check.
+
+The `throughput_{ISL}` splits use fixed input lengths (1k - 32k), so they are handy for long-context testing and for comparing different `llama-server` batching settings (e.g. sweeping `-ub` / `--ubatch-size`) on prompts of a known size. Make sure the server `-c` is large enough for the chosen split. When raising `-ub`, also raise `-b` to at least the same value, since the physical ubatch cannot exceed the logical batch.
+
+When `--output` is given, the JSON file holds the run `config`, the `selected_samples` / `completed_samples` / `failed_samples` counts, the per-category `summary` rows, and the per-sample `results`.
+
+## Metrics
+
+The summary prints one row per category plus an `overall` row:
+
+- `samples` - how many samples finished successfully.
+- `avg_prompt_t/s` - prefill throughput from llama.cpp (`timings.prompt_per_second`), averaged over the category's samples.
+- `avg_pred_t/s` - decode throughput from llama.cpp (`timings.predicted_per_second`), averaged over the category's samples.
+- `avg_latency` - average end-to-end request latency seen by the client.
+- `accept_rate` - `accepted / draft_n` over the category, or `n/a` if nothing was drafted (`draft_n == 0`).
+
+## Baseline vs speculative decoding
+
+Save a run from each server with `--output`, then diff the two JSON files with `speed_bench_compare.py`.
+
+First, start a plain `llama-server` (no speculative decoding) and save a baseline:
+
+```bash
+python tools/server/bench/speed-bench/speed_bench.py \
+  --url localhost:8080 \
+  --bench qualitative \
+  --category all \
+  --osl 1024 \
+  --concurrency 1 \
+  --output baseline.json
+```
+
+Then restart `llama-server` with speculative decoding enabled and save another run:
+
+```bash
+python tools/server/bench/speed-bench/speed_bench.py \
+  --url localhost:8080 \
+  --bench qualitative \
+  --category all \
+  --osl 1024 \
+  --concurrency 1 \
+  --output spec.json
+```
+
+Finally compare the two:
+
+```bash
+python tools/server/bench/speed-bench/speed_bench_compare.py \
+  --baseline baseline.json \
+  --speculative spec.json
+```
+
+The comparison table adds:
+
+- `decode_speedup = spec_avg_pred_t/s / base_avg_pred_t/s`
+- `latency_speedup = base_avg_latency / spec_avg_latency`
+
+Keep `--bench`, `--category`, `--osl`, and `--limit` the same across both runs, otherwise they won't be using the same prompts.
diff --git a/tools/server/bench/speed-bench/requirements.txt b/tools/server/bench/speed-bench/requirements.txt
new file mode 100644
index 00000000000..a524c2f5193
--- /dev/null
+++ b/tools/server/bench/speed-bench/requirements.txt
@@ -0,0 +1,3 @@
+datasets
+requests
+tqdm
diff --git a/tools/server/bench/speed-bench/speed_bench.py b/tools/server/bench/speed-bench/speed_bench.py
new file mode 100644
index 00000000000..adb378a6bf0
--- /dev/null
+++ b/tools/server/bench/speed-bench/speed_bench.py
@@ -0,0 +1,432 @@
+#!/usr/bin/env python3
+from __future__ import annotations
+
+import argparse
+import concurrent.futures
+import json
+import statistics
+import sys
+import time
+from dataclasses import asdict, dataclass
+from typing import Any
+from urllib.parse import urlparse
+
+import requests
+from datasets import get_dataset_config_names, load_dataset
+from tqdm import tqdm
+
+
+DATASET_REPO = "nvidia/SPEED-Bench"
+
+@dataclass
+class Sample:
+    id: str
+    category: str
+    turns: list[str]
+
+
+@dataclass
+class RequestResult:
+    id: str
+    category: str
+    ok: bool
+    turns: int
+    latency_s: float
+    prompt_tokens: int
+    completion_tokens: int
+    total_tokens: int
+    finish_reason: str | None
+    draft_n: int
+    draft_n_accepted: int
+    prompt_ms: float | None
+    predicted_ms: float | None
+    prompt_per_second: float | None
+    predicted_per_second: float | None
+    error: str | None
+
+
+def normalize_base_url(url: str) -> str:
+    url = url.strip().rstrip("/")
+    if not url:
+        raise ValueError("--url cannot be empty")
+    if "://" not in url:
+        url = "http://" + url
+    parsed = urlparse(url)
+    if not parsed.scheme or not parsed.netloc:
+        raise ValueError(f"invalid --url: {url}")
+    if not parsed.path.rstrip("/").endswith("/v1"):
+        url = url + "/v1"
+    return url.rstrip("/")
+
+
+def parse_extra_inputs(value: str) -> dict[str, Any]:
+    extra = json.loads(value)
+    if not isinstance(extra, dict):
+        raise ValueError("--extra-inputs must be a JSON object")
+    return extra
+
+
+def extract_turns(row: dict[str, Any]) -> list[str]:
+    turns = row.get("turns")
+    if isinstance(turns, list) and turns:
+        clean_turns = [str(turn).strip() for turn in turns if turn and str(turn).strip()]
+        if clean_turns:
+            return clean_turns
+    raise ValueError("missing or empty turns")
+
+
+def load_samples(args: argparse.Namespace) -> list[Sample]:
+    """Load SPEED-Bench rows for args.bench, filtered by args.category and capped at args.limit per category."""
+    bench_names = get_dataset_config_names(DATASET_REPO)
+    if args.bench not in bench_names:
+        raise ValueError(f"unknown --bench {args.bench!r}; available benches: {', '.join(bench_names)}")
+
+    dataset = load_dataset(DATASET_REPO, name=args.bench, split="test")
+    categories = list(dict.fromkeys(str(category) for category in dataset["category"]))
+    requested_categories = None
+    if args.category != "all":
+        requested_list = [category.strip() for category in args.category.split(",") if category.strip()]
+        if not requested_list:
+            raise ValueError(
+                f"--category must be 'all' or a comma-separated list; available categories: {', '.join(categories)}"
+            )
+        requested_categories = set(requested_list)
+        unknown_categories = [category for category in requested_list if category not in categories]
+        if unknown_categories:
+            unknown = ", ".join(unknown_categories)
+            raise ValueError(
+                f"unknown --category {unknown!r} for bench {args.bench!r}; "
+                f"available categories: all, {', '.join(categories)}"
+            )
+
+    samples: list[Sample] = []
+    samples_per_category: dict[str, int] = {}
+    skipped = 0
+    # Rows lacking a category, question_id, or usable turns are counted in `skipped` and dropped.
+    for index, row_raw in enumerate(dataset):
+        row = dict(row_raw)
+        category_raw = row.get("category")
+        if not isinstance(category_raw, str) or not category_raw.strip():
+            skipped += 1
+            continue
+        category = category_raw.strip()
+        if requested_categories is not None and category not in requested_categories:
+            continue
+        if args.limit is not None and samples_per_category.get(category, 0) >= args.limit:
+            continue
+
+        try:
+            turns = extract_turns(row)
+        except ValueError:
+            skipped += 1
+            continue
+        question_id = row.get("question_id")
+        if not isinstance(question_id, str) or not question_id.strip():
+            skipped += 1
+            continue
+        sample_id = question_id.strip()
+        samples.append(Sample(id=sample_id, category=category, turns=turns))
+        samples_per_category[category] = samples_per_category.get(category, 0) + 1
+
+    if not samples:
+        raise RuntimeError(f"no samples selected from bench={args.bench} category={args.category}")
+
+    if skipped:
+        print(f"speed_bench: skipped {skipped} rows missing a category, question_id, or usable turns")
+    return samples
+
+
+def parse_completion_response(data: dict[str, Any]) -> tuple[dict[str, Any], dict[str, Any], str | None, str]:
+    usage = data.get("usage") or {}
+    timings = data.get("timings") or {}
+    finish_reason = None
+    content = ""
+    choices = data.get("choices")
+    if isinstance(choices, list) and choices and isinstance(choices[0], dict):
+        choice = choices[0]
+        finish_reason = choice.get("finish_reason")
+        message = choice.get("message")
+        if isinstance(message, dict) and isinstance(message.get("content"), str):
+            content = message["content"]
+        elif isinstance(choice.get("text"), str):
+            content = choice["text"]
+    return usage, timings, finish_reason, content
+
+
+def run_request(
+    endpoint: str,
+    model: str | None,
+    messages: list[dict[str, str]],
+    osl: int,
+    extra_inputs: dict[str, Any],
+    timeout: float,
+) -> tuple[dict[str, Any], float]:
+    payload: dict[str, Any] = {
+        "messages": messages,
+        "max_tokens": osl,
+        "stream": False,
+    }
+    if model:
+        payload["model"] = model
+    payload.update(extra_inputs)
+    payload["max_tokens"] = osl
+
+    start = time.perf_counter()
+    response = requests.post(endpoint, json=payload, timeout=timeout)
+    latency_s = time.perf_counter() - start
+    if response.status_code != 200:
+        body = response.text[:500].replace("\n", "\\n")
+        raise RuntimeError(f"HTTP {response.status_code}: {body}")
+    return response.json(), latency_s
+
+
+def run_one(
+    sample: Sample,
+    endpoint: str,
+    model: str | None,
+    osl: int,
+    extra_inputs: dict[str, Any],
+    timeout: float,
+) -> RequestResult:
+    selected_turns = sample.turns
+    messages: list[dict[str, str]] = []
+    total_latency_s = 0.0
+    prompt_tokens = 0
+    completion_tokens = 0
+    total_tokens = 0
+    draft_n = 0
+    draft_n_accepted = 0
+    prompt_ms = 0.0
+    predicted_ms = 0.0
+    prompt_per_second = None
+    predicted_per_second = None
+    finish_reason: str | None = None
+    try:
+        for turn in selected_turns:
+            messages.append({"role": "user", "content": turn})
+            data, latency_s = run_request(endpoint, model, messages, osl, extra_inputs, timeout)
+            total_latency_s += latency_s
+            usage, timings, finish_reason, assistant_text = parse_completion_response(data)
+
+            turn_prompt_tokens = int(usage.get("prompt_tokens") or timings.get("prompt_n") or 0)
+            turn_completion_tokens_count = int(usage.get("completion_tokens") or timings.get("predicted_n") or 0)
+            turn_total_tokens_count = int(usage.get("total_tokens") or (turn_prompt_tokens + turn_completion_tokens_count))
+            prompt_tokens += turn_prompt_tokens
+            completion_tokens += turn_completion_tokens_count
+            total_tokens += turn_total_tokens_count
+            draft_n += int(timings.get("draft_n") or 0)
+            draft_n_accepted += int(timings.get("draft_n_accepted") or 0)
+            prompt_ms += float(timings.get("prompt_ms") or 0)
+            predicted_ms += float(timings.get("predicted_ms") or 0)
+            if len(selected_turns) == 1 and isinstance(timings.get("prompt_per_second"), (int, float)):
+                prompt_per_second = float(timings["prompt_per_second"])
+            if len(selected_turns) == 1 and isinstance(timings.get("predicted_per_second"), (int, float)):
+                predicted_per_second = float(timings["predicted_per_second"])
+
+            messages.append({"role": "assistant", "content": assistant_text})
+
+        if total_tokens == 0:
+            total_tokens = prompt_tokens + completion_tokens
+        if len(selected_turns) > 1:
+            prompt_per_second = (prompt_tokens / (prompt_ms / 1000)) if prompt_ms > 0 else None
+            predicted_per_second = (completion_tokens / (predicted_ms / 1000)) if predicted_ms > 0 else None
+
+        return RequestResult(
+            id=sample.id,
+            category=sample.category,
+            ok=True,
+            turns=len(selected_turns),
+            latency_s=total_latency_s,
+            prompt_tokens=prompt_tokens,
+            completion_tokens=completion_tokens,
+            total_tokens=total_tokens,
+            finish_reason=finish_reason,
+            draft_n=draft_n,
+            draft_n_accepted=draft_n_accepted,
+            prompt_ms=prompt_ms if prompt_ms > 0 else None,
+            predicted_ms=predicted_ms if predicted_ms > 0 else None,
+            prompt_per_second=prompt_per_second,
+            predicted_per_second=predicted_per_second,
+            error=None,
+        )
+    except Exception as exc:
+        return RequestResult(
+            id=sample.id,
+            category=sample.category,
+            ok=False,
+            turns=len(selected_turns),
+            latency_s=total_latency_s,
+            prompt_tokens=0,
+            completion_tokens=0,
+            total_tokens=0,
+            finish_reason=None,
+            draft_n=0,
+            draft_n_accepted=0,
+            prompt_ms=None,
+            predicted_ms=None,
+            prompt_per_second=None,
+            predicted_per_second=None,
+            error=str(exc),
+        )
+
+
+def summarize_group(category: str, results: list[RequestResult]) -> dict[str, Any]:
+    ok_results = [result for result in results if result.ok]
+    latencies = [result.latency_s for result in ok_results]
+    server_prompt_speeds = [
+        result.prompt_per_second
+        for result in ok_results
+        if result.prompt_per_second is not None
+    ]
+    server_completion_speeds = [
+        result.predicted_per_second
+        for result in ok_results
+        if result.predicted_per_second is not None
+    ]
+    turns = sum(result.turns for result in ok_results)
+    draft_n = sum(result.draft_n for result in ok_results)
+    accepted = sum(result.draft_n_accepted for result in ok_results)
+
+    return {
+        "category": category,
+        "requests": len(ok_results),
+        "turns": turns,
+        "failed": len(results) - len(ok_results),
+        "avg_prompt_t_s": statistics.mean(server_prompt_speeds) if server_prompt_speeds else None,
+        "avg_pred_t_s": statistics.mean(server_completion_speeds) if server_completion_speeds else None,
+        "avg_latency": statistics.mean(latencies) if latencies else None,
+        "draft_n": draft_n,
+        "accepted": accepted,
+        "accept_rate": (accepted / draft_n) if draft_n > 0 else None,
+    }
+
+
+def fmt_value(value: Any, kind: str = "") -> str:
+    if value is None:
+        return "n/a"
+    if kind == "int":
+        return str(int(value))
+    if kind == "rate":
+        return f"{float(value):.4f}"
+    if kind == "seconds":
+        return f"{float(value):.3f}s"
+    if kind == "speed":
+        return f"{float(value):.2f}"
+    if kind == "speedup":
+        return f"{float(value):.2f}x"
+    return str(value)
+
+
+def print_table(rows: list[dict[str, Any]]) -> None:
+    columns = [
+        ("category", "category", ""),
+        ("samples", "requests", "int"),
+        ("avg_prompt_t/s", "avg_prompt_t_s", "speed"),
+        ("avg_pred_t/s", "avg_pred_t_s", "speed"),
+        ("avg_latency", "avg_latency", "seconds"),
+        ("accept_rate", "accept_rate", "rate"),
+    ]
+    print_rows(rows, columns)
+
+
+def print_rows(rows: list[dict[str, Any]], columns: list[tuple[str, str, str]]) -> None:
+    rendered_rows = []
+    for row in rows:
+        rendered_rows.append([fmt_value(row.get(key), kind) for _, key, kind in columns])
+
+    widths = [len(header) for header, _, _ in columns]
+    for rendered in rendered_rows:
+        for i, cell in enumerate(rendered):
+            widths[i] = max(widths[i], len(cell))
+
+    header = "  ".join(header.ljust(widths[i]) for i, (header, _, _) in enumerate(columns))
+    print(header)
+    print("  ".join("-" * width for width in widths))
+    for rendered in rendered_rows:
+        print("  ".join(cell.ljust(widths[i]) for i, cell in enumerate(rendered)))
+
+
+def save_output(path: str, args: argparse.Namespace, samples: list[Sample], results: list[RequestResult], summary: list[dict[str, Any]]) -> None:
+    payload = {
+        "config": {
+            "url": args.url,
+            "model": args.model,
+            "bench": args.bench,
+            "category": args.category,
+            "osl": args.osl,
+            "concurrency": args.concurrency,
+            "extra_inputs": args.extra_inputs,
+        },
+        "selected_samples": len(samples),
+        "completed_samples": sum(1 for result in results if result.ok),
+        "failed_samples": sum(1 for result in results if not result.ok),
+        "summary": summary,
+        "results": [asdict(result) for result in results],
+    }
+    with open(path, "w", encoding="utf-8") as f:
+        json.dump(payload, f, indent=2, sort_keys=True)
+
+
+def main(argv: list[str] | None = None) -> int:
+    parser = argparse.ArgumentParser(description="Run SPEED-Bench against an OpenAI-compatible llama-server.")
+    parser.add_argument("--url", default="localhost:8080", help="Server URL, for example localhost:8080 or http://localhost:8080/v1")
+    parser.add_argument("--model", default=None, help="Optional model name to send in OpenAI requests")
+    parser.add_argument("--bench", default="qualitative", help="SPEED-Bench config to run, for example qualitative or throughput_1k")
+    parser.add_argument("--category", default="all", help="Category to run within the selected bench; use all for no category filter")
+    parser.add_argument("--osl", type=int, default=4096, help="Output sequence length, mapped to max_tokens")
+    parser.add_argument("--extra-inputs", default='{"temperature":0}', help="Extra request fields as a JSON object")
+    parser.add_argument("--concurrency", type=int, default=1, help="Concurrent client requests; usually match llama-server --np")
+    parser.add_argument("--limit", type=int, default=None, help="Optional sample limit per category for smoke tests")
+    parser.add_argument("--timeout", type=float, default=600, help="Per-request timeout in seconds")
+    parser.add_argument("--output", default=None, help="Optional path to save raw results JSON")
+    args = parser.parse_args(argv)
+    try:
+        base_url = normalize_base_url(args.url)
+        endpoint = base_url + "/chat/completions"
+        extra_inputs = parse_extra_inputs(args.extra_inputs)
+        args.extra_inputs = extra_inputs
+        samples = load_samples(args)
+    except Exception as exc:
+        print(f"speed_bench: setup failed: {exc}", file=sys.stderr)
+        return 2
+
+    print(f"speed_bench: loaded {len(samples)} samples from bench={args.bench} category={args.category}")
+
+    results: list[RequestResult] = []
+    started = time.perf_counter()
+    with concurrent.futures.ThreadPoolExecutor(max_workers=args.concurrency) as executor:
+        futures = [
+            executor.submit(run_one, sample, endpoint, args.model, args.osl, extra_inputs, args.timeout)
+            for sample in samples
+        ]
+        for future in tqdm(concurrent.futures.as_completed(futures), total=len(futures), desc="speed_bench", unit="sample"):
+            result = future.result()
+            results.append(result)
+
+    elapsed = time.perf_counter() - started
+    categories = list(dict.fromkeys(sample.category for sample in samples))
+    summary = [
+        summarize_group(category, [result for result in results if result.category == category])
+        for category in categories
+    ]
+    summary.append(summarize_group("overall", results))
+    print()
+    print(f"Summary (elapsed={elapsed:.2f}s)")
+    print_table(summary)
+
+    if args.output:
+        save_output(args.output, args, samples, results, summary)
+        print(f"\nspeed_bench: wrote {args.output}")
+
+    failed = sum(1 for result in results if not result.ok)
+    if failed:
+        print(f"\nspeed_bench: {failed} samples failed", file=sys.stderr)
+        first_error = next((result.error for result in results if result.error), None)
+        if first_error:
+            print(f"first error: {first_error}", file=sys.stderr)
+        return 1
+    return 0
+
+
+if __name__ == "__main__":
+    raise SystemExit(main())
diff --git a/tools/server/bench/speed-bench/speed_bench_compare.py b/tools/server/bench/speed-bench/speed_bench_compare.py
new file mode 100644
index 00000000000..070ab57db5d
--- /dev/null
+++ b/tools/server/bench/speed-bench/speed_bench_compare.py
@@ -0,0 +1,84 @@
+#!/usr/bin/env python3
+from __future__ import annotations
+
+import argparse
+import json
+import sys
+from typing import Any
+
+from speed_bench import fmt_value, print_rows
+
+
+def load_summary(path: str) -> list[dict[str, Any]]:
+    with open(path, "r", encoding="utf-8") as f:
+        data = json.load(f)
+    summary = data.get("summary")
+    if not isinstance(summary, list):
+        raise ValueError(f"{path} does not contain a summary list")
+    return summary
+
+
+def compare_rows(baseline: list[dict[str, Any]], speculative: list[dict[str, Any]]) -> list[dict[str, Any]]:
+    baseline_by_category = {row["category"]: row for row in baseline}
+    comparisons = []
+    for row in speculative:
+        base = baseline_by_category.get(row["category"])
+        if not base:
+            continue
+        base_speed = base.get("avg_pred_t_s")
+        spec_speed = row.get("avg_pred_t_s")
+        base_latency = base.get("avg_latency")
+        spec_latency = row.get("avg_latency")
+        comparisons.append(
+            {
+                "category": row["category"],
+                "base_avg_pred_t_s": base_speed,
+                "spec_avg_pred_t_s": spec_speed,
+                "decode_speedup": (spec_speed / base_speed) if base_speed and spec_speed else None,
+                "base_avg_latency": base_latency,
+                "spec_avg_latency": spec_latency,
+                "latency_speedup": (base_latency / spec_latency) if base_latency and spec_latency else None,
+                "accept_rate": row.get("accept_rate"),
+            }
+        )
+    return comparisons
+
+
+def print_comparison(rows: list[dict[str, Any]]) -> None:
+    if not rows:
+        print("No overlapping categories found for comparison.")
+        return
+    columns = [
+        ("category", "category", ""),
+        ("base_avg_pred_t/s", "base_avg_pred_t_s", "speed"),
+        ("spec_avg_pred_t/s", "spec_avg_pred_t_s", "speed"),
+        ("decode_speedup", "decode_speedup", "speedup"),
+        ("base_avg_latency", "base_avg_latency", "seconds"),
+        ("spec_avg_latency", "spec_avg_latency", "seconds"),
+        ("latency_speedup", "latency_speedup", "speedup"),
+        ("accept_rate", "accept_rate", "rate"),
+    ]
+    print_rows(rows, columns)
+
+
+def main(argv: list[str] | None = None) -> int:
+    parser = argparse.ArgumentParser(description="Compare two SPEED-Bench runs (baseline vs speculative).")
+    parser.add_argument("--baseline", required=True, help="Baseline results JSON produced by speed_bench.py --output")
+    parser.add_argument("--speculative", required=True, help="Speculative decoding results JSON produced by speed_bench.py --output")
+    args = parser.parse_args(argv)
+
+    try:
+        baseline = load_summary(args.baseline)
+        speculative = load_summary(args.speculative)
+    except Exception as exc:
+        print(f"speed_bench_compare: failed to load inputs: {exc}", file=sys.stderr)
+        return 2
+
+    comparisons = compare_rows(baseline, speculative)
+    print(f"Comparison: baseline={args.baseline} speculative={args.speculative}")
+    print_comparison(comparisons)
+    return 0
+
+
+if __name__ == "__main__":
+    raise SystemExit(main())

From b22da25889cf71bf9654a8d04468e0506bb087f8 Mon Sep 17 00:00:00 2001
From: Reese Levine <reeselevine1@gmail.com>
Date: Fri, 29 May 2026 14:14:11 -0700
Subject: [PATCH 39/50] ggml-webgpu: add q4_0/q8_0 SET_ROWS (#23760)

* Add q8_0 and q4_0 set_rows

* Add fast(er) quantization set_rows path

* formatting/naming

* a little more naming

* Remove unused constant

* Don't override other override

* Avoid bitcast

* Narrow relaxation
---
 .../ggml-webgpu/ggml-webgpu-shader-lib.hpp    |  90 ++++---
 ggml/src/ggml-webgpu/ggml-webgpu.cpp          |  11 +-
 .../ggml-webgpu/wgsl-shaders/set_rows.wgsl    |   5 +-
 .../wgsl-shaders/set_rows_quant.wgsl          | 224 ++++++++++++++++++
 tests/test-backend-ops.cpp                    |   9 +
 5 files changed, 298 insertions(+), 41 deletions(-)
 create mode 100644 ggml/src/ggml-webgpu/wgsl-shaders/set_rows_quant.wgsl

diff --git a/ggml/src/ggml-webgpu/ggml-webgpu-shader-lib.hpp b/ggml/src/ggml-webgpu/ggml-webgpu-shader-lib.hpp
index 60e98a60741..f4c5eca0df5 100644
--- a/ggml/src/ggml-webgpu/ggml-webgpu-shader-lib.hpp
+++ b/ggml/src/ggml-webgpu/ggml-webgpu-shader-lib.hpp
@@ -84,16 +84,16 @@ struct ggml_webgpu_shader_lib_context {
     ggml_tensor * src5;
     ggml_tensor * dst;
 
-    uint32_t max_wg_size;
-    size_t   wg_mem_limit_bytes       = 0;
-    bool     supports_subgroups       = false;
-    bool     supports_subgroup_matrix = false;
-    uint32_t sg_mat_m                 = 0;
-    uint32_t sg_mat_n                 = 0;
-    uint32_t sg_mat_k                 = 0;
-    uint32_t min_subgroup_size        = 0;
-    uint32_t max_subgroup_size        = 0;
-    bool     supports_dot_product     = false;
+    uint32_t    max_wg_size;
+    size_t      wg_mem_limit_bytes       = 0;
+    bool        supports_subgroups       = false;
+    bool        supports_subgroup_matrix = false;
+    uint32_t    sg_mat_m                 = 0;
+    uint32_t    sg_mat_n                 = 0;
+    uint32_t    sg_mat_k                 = 0;
+    uint32_t    min_subgroup_size        = 0;
+    uint32_t    max_subgroup_size        = 0;
+    bool        supports_dot_product     = false;
     std::string vendor;
 };
 
@@ -166,9 +166,11 @@ struct ggml_webgpu_set_rows_pipeline_key {
     int dst_type;
     int vec4;
     int i64_idx;
+    int pair_blocks;
 
     bool operator==(const ggml_webgpu_set_rows_pipeline_key & other) const {
-        return dst_type == other.dst_type && vec4 == other.vec4 && i64_idx == other.i64_idx;
+        return dst_type == other.dst_type && vec4 == other.vec4 && i64_idx == other.i64_idx &&
+               pair_blocks == other.pair_blocks;
     }
 };
 
@@ -178,6 +180,7 @@ struct ggml_webgpu_set_rows_pipeline_key_hash {
         ggml_webgpu_hash_combine(seed, key.dst_type);
         ggml_webgpu_hash_combine(seed, key.vec4);
         ggml_webgpu_hash_combine(seed, key.i64_idx);
+        ggml_webgpu_hash_combine(seed, key.pair_blocks);
         return seed;
     }
 };
@@ -185,6 +188,7 @@ struct ggml_webgpu_set_rows_pipeline_key_hash {
 struct ggml_webgpu_set_rows_shader_decisions {
     bool     vec4;
     bool     i64_idx;
+    bool     pair_blocks;
     uint32_t wg_size;
 };
 
@@ -772,31 +776,30 @@ inline ggml_webgpu_flash_attn_decisions ggml_webgpu_flash_attn_get_decisions(
                                   (v_offset_elems % GGML_WEBGPU_FLASH_ATTN_TILE_KV_VEC_WIDTH == 0u);
     const bool kv_vec_type_supported =
         K->type == GGML_TYPE_F16 || K->type == GGML_TYPE_Q4_0 || K->type == GGML_TYPE_Q8_0;
-    const uint32_t kv_vec_head_align = K->type == GGML_TYPE_F16 ? GGML_WEBGPU_FLASH_ATTN_TILE_KV_VEC_WIDTH :
-                                                                  (uint32_t) ggml_blck_size(K->type);
-    const bool kv_vec_head_dims_aligned = context.src0->ne[0] % kv_vec_head_align == 0 &&
-                                          context.src2->ne[0] % kv_vec_head_align == 0;
+    const uint32_t kv_vec_head_align =
+        K->type == GGML_TYPE_F16 ? GGML_WEBGPU_FLASH_ATTN_TILE_KV_VEC_WIDTH : (uint32_t) ggml_blck_size(K->type);
+    const bool kv_vec_head_dims_aligned =
+        context.src0->ne[0] % kv_vec_head_align == 0 && context.src2->ne[0] % kv_vec_head_align == 0;
     // Compile with enough invocations to cover the largest reported subgroup.
-    const bool use_vec = context.supports_subgroups && (context.src0->ne[1] < 20) &&
-                         kv_vec_head_dims_aligned && kv_vec_type_supported &&
-                         (K->type != GGML_TYPE_F16 || f16_vec4_aligned) &&
+    const bool use_vec = context.supports_subgroups && (context.src0->ne[1] < 20) && kv_vec_head_dims_aligned &&
+                         kv_vec_type_supported && (K->type != GGML_TYPE_F16 || f16_vec4_aligned) &&
                          (context.src2->type == K->type);
     const bool tile_can_dispatch_all_q_rows =
         context.max_subgroup_size > 0 &&
         context.max_wg_size >= GGML_WEBGPU_FLASH_ATTN_TILE_Q_TILE * context.max_subgroup_size;
-    const bool use_subgroup_matrix =
-        context.supports_subgroup_matrix && context.sg_mat_k > 0 && context.sg_mat_n > 0 &&
-        context.src0->ne[0] % context.sg_mat_k == 0 && context.src2->ne[0] % context.sg_mat_n == 0;
+    const bool use_subgroup_matrix = context.supports_subgroup_matrix && context.sg_mat_k > 0 && context.sg_mat_n > 0 &&
+                                     context.src0->ne[0] % context.sg_mat_k == 0 &&
+                                     context.src2->ne[0] % context.sg_mat_n == 0;
     const bool use_tile = context.supports_subgroups && !use_subgroup_matrix && K->type == GGML_TYPE_F16 &&
                           V->type == GGML_TYPE_F16 && f16_vec4_aligned &&
                           (context.src0->ne[0] % GGML_WEBGPU_FLASH_ATTN_TILE_KV_VEC_WIDTH == 0) &&
                           (context.src2->ne[0] % GGML_WEBGPU_FLASH_ATTN_TILE_KV_VEC_WIDTH == 0) &&
                           tile_can_dispatch_all_q_rows && !use_vec;
 
-    decisions.path = use_vec                          ? GGML_WEBGPU_FLASH_ATTN_PATH_VEC :
-                     use_tile                         ? GGML_WEBGPU_FLASH_ATTN_PATH_TILE :
-                     use_subgroup_matrix              ? GGML_WEBGPU_FLASH_ATTN_PATH_SUBGROUP_MATRIX :
-                                                        GGML_WEBGPU_FLASH_ATTN_PATH_NONE;
+    decisions.path = use_vec             ? GGML_WEBGPU_FLASH_ATTN_PATH_VEC :
+                     use_tile            ? GGML_WEBGPU_FLASH_ATTN_PATH_TILE :
+                     use_subgroup_matrix ? GGML_WEBGPU_FLASH_ATTN_PATH_SUBGROUP_MATRIX :
+                                           GGML_WEBGPU_FLASH_ATTN_PATH_NONE;
 
     if (decisions.path == GGML_WEBGPU_FLASH_ATTN_PATH_NONE) {
         return decisions;
@@ -1131,9 +1134,9 @@ class ggml_webgpu_shader_lib {
                        ggml_webgpu_flash_attn_blk_pipeline_key_hash>
         flash_attn_blk_pipelines;
     std::unordered_map<ggml_webgpu_mul_mat_vec_pipeline_key, webgpu_pipeline, ggml_webgpu_mul_mat_vec_pipeline_key_hash>
-        mul_mat_vec_pipelines;     // fast mat-vec (n==1)
+        mul_mat_vec_pipelines;   // fast mat-vec (n==1)
     std::unordered_map<ggml_webgpu_mul_mat_pipeline_key, webgpu_pipeline, ggml_webgpu_mul_mat_pipeline_key_hash>
-                                             mul_mat_fast_pipelines;       // fast mat-mat (reg-tile or subgroup)
+        mul_mat_fast_pipelines;  // fast mat-mat (reg-tile or subgroup)
     std::unordered_map<ggml_webgpu_quantize_q8_pipeline_key, webgpu_pipeline, ggml_webgpu_quantize_q8_pipeline_key_hash>
                                              quantize_q8_pipelines;
     std::unordered_map<int, webgpu_pipeline> mul_mat_id_gather_pipelines;  // key is fixed
@@ -1264,10 +1267,13 @@ class ggml_webgpu_shader_lib {
     }
 
     webgpu_pipeline get_set_rows_pipeline(const ggml_webgpu_shader_lib_context & context) {
-        ggml_webgpu_set_rows_pipeline_key key = {};
-        key.dst_type                          = context.dst->type;
-        key.vec4                              = context.src0->ne[0] % 4 == 0;
-        key.i64_idx                           = context.src1->type == GGML_TYPE_I64;
+        const bool                        quantized = ggml_is_quantized(context.dst->type);
+        ggml_webgpu_set_rows_pipeline_key key       = {};
+        key.dst_type                                = context.dst->type;
+        key.vec4 =
+            (context.dst->type == GGML_TYPE_F32 || context.dst->type == GGML_TYPE_F16) && context.src0->ne[0] % 4 == 0;
+        key.i64_idx     = context.src1->type == GGML_TYPE_I64;
+        key.pair_blocks = quantized && ((context.src0->ne[0] / ggml_blck_size(context.dst->type)) % 2 == 0);
 
         auto it = set_rows_pipelines.find(key);
         if (it != set_rows_pipelines.end()) {
@@ -1286,6 +1292,14 @@ class ggml_webgpu_shader_lib {
                 defines.push_back("DST_F16");
                 variant += "_dstf16";
                 break;
+            case GGML_TYPE_Q8_0:
+                defines.push_back("DST_Q8_0");
+                variant += "_dstq8_0";
+                break;
+            case GGML_TYPE_Q4_0:
+                defines.push_back("DST_Q4_0");
+                variant += "_dstq4_0";
+                break;
             default:
                 GGML_ABORT("Unsupported dst type for set_rows shader");
         }
@@ -1298,13 +1312,19 @@ class ggml_webgpu_shader_lib {
             defines.push_back("I64_IDX");
             variant += "_i64idx";
         }
+        if (key.pair_blocks) {
+            defines.push_back("PAIR_BLOCKS");
+            variant += "_pair_blocks";
+        }
 
         defines.push_back(std::string("WG_SIZE=") + std::to_string(context.max_wg_size));
 
-        auto processed                  = preprocessor.preprocess(wgsl_set_rows, defines);
-        auto decisions                  = std::make_shared<ggml_webgpu_set_rows_shader_decisions>();
+        const auto & shader_source      = quantized ? wgsl_set_rows_quant : wgsl_set_rows;
+        auto         processed          = preprocessor.preprocess(shader_source, defines);
+        auto         decisions          = std::make_shared<ggml_webgpu_set_rows_shader_decisions>();
         decisions->vec4                 = key.vec4;
         decisions->i64_idx              = key.i64_idx;
+        decisions->pair_blocks          = key.pair_blocks;
         decisions->wg_size              = context.max_wg_size;
         set_rows_pipelines[key]         = ggml_webgpu_create_pipeline(device, processed, variant);
         set_rows_pipelines[key].context = decisions;
@@ -1660,7 +1680,7 @@ class ggml_webgpu_shader_lib {
         key.type                              = context.dst->type;
         key.d_state                           = (int) context.src0->ne[0];
         key.xbc_overlap                       = ggml_webgpu_tensor_overlap(context.src1, context.src4) &&
-                                                ggml_webgpu_tensor_overlap(context.src1, context.src5);
+                          ggml_webgpu_tensor_overlap(context.src1, context.src5);
 
         auto it = ssm_scan_pipelines.find(key);
         if (it != ssm_scan_pipelines.end()) {
@@ -1819,7 +1839,7 @@ class ggml_webgpu_shader_lib {
                           (context.src0->type == GGML_TYPE_F32 || context.src0->type == GGML_TYPE_F16)) ?
                                                        1 :
                                                        0;
-        key.use_mmvq                             =
+        key.use_mmvq =
             ggml_webgpu_can_use_mmvq(context.src0, context.src1, context.supports_dot_product, context.vendor);
 
         auto it = mul_mat_vec_pipelines.find(key);
diff --git a/ggml/src/ggml-webgpu/ggml-webgpu.cpp b/ggml/src/ggml-webgpu/ggml-webgpu.cpp
index 1846886db4e..1a99f1cb52f 100644
--- a/ggml/src/ggml-webgpu/ggml-webgpu.cpp
+++ b/ggml/src/ggml-webgpu/ggml-webgpu.cpp
@@ -1331,7 +1331,11 @@ static std::optional<webgpu_encoded_op> ggml_webgpu_set_rows(webgpu_context & ct
     }
 
     uint32_t threads;
-    if (decisions->vec4) {
+    if (ggml_is_quantized(dst->type)) {
+        const uint32_t blocks_per_row = src->ne[0] / ggml_blck_size(dst->type);
+        threads =
+            (src->ne[1] * src->ne[2] * src->ne[3]) * (decisions->pair_blocks ? (blocks_per_row / 2) : blocks_per_row);
+    } else if (decisions->vec4) {
         threads = (src->ne[1] * src->ne[2] * src->ne[3]) * (src->ne[0] / 4);
     } else {
         threads = src->ne[0] * src->ne[1] * src->ne[2] * src->ne[3];
@@ -4046,8 +4050,9 @@ static bool ggml_backend_webgpu_device_supports_op(ggml_backend_dev_t dev, const
                           (op->type == GGML_TYPE_F32 || op->type == GGML_TYPE_I32);
             break;
         case GGML_OP_SET_ROWS:
-            supports_op = ((op->type == GGML_TYPE_F16 || op->type == GGML_TYPE_F32) && src0->type == GGML_TYPE_F32 &&
-                           (src1->type == GGML_TYPE_I64 || src1->type == GGML_TYPE_I32));
+            supports_op = ((op->type == GGML_TYPE_F16 || op->type == GGML_TYPE_F32 || op->type == GGML_TYPE_Q8_0 ||
+                            op->type == GGML_TYPE_Q4_0) &&
+                           src0->type == GGML_TYPE_F32 && (src1->type == GGML_TYPE_I64 || src1->type == GGML_TYPE_I32));
             break;
         case GGML_OP_GET_ROWS:
             if (src0->type == GGML_TYPE_F32 || src0->type == GGML_TYPE_F16 || ggml_webgpu_supported_qtype(src0->type)) {
diff --git a/ggml/src/ggml-webgpu/wgsl-shaders/set_rows.wgsl b/ggml/src/ggml-webgpu/wgsl-shaders/set_rows.wgsl
index 99e9192c71a..09f2f0eddb3 100644
--- a/ggml/src/ggml-webgpu/wgsl-shaders/set_rows.wgsl
+++ b/ggml/src/ggml-webgpu/wgsl-shaders/set_rows.wgsl
@@ -71,7 +71,6 @@ fn main(@builtin(global_invocation_id) gid: vec3<u32>) {
         return;
     }
 
-    // getting the row from gid
     let elems_per_row = params.ne0 / VEC_SIZE;
     var i = gid.x / elems_per_row;
 
@@ -104,6 +103,6 @@ fn main(@builtin(global_invocation_id) gid: vec3<u32>) {
     let i_dst_row = params.offset_dst + idx_val * params.stride_dst1 + i_src2 * params.stride_dst2 + i_src3 * params.stride_dst3;
     let i_src_row = params.offset_src + i_src1 * params.stride_src1 + i_src2 * params.stride_src2 + i_src3 * params.stride_src3;
 
-    let col_idx = (gid.x % elems_per_row);
-    dst[i_dst_row/VEC_SIZE + col_idx] = DST_TYPE(src[i_src_row/VEC_SIZE + col_idx]);
+    let col_idx = gid.x % elems_per_row;
+    dst[i_dst_row / VEC_SIZE + col_idx] = DST_TYPE(src[i_src_row / VEC_SIZE + col_idx]);
 }
diff --git a/ggml/src/ggml-webgpu/wgsl-shaders/set_rows_quant.wgsl b/ggml/src/ggml-webgpu/wgsl-shaders/set_rows_quant.wgsl
new file mode 100644
index 00000000000..876e65b6ae1
--- /dev/null
+++ b/ggml/src/ggml-webgpu/wgsl-shaders/set_rows_quant.wgsl
@@ -0,0 +1,224 @@
+#ifdef DST_Q8_0
+#define BLOCK_SIZE 32u
+#define BLOCK_BYTES 34u
+#define QS_WORDS 8u
+#elif defined(DST_Q4_0)
+#define BLOCK_SIZE 32u
+#define BLOCK_BYTES 18u
+#define QS_WORDS 4u
+#endif
+
+@group(0) @binding(0)
+var<storage, read_write> src: array<f32>;
+
+@group(0) @binding(1)
+var<storage, read_write> idx: array<u32>;
+
+@group(0) @binding(2)
+#ifdef PAIR_BLOCKS
+var<storage, read_write> dst: array<u32>;
+#else
+var<storage, read_write> dst: array<atomic<u32>>;
+#endif
+
+#ifdef I64_IDX
+@group(0) @binding(3)
+var<storage, read_write> error: atomic<u32>;
+#define PARAMS_BINDING 4
+#else
+#define PARAMS_BINDING 3
+#endif
+
+struct Params {
+    offset_src: u32, // in elements
+    offset_idx: u32, // in elements
+    offset_dst: u32, // in blocks
+
+    // Strides (in elements / blocks)
+    stride_src1: u32,
+    stride_src2: u32,
+    stride_src3: u32,
+
+    stride_idx0: u32,
+    stride_idx1: u32,
+    stride_idx2: u32,
+
+    stride_dst1: u32,
+    stride_dst2: u32,
+    stride_dst3: u32,
+
+    // Shape of src
+    ne0: u32,
+    n_rows: u32,
+    ne2: u32,
+    ne3: u32,
+
+    // Shape of idx
+    idx1: u32,
+    idx2: u32,
+};
+
+@group(0) @binding(PARAMS_BINDING)
+var<uniform> params: Params;
+
+// If the quantization type is unaligned and there is an odd number of blocks per row, we need to store atomically.
+#ifndef PAIR_BLOCKS
+fn merge_store_dst_word(word_idx: u32, mask: u32, bits: u32) { // atomic variant: replace only the masked bits of dst[word_idx]
+    loop {
+        let old = atomicLoad(&dst[word_idx]);
+        let merged = (old & ~mask) | (bits & mask); // keep bits outside the mask, take the masked bits from `bits`
+        let result = atomicCompareExchangeWeak(&dst[word_idx], old, merged);
+        if (result.exchanged) { // if the exchange did not happen, loop again with a freshly loaded word
+            return;
+        }
+    }
+}
+#else
+fn merge_store_dst_word(word_idx: u32, mask: u32, bits: u32) { // non-atomic variant: plain read-modify-write of dst[word_idx]
+    let old = dst[word_idx];
+    dst[word_idx] = (old & ~mask) | (bits & mask); // keep bits outside the mask, take the masked bits from `bits`
+}
+#endif
+
+fn store_u16(dst_word_idx: u32, block_byte_offset: u32, byte_offset: u32, value: u32) { // write the low 16 bits of `value` at a byte offset relative to word dst_word_idx
+    let total_byte_offset = block_byte_offset + byte_offset;
+    let word_idx = dst_word_idx + total_byte_offset / 4u; // word that contains the target byte
+    let shift = (total_byte_offset & 2u) * 8u; // 0 or 16: low or high half of the word; bit 0 of the offset is ignored
+    let mask = 0xFFFFu << shift;
+    merge_store_dst_word(word_idx, mask, (value & 0xFFFFu) << shift); // only the selected half-word is modified
+}
+
+fn store_u32(dst_word_idx: u32, block_byte_offset: u32, byte_offset: u32, value: u32) { // write 32 bits at a byte offset that need not be word-aligned
+    let total_byte_offset = block_byte_offset + byte_offset;
+    let word_idx = dst_word_idx + total_byte_offset / 4u; // first word touched by the value
+    let shift = (total_byte_offset & 3u) * 8u; // bit position of the value inside that first word
+
+    if (shift == 0u) { // word-aligned: the value occupies exactly one word, so it is stored whole
+#ifdef PAIR_BLOCKS
+        dst[word_idx] = value;
+#else
+        atomicStore(&dst[word_idx], value);
+#endif
+        return;
+    }
+
+    let lo_mask = 0xFFFFFFFFu << shift; // bits of the first word covered by the value
+    let hi_mask = (1u << shift) - 1u; // bits of the following word covered by the value
+    merge_store_dst_word(word_idx, lo_mask, value << shift); // low part of the value goes to the upper bits of the first word
+    merge_store_dst_word(word_idx + 1u, hi_mask, value >> (32u - shift)); // remaining part goes to the lower bits of the next word
+}
+
+fn quantize_block_params(src_block: u32) -> vec2<f32> { // returns (d, id): the block scale and its reciprocal for the block starting at src[src_block]
+#ifdef DST_Q8_0
+    var amax = 0.0; // largest absolute value in the block
+    for (var j: u32 = 0u; j < BLOCK_SIZE; j++) {
+        amax = max(amax, abs(src[src_block + j]));
+    }
+
+    let d = amax / 127.0;
+    let id = select(0.0, 1.0 / d, d > 0.0); // reciprocal of d, or 0 when d is not positive (avoids dividing by zero)
+    return vec2(d, id);
+#elif defined(DST_Q4_0)
+    var amax = 0.0; // largest absolute value seen so far
+    var max_val = 0.0; // the signed element that produced amax
+    for (var j: u32 = 0u; j < BLOCK_SIZE; j++) {
+        let v = src[src_block + j];
+        let av = abs(v);
+        if (amax < av) { // strict comparison: on ties the earlier element is kept
+            amax = av;
+            max_val = v;
+        }
+    }
+
+    let d = max_val / -8.0; // scale takes the opposite sign of the largest-magnitude element
+    let id = select(0.0, 1.0 / d, d != 0.0); // reciprocal of d, or 0 when d is zero
+    return vec2(d, id);
+#endif
+}
+
+fn quantize_block_word(src_block: u32, j: u32, id: f32) -> u32 { // builds the j-th packed 32-bit word of quantized values for one block
+#ifdef DST_Q8_0
+    let base = src_block + j * 4u; // four consecutive source elements go into one word, one byte each
+    return (u32(i32(round(src[base + 0u] * id)) & 0xFF) << 0u) |
+           (u32(i32(round(src[base + 1u] * id)) & 0xFF) << 8u) |
+           (u32(i32(round(src[base + 2u] * id)) & 0xFF) << 16u) |
+           (u32(i32(round(src[base + 3u] * id)) & 0xFF) << 24u);
+#elif defined(DST_Q4_0)
+    var packed_q = 0u;
+    for (var k: u32 = 0u; k < 4u; k++) { // one output byte per iteration
+        let x0 = src[src_block + j * 4u + k] * id; // element from the first 16 of the block
+        let x1 = src[src_block + 16u + j * 4u + k] * id; // element 16 positions later
+        let q0 = u32(clamp(i32(x0 + 8.5), 0, 15)); // offset by 8.5, convert to integer, clamp to the 4-bit range
+        let q1 = u32(clamp(i32(x1 + 8.5), 0, 15));
+        packed_q |= (q0 & 0xFu) << (8u * k); // low nibble of byte k
+        packed_q |= (q1 & 0xFu) << (8u * k + 4u); // high nibble of byte k
+    }
+    return packed_q;
+#endif
+}
+
+fn quantize_block(src_block: u32, dst_word_idx: u32, block_byte_offset: u32) { // quantize one source block and write its scale and packed values into dst
+    let params = quantize_block_params(src_block); // local vec2 (d, id); shadows the module-scope uniform of the same name inside this function
+    let d = params.x;
+    let id = params.y;
+    let packed_d = pack2x16float(vec2(d, 0.0)) & 0xFFFFu; // keep only the low 16 bits, which hold the half-float encoding of d
+    store_u16(dst_word_idx, block_byte_offset, 0u, packed_d); // scale occupies the first two bytes of the block
+
+    for (var j: u32 = 0u; j < QS_WORDS; j++) {
+        store_u32(dst_word_idx, block_byte_offset, 2u + j * 4u, quantize_block_word(src_block, j, id)); // packed values follow the 2-byte scale
+    }
+}
+
+@compute @workgroup_size(WG_SIZE)
+fn main(@builtin(global_invocation_id) gid: vec3<u32>) { // each invocation quantizes one block (two adjacent blocks in the paired variant) of a source row into the dst row selected by idx
+    let blocks_per_row = params.ne0 / BLOCK_SIZE;
+#ifdef PAIR_BLOCKS
+    let blocks_per_invocation = 2u;
+#else
+    let blocks_per_invocation = 1u;
+#endif
+    let invocations_per_row = blocks_per_row / blocks_per_invocation; // integer division; NOTE(review): presumably pairing is only enabled for even block counts - confirm on the host side
+    let total_invocations = params.ne3 * params.ne2 * params.n_rows * invocations_per_row;
+    if (gid.x >= total_invocations) { // surplus invocations have no work
+        return;
+    }
+
+    var i = gid.x / invocations_per_row; // flattened row index over (ne3, ne2, n_rows), decomposed below
+    let block_in_row = (gid.x % invocations_per_row) * blocks_per_invocation; // first block handled by this invocation
+
+    let i_src3 = i / (params.ne2 * params.n_rows);
+    i = i % (params.ne2 * params.n_rows);
+    let i_src2 = i / params.n_rows;
+    let i_src1 = i % params.n_rows;
+
+    let i_idx2 = i_src3 % params.idx2; // the two outer index coordinates wrap modulo the idx shape
+    let i_idx1 = i_src2 % params.idx1;
+    let i_idx0 = i_src1;
+
+#ifdef I64_IDX
+    let idx_high = (params.offset_idx + i_idx0 * params.stride_idx0 + i_idx1 * params.stride_idx1 + i_idx2 * params.stride_idx2) * 2u; // each 64-bit index spans two u32 words
+    let idx_val = idx[idx_high]; // first word is used as the destination row number
+    let idx_low_val = idx[idx_high + 1u]; // second word must be zero for the index to fit in 32 bits
+
+    if (idx_low_val != 0u) { // index does not fit: raise the error flag and skip the write
+        atomicStore(&error, 1u);
+        return;
+    }
+#else
+    let idx_i = params.offset_idx + i_idx0 * params.stride_idx0 + i_idx1 * params.stride_idx1 + i_idx2 * params.stride_idx2;
+    let idx_val = idx[idx_i];
+#endif
+
+    let dst_row_blocks = params.offset_dst + idx_val * params.stride_dst1 + i_src2 * params.stride_dst2 + i_src3 * params.stride_dst3; // destination row start, in blocks
+    let src_row = params.offset_src + i_src1 * params.stride_src1 + i_src2 * params.stride_src2 + i_src3 * params.stride_src3; // source row start, in elements
+    let src_block = src_row + block_in_row * BLOCK_SIZE;
+    let dst_block_byte = (dst_row_blocks + block_in_row) * BLOCK_BYTES; // byte offset of the destination block
+
+    let dst_word_idx = dst_block_byte / 4u; // u32 word containing the first byte of the block
+#ifdef PAIR_BLOCKS
+    quantize_block(src_block, dst_word_idx, 0u); // NOTE(review): passes byte offset 0, i.e. assumes the pair starts word-aligned - confirm the host guarantees this
+    quantize_block(src_block + BLOCK_SIZE, dst_word_idx, BLOCK_BYTES); // second block of the pair directly follows the first
+#else
+    quantize_block(src_block, dst_word_idx, dst_block_byte & 3u); // the remainder carries the sub-word start of the block
+#endif
+}
diff --git a/tests/test-backend-ops.cpp b/tests/test-backend-ops.cpp
index 19f8558d897..0176599459f 100644
--- a/tests/test-backend-ops.cpp
+++ b/tests/test-backend-ops.cpp
@@ -2415,6 +2415,15 @@ struct test_set_rows : public test_case {
         }
         return 1e-7;
     }
+
+    // See discussion here: https://github.com/ggml-org/llama.cpp/pull/23760#issuecomment-4566312209
+    double max_nmse_err(ggml_backend_t backend) override {
+        // Q8_0 on the WebGPU backend gets a tolerance floor of 2e-7; every other case keeps the base-class value.
+        const double base_err = test_case::max_nmse_err(backend);
+        const ggml_backend_reg_t owner = ggml_backend_dev_backend_reg(ggml_backend_get_device(backend));
+        const bool is_webgpu_q8 = type == GGML_TYPE_Q8_0 && strcmp(ggml_backend_reg_name(owner), "WebGPU") == 0;
+        return is_webgpu_q8 ? std::max(base_err, 2e-7) : base_err;
+    }
 };
 
 // GGML_OP_ROPE + GGML_OP_VIEW + GGML_OP_SET_ROWS

From 151f3a98e9a4944d505fa0c429fd51c6c8530eaf Mon Sep 17 00:00:00 2001
From: Reese Levine <reeselevine1@gmail.com>
Date: Fri, 29 May 2026 14:16:05 -0700
Subject: [PATCH 40/50] ggml-webgpu: Check earlier for WebGPU required features
 (#23879)

---
 ggml/src/ggml-webgpu/ggml-webgpu.cpp | 21 ++++++++++++---------
 1 file changed, 12 insertions(+), 9 deletions(-)

diff --git a/ggml/src/ggml-webgpu/ggml-webgpu.cpp b/ggml/src/ggml-webgpu/ggml-webgpu.cpp
index 1a99f1cb52f..d577b5afa3c 100644
--- a/ggml/src/ggml-webgpu/ggml-webgpu.cpp
+++ b/ggml/src/ggml-webgpu/ggml-webgpu.cpp
@@ -3724,7 +3724,7 @@ static void ggml_webgpu_init_memset_pipeline(webgpu_global_context & ctx) {
     ctx->memset_pipeline = ggml_webgpu_create_pipeline(ctx->device, wgsl_memset, "memset", constants);
 }
 
-static bool create_webgpu_device(ggml_backend_webgpu_reg_context * ctx) {
+static void create_webgpu_device(ggml_backend_webgpu_reg_context * ctx) {
     wgpu::RequestAdapterOptions options = {};
 
 #ifndef __EMSCRIPTEN__
@@ -3762,10 +3762,6 @@ static bool create_webgpu_device(ggml_backend_webgpu_reg_context * ctx) {
     ctx->webgpu_global_ctx->command_submit_batch_size = ggml_backend_webgpu_get_command_submit_batch_size();
     ctx->webgpu_global_ctx->max_inflight_batches      = ggml_backend_webgpu_get_max_inflight_batches();
     ctx->webgpu_global_ctx->vendor                    = info.vendor;
-    wgpu::SupportedFeatures features;
-    ctx->webgpu_global_ctx->adapter.GetFeatures(&features);
-    // we require f16 support
-    GGML_ASSERT(ctx->webgpu_global_ctx->adapter.HasFeature(wgpu::FeatureName::ShaderF16));
     ctx->webgpu_global_ctx->capabilities.supports_subgroups =
         ctx->webgpu_global_ctx->adapter.HasFeature(wgpu::FeatureName::Subgroups);
     // for dot4I8packed
@@ -3877,7 +3873,6 @@ static bool create_webgpu_device(ggml_backend_webgpu_reg_context * ctx) {
         "device_desc: %s\n",
         info.vendorID, std::string(info.vendor).c_str(), std::string(info.architecture).c_str(), info.deviceID,
         std::string(info.device).c_str(), std::string(info.description).c_str());
-    return true;
 }
 
 static webgpu_context initialize_webgpu_context(ggml_backend_dev_t dev) {
@@ -4507,7 +4502,12 @@ ggml_backend_reg_t ggml_backend_webgpu_reg() {
             UINT64_MAX);
     }
 
-    if (adapter != nullptr) {
+    // WebGPU backend requires f16 support and, on native, implicit device synchronization.
+    if (adapter != nullptr && adapter.HasFeature(wgpu::FeatureName::ShaderF16)
+#ifndef __EMSCRIPTEN__
+        && adapter.HasFeature(wgpu::FeatureName::ImplicitDeviceSynchronization)
+#endif
+    ) {
         ctx->device_count = 1;
     }
 
@@ -4515,8 +4515,11 @@ ggml_backend_reg_t ggml_backend_webgpu_reg() {
 }
 
 ggml_backend_t ggml_backend_webgpu_init(void) {
-    ggml_backend_dev_t dev = ggml_backend_reg_dev_get(ggml_backend_webgpu_reg(), 0);
-
+    ggml_backend_reg_t reg = ggml_backend_webgpu_reg();
+    if (ggml_backend_reg_dev_count(reg) == 0) {
+        return nullptr;
+    }
+    ggml_backend_dev_t dev = ggml_backend_reg_dev_get(reg, 0);
     return ggml_backend_webgpu_backend_init(dev, nullptr);
 }
 

From 0821c5fcfd729af70037bc1e9e60769d42c081ba Mon Sep 17 00:00:00 2001
From: Xuan-Son Nguyen <son@huggingface.co>
Date: Sat, 30 May 2026 00:06:29 +0200
Subject: [PATCH 41/50] server: in SSE mode, send HTTP headers when slot starts
 (#23884)

* server: in SSE mode, send HTTP headers when slot starts

* ref to pr

* stream should be false by default
---
 tools/server/server-context.cpp | 20 +++++++++++++++-----
 tools/server/server-task.cpp    |  3 +++
 tools/server/server-task.h      |  4 +++-
 3 files changed, 21 insertions(+), 6 deletions(-)

diff --git a/tools/server/server-context.cpp b/tools/server/server-context.cpp
index ae9e0bf60d8..bfe3443c1de 100644
--- a/tools/server/server-context.cpp
+++ b/tools/server/server-context.cpp
@@ -1734,7 +1734,7 @@ struct server_context_impl {
         return true;
     }
 
-    void send_partial_response(server_slot & slot, const completion_token_output & tkn, bool is_progress) {
+    void send_partial_response(server_slot & slot, const completion_token_output & tkn, bool is_progress, bool is_begin = false) {
         auto res = std::make_unique<server_task_result_cmpl_partial>();
 
         res->id    = slot.task->id;
@@ -1746,6 +1746,9 @@ struct server_context_impl {
             res->progress.cache     = slot.n_prompt_tokens_cache;
             res->progress.processed = slot.prompt.tokens.size();
             res->progress.time_ms   = (ggml_time_us() - slot.t_start_process_prompt) / 1000;
+        }
+        if (is_begin) {
+            res->is_begin = true;
         } else {
             res->content = tkn.text_to_send;
             res->tokens  = { tkn.tok };
@@ -2828,10 +2831,15 @@ struct server_context_impl {
 
                         slot.prompt.tokens.keep_first(n_past);
 
-                        // send initial 0% progress update if needed
                         // this is to signal the client that the request has started processing
-                        if (slot.task->params.stream && slot.task->params.return_progress) {
-                            send_partial_response(slot, {}, true);
+                        if (slot.task->params.stream) {
+                            if (slot.task->params.return_progress) {
+                                // send initial 0% progress update if needed
+                                send_partial_response(slot, {}, true);
+                            } else {
+                                // otherwise, for streaming without progress, signal HTTP to send the headers (i.e. 200 status)
+                                send_partial_response(slot, {}, false, true);
+                            }
                         }
                     }
 
@@ -3745,7 +3753,9 @@ std::unique_ptr<server_res_generator> server_routes::handle_completions_impl(
         // next responses are streamed
         // to be sent immediately
         json first_result_json = first_result->to_json();
-        if (res_type == TASK_RESPONSE_TYPE_ANTHROPIC) {
+        if (first_result_json == nullptr) {
+            res->data = ""; // simply send HTTP headers and status code
+        } else if (res_type == TASK_RESPONSE_TYPE_ANTHROPIC) {
             res->data = format_anthropic_sse(first_result_json);
         } else if (res_type == TASK_RESPONSE_TYPE_OAI_RESP) {
             res->data = format_oai_resp_sse(first_result_json);
diff --git a/tools/server/server-task.cpp b/tools/server/server-task.cpp
index abc00c82bdb..ff80be6ccba 100644
--- a/tools/server/server-task.cpp
+++ b/tools/server/server-task.cpp
@@ -1422,6 +1422,9 @@ void server_task_result_cmpl_partial::update(task_result_state & state) {
 
 json server_task_result_cmpl_partial::to_json() {
     GGML_ASSERT(is_updated && "update() must be called before to_json()");
+    if (is_begin) {
+        return nullptr; // simply signal to HTTP handler to send the headers and status code
+    }
     switch (res_type) {
         case TASK_RESPONSE_TYPE_NONE:
             return to_json_non_oaicompat();
diff --git a/tools/server/server-task.h b/tools/server/server-task.h
index 60e216e7927..d47dc690cff 100644
--- a/tools/server/server-task.h
+++ b/tools/server/server-task.h
@@ -47,7 +47,7 @@ enum stop_type {
 };
 
 struct task_params {
-    bool stream          = true;
+    bool stream          = false;
     bool include_usage   = false;
     bool cache_prompt    = true; // remember the prompt to avoid reprocessing all prompt
     bool return_tokens   = false;
@@ -418,6 +418,8 @@ struct server_task_result_cmpl_partial : server_task_result {
 
     bool post_sampling_probs;
     bool is_progress = false;
+    bool is_begin = false; // whether to send 200 status to HTTP client (begin of SSE stream)
+                           // ref: https://github.com/ggml-org/llama.cpp/pull/23884
     completion_token_output prob_output;
     result_timings timings;
     result_prompt_progress progress;

From 1738129bee5c81b06fa1850daf3f958813c76f5f Mon Sep 17 00:00:00 2001
From: Radoslav Gerganov <rgerganov@gmail.com>
Date: Sat, 30 May 2026 07:48:22 +0300
Subject: [PATCH 42/50] llama : do not skip iGPU when only RPC devices are
 present (#23868)

After #23007 reclassified integrated CUDA/HIP devices as IGPU, the device
selection logic dropped the local iGPU whenever any RPC server was added,
because RPC devices made `model->devices` non-empty. On systems where the
"iGPU" is the main compute device (e.g. Strix Halo with 128 GiB of unified
memory), this caused all tensors to be allocated on the RPC peer alone and
model loading to fail.

Gate the iGPU inclusion on `gpus.empty()` instead, so RPC peers no longer
suppress the local iGPU.

closes: #23858
---
 src/llama.cpp | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/src/llama.cpp b/src/llama.cpp
index dfe30ce8f61..edacd1d5f42 100644
--- a/src/llama.cpp
+++ b/src/llama.cpp
@@ -239,8 +239,9 @@ static bool llama_prepare_model_devices(const llama_model_params & params, llama
         // add GPUs
         model->devices.insert(model->devices.end(), gpus.begin(), gpus.end());
 
-        // add integrated GPUs only if no other devices were found
-        if (model->devices.empty()) {
+        // add integrated GPUs only if no discrete GPUs were found
+        // (RPC servers do not count, otherwise the local iGPU would be dropped on iGPU+RPC setups)
+        if (gpus.empty()) {
             model->devices.insert(model->devices.end(), igpus.begin(), igpus.end());
         }
     }

From d4204b03a592a90cb62feaaa14276c9407e36cbd Mon Sep 17 00:00:00 2001
From: Georgi Gerganov <ggerganov@gmail.com>
Date: Sat, 30 May 2026 08:52:30 +0300
Subject: [PATCH 43/50] ci : clear cache instead of "no timestamp" keys + fix
 macos (#23895)

* ci : ios use macos-15 again

* ci : add and test ccache-clear

* cont : fix

* cont : set permission

* cont : another permission

* cont : token

* cont : print key

* cont : bring back perms

* cont : test windows

* cont : add token

* cont : cleanup

* ci : make release jobs clean-up their ccache
---
 .github/actions/ccache-clear/action.yml  |  22 +++
 .github/workflows/build-cuda-windows.yml |  20 ++-
 .github/workflows/release.yml            | 169 ++++++++++++++++-------
 3 files changed, 160 insertions(+), 51 deletions(-)
 create mode 100644 .github/actions/ccache-clear/action.yml

diff --git a/.github/actions/ccache-clear/action.yml b/.github/actions/ccache-clear/action.yml
new file mode 100644
index 00000000000..d38587efaf8
--- /dev/null
+++ b/.github/actions/ccache-clear/action.yml
@@ -0,0 +1,22 @@
+name: "ccache-clear"
+description: "Delete all GitHub Actions caches whose key starts with ccache-<key>"
+inputs:
+  key:
+    description: "Cache key to clear, without the 'ccache-' prefix that is prepended when matching"
+    required: true
+
+runs:
+  using: "composite"
+  steps:
+    - name: Clear caches
+      shell: bash # the calling workflow must provide GH_TOKEN for the gh CLI
+      run: |
+        CACHES=$(gh cache list --key "ccache-${{ inputs.key }}" --json id,key --jq '.[] | "\(.id) \(.key)"' 2>/dev/null)
+        if [ -z "$CACHES" ]; then
+          echo "No caches found with key prefix: ccache-${{ inputs.key }}"
+          exit 0
+        fi
+        while read -r id key; do
+          echo "Deleting cache: $id ($key)"
+          gh cache delete "$id"
+        done <<< "$CACHES"
diff --git a/.github/workflows/build-cuda-windows.yml b/.github/workflows/build-cuda-windows.yml
index 631ff4ed26b..e9e941421b6 100644
--- a/.github/workflows/build-cuda-windows.yml
+++ b/.github/workflows/build-cuda-windows.yml
@@ -13,6 +13,7 @@ concurrency:
   queue: max
 
 env:
+  GH_TOKEN: ${{ github.token }}
   GGML_NLOOP: 3
   GGML_N_THREADS: 1
   LLAMA_ARG_LOG_COLORS: 1
@@ -23,6 +24,9 @@ jobs:
   cuda:
     runs-on: windows-2022
 
+    permissions:
+      actions: write
+
     strategy:
       matrix:
         cuda: ['12.4', '13.3']
@@ -36,7 +40,6 @@ jobs:
         uses: ggml-org/ccache-action@v1.2.21
         with:
           key: release-windows-2022-x64-cuda-${{ matrix.cuda }}
-          append-timestamp: false # note: use this only with non-concurrent jobs!
 
       - name: Install Cuda Toolkit
         uses: ./.github/actions/windows-setup-cuda
@@ -67,9 +70,17 @@ jobs:
           cmake --build build --config Release -j %NINJA_JOBS% -t ggml
           cmake --build build --config Release
 
+      - name: ccache-clear
+        uses: ./.github/actions/ccache-clear
+        with:
+          key: release-windows-2022-x64-cuda-${{ matrix.cuda }}
+
   hip:
     runs-on: windows-2022
 
+    permissions:
+      actions: write
+
     env:
       # Make sure this is in sync with build-cache.yml
       HIPSDK_INSTALLER_VERSION: "26.Q1"
@@ -125,7 +136,6 @@ jobs:
           #       to populate the ccache for the release with manual runs of this workflow
           #key: release-windows-2022-x64-hip-${{ env.HIPSDK_INSTALLER_VERSION }}-${{ matrix.name }}
           key: cuda-windows-2022-x64-hip-${{ env.HIPSDK_INSTALLER_VERSION }}-${{ matrix.name }}
-          append-timestamp: false # note: use this only with non-concurrent jobs!
 
       - name: Build
         id: cmake_build
@@ -144,3 +154,9 @@ jobs:
             -DGPU_TARGETS="gfx1100"  `
             -DGGML_RPC=ON
           cmake --build build -j ${env:NUMBER_OF_PROCESSORS}
+
+      - name: ccache-clear
+        uses: ./.github/actions/ccache-clear
+        with:
+          #key: release-windows-2022-x64-hip-${{ env.HIPSDK_INSTALLER_VERSION }}-${{ matrix.name }}
+          key: cuda-windows-2022-x64-hip-${{ env.HIPSDK_INSTALLER_VERSION }}-${{ matrix.name }}
diff --git a/.github/workflows/release.yml b/.github/workflows/release.yml
index 08f02af63ec..4785bbe167b 100644
--- a/.github/workflows/release.yml
+++ b/.github/workflows/release.yml
@@ -28,6 +28,7 @@ on:
     ]
 
 env:
+  GH_TOKEN: ${{ github.token }}
   BRANCH_NAME: ${{ github.head_ref || github.ref_name }}
   CMAKE_ARGS: "-DLLAMA_BUILD_EXAMPLES=OFF -DLLAMA_BUILD_TESTS=OFF -DLLAMA_BUILD_TOOLS=ON -DLLAMA_BUILD_SERVER=ON -DGGML_RPC=ON"
 
@@ -83,6 +84,9 @@ jobs:
 
     runs-on: ${{ matrix.os }}
 
+    permissions:
+      actions: write
+
     steps:
       - name: Clone
         id: checkout
@@ -101,7 +105,6 @@ jobs:
         uses: ggml-org/ccache-action@v1.2.21
         with:
           key: release-${{ matrix.os }}-${{ matrix.arch }}
-          append-timestamp: false # note: use this only with non-concurrent jobs!
 
       - name: Build
         id: cmake_build
@@ -116,6 +119,11 @@ jobs:
             ${{ env.CMAKE_ARGS }}
           cmake --build build --config Release -j $(sysctl -n hw.logicalcpu)
 
+      - name: ccache-clear
+        uses: ./.github/actions/ccache-clear
+        with:
+          key: release-${{ matrix.os }}-${{ matrix.arch }}
+
       - name: Determine tag name
         id: tag
         uses: ./.github/actions/get-tag-name
@@ -147,6 +155,9 @@ jobs:
 
     runs-on: ${{ matrix.os }}
 
+    permissions:
+      actions: write
+
     steps:
       - name: Clone
         id: checkout
@@ -161,13 +172,6 @@ jobs:
           cache: "npm"
           cache-dependency-path: "tools/ui/package-lock.json"
 
-      - name: ccache
-        if: ${{ matrix.build != 's390x' }}
-        uses: ggml-org/ccache-action@v1.2.21
-        with:
-          key: release-${{ matrix.os }}-cpu
-          append-timestamp: false # note: use this only with non-concurrent jobs!
-
       - name: Dependencies
         id: depends
         run: |
@@ -181,6 +185,12 @@ jobs:
           echo "CC=gcc-14" >> "$GITHUB_ENV"
           echo "CXX=g++-14" >> "$GITHUB_ENV"
 
+      - name: ccache
+        if: ${{ matrix.build != 's390x' }}
+        uses: ggml-org/ccache-action@v1.2.21
+        with:
+          key: release-${{ matrix.os }}-cpu
+
       - name: Build
         id: cmake_build
         run: |
@@ -194,6 +204,11 @@ jobs:
             ${{ env.CMAKE_ARGS }}
           cmake --build build --config Release -j $(nproc)
 
+      - name: ccache-clear
+        uses: ./.github/actions/ccache-clear
+        with:
+          key: release-${{ matrix.os }}-cpu
+
       - name: Determine tag name
         id: tag
         uses: ./.github/actions/get-tag-name
@@ -224,6 +239,9 @@ jobs:
 
     runs-on: ${{ matrix.os }}
 
+    permissions:
+      actions: write
+
     steps:
       - name: Clone
         id: checkout
@@ -238,12 +256,6 @@ jobs:
           cache: "npm"
           cache-dependency-path: "tools/ui/package-lock.json"
 
-      - name: ccache
-        uses: ggml-org/ccache-action@v1.2.21
-        with:
-          key: release-${{ matrix.os }}-vulkan
-          append-timestamp: false # note: use this only with non-concurrent jobs!
-
       - name: Dependencies
         id: depends
         run: |
@@ -259,6 +271,11 @@ jobs:
             echo "CXX=g++-14" >> "$GITHUB_ENV"
           fi
 
+      - name: ccache
+        uses: ggml-org/ccache-action@v1.2.21
+        with:
+          key: release-${{ matrix.os }}-vulkan
+
       - name: Build
         id: cmake_build
         run: |
@@ -272,6 +289,11 @@ jobs:
             ${{ env.CMAKE_ARGS }}
           cmake --build build --config Release -j $(nproc)
 
+      - name: ccache-clear
+        uses: ./.github/actions/ccache-clear
+        with:
+          key: release-${{ matrix.os }}-vulkan
+
       - name: Determine tag name
         id: tag
         uses: ./.github/actions/get-tag-name
@@ -294,6 +316,9 @@ jobs:
 
     runs-on: ubuntu-latest
 
+    #permissions:
+    #  actions: write
+
     env:
       NDK_VERSION: "29.0.14206865"
 
@@ -311,18 +336,6 @@ jobs:
           cache: "npm"
           cache-dependency-path: "tools/ui/package-lock.json"
 
-      # note : disabled to spare some cache space (https://github.com/ggml-org/llama.cpp/pull/23789)
-      #        for some reason, the ccache does not improve the build time in this case
-      # example:
-      #   cache off: https://github.com/ggerganov/tmp2/actions/runs/26534713799/job/78160400831
-      #   cache on:  https://github.com/ggerganov/tmp2/actions/runs/26534713799/job/78224189394
-      #
-      #- name: ccache
-      #  uses: ggml-org/ccache-action@v1.2.21
-      #  with:
-      #    key: release-android-arm64
-      #    append-timestamp: false # note: use this only with non-concurrent jobs!
-
       - name: Set up JDK
         uses: actions/setup-java@v5
         with:
@@ -339,6 +352,17 @@ jobs:
           sdkmanager "ndk;${{ env.NDK_VERSION }}"
           echo "ANDROID_NDK=${ANDROID_SDK_ROOT}/ndk/${{ env.NDK_VERSION }}" >> $GITHUB_ENV
 
+      # note : disabled to spare some cache space (https://github.com/ggml-org/llama.cpp/pull/23789)
+      #        for some reason, the ccache does not improve the build time in this case
+      # example:
+      #   cache off: https://github.com/ggerganov/tmp2/actions/runs/26534713799/job/78160400831
+      #   cache on:  https://github.com/ggerganov/tmp2/actions/runs/26534713799/job/78224189394
+      #
+      #- name: ccache
+      #  uses: ggml-org/ccache-action@v1.2.21
+      #  with:
+      #    key: release-android-arm64
+
       - name: Build
         id: cmake_build
         run: |
@@ -357,6 +381,11 @@ jobs:
             ${{ env.CMAKE_ARGS }}
           cmake --build build --config Release -j $(nproc)
 
+      #- name: ccache-clear
+      #  uses: ./.github/actions/ccache-clear
+      #  with:
+      #    key: release-android-arm64
+
       - name: Determine tag name
         id: tag
         uses: ./.github/actions/get-tag-name
@@ -379,6 +408,9 @@ jobs:
 
     runs-on: ubuntu-24.04
 
+    permissions:
+      actions: write
+
     outputs:
       openvino_version: ${{ steps.openvino_version.outputs.value }}
 
@@ -409,7 +441,6 @@ jobs:
         uses: ggml-org/ccache-action@v1.2.21
         with:
           key: release-ubuntu-24.04-openvino-release-no-preset-v1
-          append-timestamp: false # note: use this only with non-concurrent jobs!
 
       - name: Dependencies
         run: |
@@ -447,6 +478,11 @@ jobs:
             -DGGML_OPENVINO=ON
           cmake --build build/ReleaseOV --config Release -j $(nproc)
 
+      - name: ccache-clear
+        uses: ./.github/actions/ccache-clear
+        with:
+          key: release-ubuntu-24.04-openvino-release-no-preset-v1
+
       - name: Determine tag name
         id: tag
         uses: ./.github/actions/get-tag-name
@@ -469,6 +505,9 @@ jobs:
 
     runs-on: windows-2025
 
+    permissions:
+      actions: write
+
     strategy:
       matrix:
         include:
@@ -488,15 +527,14 @@ jobs:
           cache: "npm"
           cache-dependency-path: "tools/ui/package-lock.json"
 
+      - name: Install Ninja
+        run: |
+          choco install ninja
+
       - name: ccache
         uses: ggml-org/ccache-action@v1.2.21
         with:
           key: release-windows-2025-${{ matrix.arch }}-cpu
-          append-timestamp: false # note: use this only with non-concurrent jobs!
-
-      - name: Install Ninja
-        run: |
-          choco install ninja
 
       - name: Build
         shell: cmd
@@ -512,6 +550,11 @@ jobs:
             ${{ env.CMAKE_ARGS }}
           cmake --build build --config Release
 
+      - name: ccache-clear
+        uses: ./.github/actions/ccache-clear
+        with:
+          key: release-windows-2025-${{ matrix.arch }}-cpu
+
       - name: Pack artifacts
         id: pack_artifacts
         run: |
@@ -530,6 +573,9 @@ jobs:
 
     runs-on: windows-2025
 
+    permissions:
+      actions: write
+
     env:
       OPENBLAS_VERSION: 0.3.23
       VULKAN_VERSION: 1.4.313.2
@@ -558,12 +604,6 @@ jobs:
           cache: "npm"
           cache-dependency-path: "tools/ui/package-lock.json"
 
-      - name: ccache
-        uses: ggml-org/ccache-action@v1.2.21
-        with:
-          key: release-windows-2025-${{ matrix.arch }}-${{ matrix.backend }}
-          append-timestamp: false # note: use this only with non-concurrent jobs!
-
       - name: Install Vulkan SDK
         id: get_vulkan
         if: ${{ matrix.backend == 'vulkan' }}
@@ -578,6 +618,11 @@ jobs:
         run: |
           choco install ninja
 
+      - name: ccache
+        uses: ggml-org/ccache-action@v1.2.21
+        with:
+          key: release-windows-2025-${{ matrix.arch }}-${{ matrix.backend }}
+
       - name: Install OpenCL Headers and Libs
         id: install_opencl
         if: ${{ matrix.backend == 'opencl-adreno' && matrix.arch == 'arm64' }}
@@ -604,6 +649,11 @@ jobs:
           cmake -S . -B build ${{ matrix.defines }} -DGGML_NATIVE=OFF -DGGML_CPU=OFF -DGGML_BACKEND_DL=ON -DLLAMA_BUILD_BORINGSSL=ON
           cmake --build build --config Release --target ${{ matrix.target }}
 
+      - name: ccache-clear
+        uses: ./.github/actions/ccache-clear
+        with:
+          key: release-windows-2025-${{ matrix.arch }}-${{ matrix.backend }}
+
       - name: Pack artifacts
         id: pack_artifacts
         run: |
@@ -621,6 +671,9 @@ jobs:
 
     runs-on: windows-2022
 
+    permissions:
+      actions: write
+
     strategy:
       matrix:
         cuda: ['12.4', '13.3']
@@ -637,12 +690,6 @@ jobs:
           cache: "npm"
           cache-dependency-path: "tools/ui/package-lock.json"
 
-      - name: ccache
-        uses: ggml-org/ccache-action@v1.2.21
-        with:
-          key: release-windows-2022-x64-cuda-${{ matrix.cuda }}
-          append-timestamp: false # note: use this only with non-concurrent jobs!
-
       - name: Install Cuda Toolkit
         uses: ./.github/actions/windows-setup-cuda
         with:
@@ -653,6 +700,11 @@ jobs:
         run: |
           choco install ninja
 
+      - name: ccache
+        uses: ggml-org/ccache-action@v1.2.21
+        with:
+          key: release-windows-2022-x64-cuda-${{ matrix.cuda }}
+
       - name: Build
         id: cmake_build
         shell: cmd
@@ -669,6 +721,11 @@ jobs:
           set /A NINJA_JOBS=%NUMBER_OF_PROCESSORS%-1
           cmake --build build --config Release -j %NINJA_JOBS% --target ggml-cuda
 
+      - name: ccache-clear
+        uses: ./.github/actions/ccache-clear
+        with:
+          key: release-windows-2022-x64-cuda-${{ matrix.cuda }}
+
       - name: Pack artifacts
         id: pack_artifacts
         run: |
@@ -748,7 +805,6 @@ jobs:
 #        uses: ggml-org/ccache-action@v1.2.21
 #        with:
 #          key: release-windows-2022-x64-sycl
-#          append-timestamp: false # note: use this only with non-concurrent jobs!
 #
 #      - name: Build
 #        id: cmake_build
@@ -869,7 +925,6 @@ jobs:
 #        uses: ggml-org/ccache-action@v1.2.21
 #        with:
 #          key: release-ubuntu-24.04-sycl
-#          append-timestamp: false # note: use this only with non-concurrent jobs!
 #
 #      - name: Build
 #        id: cmake_build
@@ -908,6 +963,9 @@ jobs:
 
     runs-on: ubuntu-22.04
 
+    permissions:
+      actions: write
+
     strategy:
       matrix:
         include:
@@ -938,7 +996,6 @@ jobs:
         uses: ggml-org/ccache-action@v1.2.21
         with:
           key: release-ubuntu-22.04-rocm-${{ matrix.ROCM_VERSION }}
-          append-timestamp: false # note: use this only with non-concurrent jobs!
 
       - name: Dependencies
         id: depends
@@ -996,6 +1053,11 @@ jobs:
             ${{ env.CMAKE_ARGS }}
           cmake --build build --config Release -j $(nproc)
 
+      - name: ccache-clear
+        uses: ./.github/actions/ccache-clear
+        with:
+          key: release-ubuntu-22.04-rocm-${{ matrix.ROCM_VERSION }}
+
       - name: Determine tag name
         id: tag
         uses: ./.github/actions/get-tag-name
@@ -1021,6 +1083,9 @@ jobs:
 
     runs-on: windows-2022
 
+    permissions:
+      actions: write
+
     env:
       HIPSDK_INSTALLER_VERSION: "26.Q1"
 
@@ -1060,7 +1125,6 @@ jobs:
         uses: ggml-org/ccache-action@v1.2.21
         with:
           key: release-windows-2022-x64-hip-${{ env.HIPSDK_INSTALLER_VERSION }}-${{ matrix.name }}
-          append-timestamp: false # note: use this only with non-concurrent jobs!
 
       - name: Install ROCm
         if: steps.cache-rocm.outputs.cache-hit != 'true'
@@ -1120,6 +1184,11 @@ jobs:
           cp "${env:HIP_PATH}\bin\rocblas\library\*" "build\bin\rocblas\library\"
           cp "${env:HIP_PATH}\bin\hipblaslt\library\*" "build\bin\hipblaslt\library\"
 
+      - name: ccache-clear
+        uses: ./.github/actions/ccache-clear
+        with:
+          key: release-windows-2022-x64-hip-${{ env.HIPSDK_INSTALLER_VERSION }}-${{ matrix.name }}
+
       - name: Pack artifacts
         id: pack_artifacts
         run: |
@@ -1134,7 +1203,9 @@ jobs:
   ios-xcode-build:
     needs: [check_release]
     if: ${{ needs.check_release.outputs.should_release == 'true' }}
-    runs-on: macos-26
+    # TODO: figure out how to make this work with macos-26
+    #       https://github.com/ggml-org/llama.cpp/actions/runs/26652714555/job/78604869474
+    runs-on: macos-15
 
     steps:
       - name: Checkout code

From 337528571d99e414ca04fddd86d448960137770f Mon Sep 17 00:00:00 2001
From: Georgi Gerganov <ggerganov@gmail.com>
Date: Sat, 30 May 2026 09:21:38 +0300
Subject: [PATCH 44/50] ci : fix s390x release job (#23898)

* ci : fix s390x release job

* ci : multi-thread build for `ios-xcode`

* ocd : shorten job names (`ios-xcode-build` -> `ios-xcode`, `ui-build` -> `ui`)
---
 .github/workflows/release.yml |  9 +++++----
 build-xcframework.sh          | 14 +++++++-------
 2 files changed, 12 insertions(+), 11 deletions(-)

diff --git a/.github/workflows/release.yml b/.github/workflows/release.yml
index 4785bbe167b..9ca34ad8645 100644
--- a/.github/workflows/release.yml
+++ b/.github/workflows/release.yml
@@ -205,6 +205,7 @@ jobs:
           cmake --build build --config Release -j $(nproc)
 
       - name: ccache-clear
+        if: ${{ matrix.build != 's390x' }}
         uses: ./.github/actions/ccache-clear
         with:
           key: release-${{ matrix.os }}-cpu
@@ -1200,7 +1201,7 @@ jobs:
           path: llama-bin-win-hip-${{ matrix.name }}-x64.zip
           name: llama-bin-win-hip-${{ matrix.name }}-x64.zip
 
-  ios-xcode-build:
+  ios-xcode:
     needs: [check_release]
     if: ${{ needs.check_release.outputs.should_release == 'true' }}
     # TODO: figure out how to make this work with macos-26
@@ -1352,7 +1353,7 @@ jobs:
 #          path: llama-${{ steps.tag.outputs.name }}-bin-${{ matrix.chip_type }}-openEuler-${{ matrix.arch }}${{ matrix.use_acl_graph == 'on' && '-aclgraph' || '' }}.tar.gz
 #          name: llama-bin-${{ matrix.chip_type }}-openEuler-${{ matrix.arch }}${{ matrix.use_acl_graph == 'on' && '-aclgraph' || '' }}.tar.gz
 
-  ui-build:
+  ui:
     needs: [check_release]
     if: ${{ needs.check_release.outputs.should_release == 'true' }}
     uses: ./.github/workflows/ui-build.yml
@@ -1380,9 +1381,9 @@ jobs:
       #- ubuntu-24-sycl
       - android-arm64
       - macos-cpu
-      - ios-xcode-build
+      - ios-xcode
       #- openEuler-cann
-      - ui-build
+      - ui
 
     outputs:
       tag_name: ${{ steps.tag.outputs.name }}
diff --git a/build-xcframework.sh b/build-xcframework.sh
index d287d72fbd8..1da7b9bda9b 100755
--- a/build-xcframework.sh
+++ b/build-xcframework.sh
@@ -416,7 +416,7 @@ cmake -B build-ios-sim -G Xcode \
     -DCMAKE_CXX_FLAGS="${COMMON_CXX_FLAGS}" \
     -DLLAMA_OPENSSL=OFF \
     -S .
-cmake --build build-ios-sim --config Release -- -quiet
+cmake --build build-ios-sim --config Release -j $(sysctl -n hw.logicalcpu) -- -quiet
 
 echo "Building for iOS devices..."
 cmake -B build-ios-device -G Xcode \
@@ -430,7 +430,7 @@ cmake -B build-ios-device -G Xcode \
     -DCMAKE_CXX_FLAGS="${COMMON_CXX_FLAGS}" \
     -DLLAMA_OPENSSL=OFF \
     -S .
-cmake --build build-ios-device --config Release -- -quiet
+cmake --build build-ios-device --config Release -j $(sysctl -n hw.logicalcpu) -- -quiet
 
 echo "Building for macOS..."
 cmake -B build-macos -G Xcode \
@@ -441,7 +441,7 @@ cmake -B build-macos -G Xcode \
     -DCMAKE_CXX_FLAGS="${COMMON_CXX_FLAGS}" \
     -DLLAMA_OPENSSL=OFF \
     -S .
-cmake --build build-macos --config Release -- -quiet
+cmake --build build-macos --config Release -j $(sysctl -n hw.logicalcpu) -- -quiet
 
 echo "Building for visionOS..."
 cmake -B build-visionos -G Xcode \
@@ -456,7 +456,7 @@ cmake -B build-visionos -G Xcode \
     -DLLAMA_OPENSSL=OFF \
     -DLLAMA_BUILD_SERVER=OFF \
     -S .
-cmake --build build-visionos --config Release -- -quiet
+cmake --build build-visionos --config Release -j $(sysctl -n hw.logicalcpu) -- -quiet
 
 echo "Building for visionOS simulator..."
 cmake -B build-visionos-sim -G Xcode \
@@ -471,7 +471,7 @@ cmake -B build-visionos-sim -G Xcode \
     -DLLAMA_OPENSSL=OFF \
     -DLLAMA_BUILD_SERVER=OFF \
     -S .
-cmake --build build-visionos-sim --config Release -- -quiet
+cmake --build build-visionos-sim --config Release -j $(sysctl -n hw.logicalcpu) -- -quiet
 
 # Add tvOS builds (might need the same u_int definitions as watchOS and visionOS)
 echo "Building for tvOS simulator..."
@@ -487,7 +487,7 @@ cmake -B build-tvos-sim -G Xcode \
     -DCMAKE_CXX_FLAGS="${COMMON_CXX_FLAGS}" \
     -DLLAMA_OPENSSL=OFF \
     -S .
-cmake --build build-tvos-sim --config Release -- -quiet
+cmake --build build-tvos-sim --config Release -j $(sysctl -n hw.logicalcpu) -- -quiet
 
 echo "Building for tvOS devices..."
 cmake -B build-tvos-device -G Xcode \
@@ -502,7 +502,7 @@ cmake -B build-tvos-device -G Xcode \
     -DCMAKE_CXX_FLAGS="${COMMON_CXX_FLAGS}" \
     -DLLAMA_OPENSSL=OFF \
     -S .
-cmake --build build-tvos-device --config Release -- -quiet
+cmake --build build-tvos-device --config Release -j $(sysctl -n hw.logicalcpu) -- -quiet
 
 # Setup frameworks and copy binaries and headers
 echo "Setting up framework structures..."

From 6e093b80eae32b1fec18848fe53320b220de1451 Mon Sep 17 00:00:00 2001
From: Ruben Ortlam <rortlam@redhat.com>
Date: Sat, 30 May 2026 10:39:31 +0200
Subject: [PATCH 45/50] vulkan: add Flash Attention support for BFloat16 KV
 cache (#23420)

* vulkan: add flash attention bf16 kv support

* vulkan: bf16 FA coopmat1 support

* vulkan: bf16 FA coopmat2 support

* fix FA bf16 f32 fallback

* fix FA bf16 coopmat1 shader

* fix FA bf16 coopmat2 shader

* code cleanup

* cleanup comment change

* address feedback

* add O_TYPE for cm2 FA

* use O_TYPE for gqaStore function

* reduce BFLOAT16 ifdefs
---
 ggml/src/ggml-vulkan/ggml-vulkan.cpp          | 148 +++++++++++++-----
 .../vulkan-shaders/flash_attn_base.glsl       |  12 +-
 .../vulkan-shaders/flash_attn_cm1.comp        |  98 +++++++-----
 .../vulkan-shaders/flash_attn_cm2.comp        |  36 +++--
 .../vulkan-shaders/flash_attn_dequant.glsl    |   8 +
 .../vulkan-shaders/vulkan-shaders-gen.cpp     |  22 +++
 6 files changed, 235 insertions(+), 89 deletions(-)

diff --git a/ggml/src/ggml-vulkan/ggml-vulkan.cpp b/ggml/src/ggml-vulkan/ggml-vulkan.cpp
index c9f906d7930..2a30fb95c61 100644
--- a/ggml/src/ggml-vulkan/ggml-vulkan.cpp
+++ b/ggml/src/ggml-vulkan/ggml-vulkan.cpp
@@ -691,6 +691,7 @@ struct vk_device_struct {
     uint32_t coopmat_int_k;
 
     bool coopmat2;
+    bool coopmat2_bf16_support {};
     bool coopmat2_decode_vector;
 
     bool pipeline_executable_properties_support {};
@@ -3139,7 +3140,7 @@ struct vk_fa_tuning_params {
 };
 
 static bool ggml_vk_flash_attn_scalar_shmem_support(const vk_device& device, const vk_fa_tuning_params& params, uint32_t hsk, uint32_t hsv, bool f32acc, ggml_type k_type, ggml_type v_type);
-static bool ggml_vk_flash_attn_coopmat_shmem_support(const vk_device& device, const vk_fa_tuning_params& params, uint32_t hsk, uint32_t hsv, bool f32acc);
+static bool ggml_vk_flash_attn_coopmat_shmem_support(const vk_device& device, const vk_fa_tuning_params& params, uint32_t hsk, uint32_t hsv, bool f32acc, ggml_type k_type = GGML_TYPE_F16);
 
 static vk_fa_tuning_params get_fa_tuning_params_scalar(const vk_device& device, uint32_t hsk, uint32_t hsv, uint32_t n_rows, uint32_t n_kv, ggml_type k_type, ggml_type v_type, bool f32acc) {
 
@@ -3279,6 +3280,13 @@ static vk_fa_tuning_params get_fa_tuning_params(const vk_device& device, uint32_
     FaCodePath path = device->coopmat2 ? FA_COOPMAT2 :
                       device->coopmat1_fa_support ? FA_COOPMAT1 : FA_SCALAR;
 
+    if (path == FA_COOPMAT2 && k_type == GGML_TYPE_BF16 && !device->coopmat2_bf16_support) {
+        path = FA_COOPMAT1;
+    }
+    if (path == FA_COOPMAT1 && k_type == GGML_TYPE_BF16 && !device->coopmat_bf16_support) {
+        path = FA_SCALAR;
+    }
+
     if (path == FA_COOPMAT1 && device->architecture == vk_device_architecture::NVIDIA_TURING) {
         // Nvidia compiler bug, see https://github.com/ggml-org/llama.cpp/pull/19075#issuecomment-3820716090
         path = FA_SCALAR;
@@ -3288,7 +3296,7 @@ static vk_fa_tuning_params get_fa_tuning_params(const vk_device& device, uint32_
         bool shape_ok = (f32acc && device->coopmat_support_16x16x16_f32acc) ||
                         (!f32acc && device->coopmat_support_16x16x16_f16acc);
         const vk_fa_tuning_params params = get_fa_tuning_params_coopmat1(device, hsk, hsv, n_rows, n_kv, k_type, v_type, f32acc);
-        bool shmem_ok = ggml_vk_flash_attn_coopmat_shmem_support(device, params, hsk, hsv, f32acc);
+        bool shmem_ok = ggml_vk_flash_attn_coopmat_shmem_support(device, params, hsk, hsv, f32acc, k_type);
 
         if (!shape_ok || !shmem_ok) {
             path = FA_SCALAR;
@@ -3334,8 +3342,8 @@ static vk_fa_pipeline_state get_fa_pipeline_state(const vk_device& device, const
 
 static std::vector<uint32_t> get_fa_spec_constants(const vk_fa_pipeline_state& state) {
     const auto fa_block_bytes = [](ggml_type t) -> uint32_t {
-        // decodeBufF32 uses a block of vec4s for a better memory access pattern.
-        return t == GGML_TYPE_F32 ? 16u : (uint32_t) ggml_type_size(t);
+        if (t == GGML_TYPE_F32) return 16u;
+        return (uint32_t) ggml_type_size(t);
     };
     return {
         /* 0 WorkGroupSize   */ state.workgroup_size,
@@ -3849,10 +3857,16 @@ static void ggml_vk_load_shaders(vk_device& device) {
         const uint32_t fa_sgs = fa.first.subgroup_size;
         const bool fa_ds = fa.first.subgroup_size == 0;
 
+        const bool bf16_kv = fa.first.k_type == GGML_TYPE_BF16;
         const bool use_mmq = ggml_vk_fa_scalar_uses_mmq(device, fa.first.k_type);
         const void * spv_data = nullptr;
         size_t spv_size = 0;
-        if (use_mmq) {
+        const char *name = nullptr;
+        if (bf16_kv) {
+            spv_data = flash_attn_f32_f16_fp32_data;
+            spv_size = flash_attn_f32_f16_fp32_len;
+            name = aligned ? "flash_attn_f32_bf16_aligned" : "flash_attn_f32_bf16";
+        } else if (use_mmq) {
 #if defined(GGML_VULKAN_INTEGER_DOT_GLSLC_SUPPORT)
             if (device->fp16) {
                 if (f32acc) { spv_data = flash_attn_f32_f16_int8_data;        spv_size = flash_attn_f32_f16_int8_len; }
@@ -3862,6 +3876,7 @@ static void ggml_vk_load_shaders(vk_device& device) {
                 spv_size = flash_attn_f32_f16_fp32_int8_len;
             }
 #endif
+            name = aligned ? "flash_attn_f32_f16_aligned" : "flash_attn_f32_f16";
         } else {
             if (device->fp16) {
                 if (f32acc) { spv_data = flash_attn_f32_f16_data;        spv_size = flash_attn_f32_f16_len; }
@@ -3870,8 +3885,8 @@ static void ggml_vk_load_shaders(vk_device& device) {
                 spv_data = flash_attn_f32_f16_fp32_data;
                 spv_size = flash_attn_f32_f16_fp32_len;
             }
+            name = aligned ? "flash_attn_f32_f16_aligned" : "flash_attn_f32_f16";
         }
-        const char *name = aligned ? "flash_attn_f32_f16_aligned" : "flash_attn_f32_f16";
         ggml_vk_create_pipeline(device, fa.second, name, spv_size, spv_data, "main", 7,
                                 sizeof(vk_flash_attn_push_constants), {Br, 1, 1},
                                 get_fa_spec_constants(fa.first), aligned ? Bc : 1, true,
@@ -3889,11 +3904,25 @@ static void ggml_vk_load_shaders(vk_device& device) {
             const uint32_t fa_sgs = fa.first.subgroup_size;
             const bool fa_ds = fa.first.subgroup_size == 0;
 
+            const bool bf16_kv = fa.first.k_type == GGML_TYPE_BF16;
+
             const void * spv_data;
             size_t spv_size;
-            if (f32acc) { spv_data = flash_attn_f32_f16_cm1_data;        spv_size = flash_attn_f32_f16_cm1_len; }
-            else        { spv_data = flash_attn_f32_f16_f16acc_cm1_data; spv_size = flash_attn_f32_f16_f16acc_cm1_len; }
-            const char *name = aligned ? "flash_attn_f32_f16_aligned_cm1" : "flash_attn_f32_f16_cm1";
+            const char *name;
+            if (bf16_kv) {
+#if defined(VK_KHR_shader_bfloat16) && defined(GGML_VULKAN_BFLOAT16_GLSLC_SUPPORT)
+                if (!device->coopmat_bf16_support) continue;
+                spv_data = flash_attn_f32_f16_bf16_cm1_data;
+                spv_size = flash_attn_f32_f16_bf16_cm1_len;
+                name = aligned ? "flash_attn_f32_bf16_aligned_cm1" : "flash_attn_f32_bf16_cm1";
+#else
+                continue;
+#endif
+            } else {
+                if (f32acc) { spv_data = flash_attn_f32_f16_cm1_data;        spv_size = flash_attn_f32_f16_cm1_len; }
+                else        { spv_data = flash_attn_f32_f16_f16acc_cm1_data; spv_size = flash_attn_f32_f16_f16acc_cm1_len; }
+                name = aligned ? "flash_attn_f32_f16_aligned_cm1" : "flash_attn_f32_f16_cm1";
+            }
             ggml_vk_create_pipeline(device, fa.second, name, spv_size, spv_data, "main", 7,
                                     sizeof(vk_flash_attn_push_constants), {Br, 1, 1},
                                     get_fa_spec_constants(fa.first), aligned ? Bc : 1, true,
@@ -3911,10 +3940,20 @@ static void ggml_vk_load_shaders(vk_device& device) {
             const bool aligned = fa.first.aligned;
             const bool f32acc = fa.first.f32acc;
 
+            const bool bf16_kv = fa.first.k_type == GGML_TYPE_BF16;
             const void * spv_data;
             size_t spv_size;
             const char * name;
-            if (aligned) {
+            if (bf16_kv) {
+#if defined(VK_KHR_shader_bfloat16) && defined(GGML_VULKAN_BFLOAT16_GLSLC_SUPPORT)
+                if (!device->coopmat2_bf16_support) continue;
+                spv_data = flash_attn_f32_f16_bf16_cm2_data;
+                spv_size = flash_attn_f32_f16_bf16_cm2_len;
+                name = aligned ? "flash_attn_f32_bf16_aligned_cm2" : "flash_attn_f32_bf16_cm2";
+#else
+                continue;
+#endif
+            } else if (aligned) {
                 if (f32acc) { spv_data = flash_attn_f32_f16_cm2_data;        spv_size = flash_attn_f32_f16_cm2_len;        name = "flash_attn_f32_f16_aligned_f32acc_cm2"; }
                 else        { spv_data = flash_attn_f32_f16_f16acc_cm2_data; spv_size = flash_attn_f32_f16_f16acc_cm2_len; name = "flash_attn_f32_f16_aligned_f16acc_cm2"; }
             } else {
@@ -5784,46 +5823,72 @@ static vk_device ggml_vk_get_device(size_t idx) {
                      found_fp16_256 = false,
                      found_fp32_128 = false,
                      found_fp32_256 = false;
+                bool found_bf16_128 = false,
+                     found_bf16_256 = false;
                 // need to support fp16*fp16 with fp16/fp32 accumulator, for workgroupsize 128
                 // with 32x16x16 and 256 with 32x32x16.
                 for (auto &prop : flexible_dimensions) {
                     if (prop.saturatingAccumulation == VK_FALSE &&
-                        prop.scope == VK_SCOPE_WORKGROUP_KHR &&
-                        prop.AType == VK_COMPONENT_TYPE_FLOAT16_KHR &&
-                        prop.BType == VK_COMPONENT_TYPE_FLOAT16_KHR) {
-
-                        if (prop.workgroupInvocations == 128 &&
-                            prop.MGranularity <= 32 &&
-                            prop.NGranularity <= 16 &&
-                            prop.KGranularity <= 16) {
-                            if (prop.CType == VK_COMPONENT_TYPE_FLOAT16_KHR &&
-                                prop.ResultType == VK_COMPONENT_TYPE_FLOAT16_KHR) {
-                                found_fp16_128 = true;
+                        prop.scope == VK_SCOPE_WORKGROUP_KHR) {
+
+                        if (prop.AType == VK_COMPONENT_TYPE_FLOAT16_KHR &&
+                            prop.BType == VK_COMPONENT_TYPE_FLOAT16_KHR) {
+
+                            if (prop.workgroupInvocations == 128 &&
+                                prop.MGranularity <= 32 &&
+                                prop.NGranularity <= 16 &&
+                                prop.KGranularity <= 16) {
+                                if (prop.CType == VK_COMPONENT_TYPE_FLOAT16_KHR &&
+                                    prop.ResultType == VK_COMPONENT_TYPE_FLOAT16_KHR) {
+                                    found_fp16_128 = true;
+                                }
+                                if (prop.CType == VK_COMPONENT_TYPE_FLOAT32_KHR &&
+                                    prop.ResultType == VK_COMPONENT_TYPE_FLOAT32_KHR) {
+                                    found_fp32_128 = true;
+                                }
                             }
-                            if (prop.CType == VK_COMPONENT_TYPE_FLOAT32_KHR &&
-                                prop.ResultType == VK_COMPONENT_TYPE_FLOAT32_KHR) {
-                                found_fp32_128 = true;
+                            if (prop.workgroupInvocations == 256 &&
+                                prop.MGranularity <= 32 &&
+                                prop.NGranularity <= 32 &&
+                                prop.KGranularity <= 16) {
+                                if (prop.CType == VK_COMPONENT_TYPE_FLOAT16_KHR &&
+                                    prop.ResultType == VK_COMPONENT_TYPE_FLOAT16_KHR) {
+                                    found_fp16_256 = true;
+                                }
+                                if (prop.CType == VK_COMPONENT_TYPE_FLOAT32_KHR &&
+                                    prop.ResultType == VK_COMPONENT_TYPE_FLOAT32_KHR) {
+                                    found_fp32_256 = true;
+                                }
                             }
                         }
-                        if (prop.workgroupInvocations == 256 &&
-                            prop.MGranularity <= 32 &&
-                            prop.NGranularity <= 32 &&
-                            prop.KGranularity <= 16) {
-                            if (prop.CType == VK_COMPONENT_TYPE_FLOAT16_KHR &&
-                                prop.ResultType == VK_COMPONENT_TYPE_FLOAT16_KHR) {
-                                found_fp16_256 = true;
+
+#if defined(VK_KHR_shader_bfloat16) && defined(GGML_VULKAN_BFLOAT16_GLSLC_SUPPORT)
+                        if (prop.AType == VK_COMPONENT_TYPE_BFLOAT16_KHR &&
+                            prop.BType == VK_COMPONENT_TYPE_BFLOAT16_KHR &&
+                            prop.CType == VK_COMPONENT_TYPE_FLOAT32_KHR &&
+                            prop.ResultType == VK_COMPONENT_TYPE_FLOAT32_KHR) {
+
+                            if (prop.workgroupInvocations == 128 &&
+                                prop.MGranularity <= 32 &&
+                                prop.NGranularity <= 16 &&
+                                prop.KGranularity <= 16) {
+                                found_bf16_128 = true;
                             }
-                            if (prop.CType == VK_COMPONENT_TYPE_FLOAT32_KHR &&
-                                prop.ResultType == VK_COMPONENT_TYPE_FLOAT32_KHR) {
-                                found_fp32_256 = true;
+                            if (prop.workgroupInvocations == 256 &&
+                                prop.MGranularity <= 32 &&
+                                prop.NGranularity <= 32 &&
+                                prop.KGranularity <= 16) {
+                                found_bf16_256 = true;
                             }
                         }
+#endif
                     }
                 }
                 if (found_fp16_128 && found_fp16_256 &&
                     found_fp32_128 && found_fp32_256 &&
                     coopmat2_props.cooperativeMatrixFlexibleDimensionsMaxDimension >= 512) {
                     device->coopmat2 = true;
+                    device->coopmat2_bf16_support = found_bf16_128 && found_bf16_256;
                     device->coopmat2_decode_vector = coopmat2_decode_vector_support && coopmat2_decode_vector_features.cooperativeMatrixDecodeVector;
                 }
             }
@@ -9448,7 +9513,8 @@ static bool ggml_vk_flash_attn_scalar_shmem_support(const vk_device& device, con
     const uint32_t Br = params.block_rows;
     const uint32_t Bc = params.block_cols;
 
-    const uint32_t float_type_size = device->fp16 ? sizeof(ggml_fp16_t) : sizeof(float);
+    // BF16 uses the fp32 shader (FLOAT_TYPE=float)
+    const uint32_t float_type_size = (device->fp16 && k_type != GGML_TYPE_BF16) ? sizeof(ggml_fp16_t) : sizeof(float);
 
     const bool mmq = ggml_vk_fa_scalar_uses_mmq(device, k_type);
 
@@ -9489,7 +9555,7 @@ static bool ggml_vk_flash_attn_scalar_shmem_support(const vk_device& device, con
     return supported;
 }
 
-static bool ggml_vk_flash_attn_coopmat_shmem_support(const vk_device& device, const vk_fa_tuning_params& params, uint32_t hsk, uint32_t hsv, bool f32acc) {
+static bool ggml_vk_flash_attn_coopmat_shmem_support(const vk_device& device, const vk_fa_tuning_params& params, uint32_t hsk, uint32_t hsv, bool f32acc, ggml_type k_type) {
     // Needs to be kept up to date on shader changes
     const uint32_t Br = params.block_rows;
     const uint32_t Bc = params.block_cols;
@@ -9519,8 +9585,10 @@ static bool ggml_vk_flash_attn_coopmat_shmem_support(const vk_device& device, co
     const uint32_t vsh_stride = MatBc / 4 * row_split;
     const uint32_t ksh = ((kvshstride >= vsh_stride) ? (Bc * kvshstride) : (Bc * vsh_stride)) * f16vec4;
 
+    // BF16 PVMat accumulator is f32 (no bf16 accumulator support), so pvsh is vec4 (16 bytes)
+    const uint32_t pvsh_elem_size = (k_type == GGML_TYPE_BF16) ? 16u : f16vec4;
     const uint32_t osh_stride = params.row_split * MatBr / 4;
-    const uint32_t pvsh = MatBc * osh_stride * f16vec4;
+    const uint32_t pvsh = MatBc * osh_stride * pvsh_elem_size;
 
     const uint32_t slope = Br * acctype;
 
@@ -9589,7 +9657,7 @@ static void ggml_vk_flash_attn(ggml_backend_vk_context * ctx, vk_context& subctx
     uint32_t workgroups_y = (uint32_t)neq2;
     uint32_t workgroups_z = (uint32_t)neq3;
 
-    const bool f32acc = !ctx->device->fp16 || dst->op_params[3] == GGML_PREC_F32;
+    const bool f32acc = !ctx->device->fp16 || dst->op_params[3] == GGML_PREC_F32 || k->type == GGML_TYPE_BF16;
 
     // For scalar/coopmat1 FA, we can use the "large" size to accommodate qga.
     // For coopmat2 FA, we always use the small size (which is still pretty large for gqa).
@@ -16400,6 +16468,7 @@ static bool ggml_backend_vk_device_supports_op(ggml_backend_dev_t dev, const ggm
                     switch (t) {
                     case GGML_TYPE_F32:
                     case GGML_TYPE_F16:
+                    case GGML_TYPE_BF16:
                     case GGML_TYPE_Q8_0:
                     case GGML_TYPE_Q5_1:
                     case GGML_TYPE_Q5_0:
@@ -16415,6 +16484,9 @@ static bool ggml_backend_vk_device_supports_op(ggml_backend_dev_t dev, const ggm
                 if (!fa_kv_ok(op->src[1]->type) || !fa_kv_ok(op->src[2]->type)) {
                     return false;
                 }
+                if ((op->src[1]->type == GGML_TYPE_BF16) != (op->src[2]->type == GGML_TYPE_BF16)) {
+                    return false;
+                }
                 if (!coopmat2 && !(device->subgroup_shuffle && device->subgroup_vote)) {
                     // scalar/coopmat1 FA uses subgroupShuffle/subgroupAll
                     return false;
diff --git a/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_base.glsl b/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_base.glsl
index 9a7957da97b..66dcf610219 100644
--- a/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_base.glsl
+++ b/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_base.glsl
@@ -97,8 +97,17 @@ layout (binding = 6) readonly buffer MO {uint32_t data_mask_opt[];};
 #define FA_TYPE_Q5_0  6u
 #define FA_TYPE_Q5_1  7u
 #define FA_TYPE_Q8_0  8u
+#define FA_TYPE_BF16 30u
 #define FA_TYPE_Q1_0 41u
 
+#if defined(BFLOAT16)
+#define O_TYPE float
+#define O_TYPEV4 vec4
+#else
+#define O_TYPE FLOAT_TYPE
+#define O_TYPEV4 FLOAT_TYPEV4
+#endif
+
 // Number of matrix elements per buffer block, derived from the K/V type spec
 // constant. F32 is treated as a vec4 "block" of 4 floats. F16 uses block size 1
 // and bypasses the dequant path entirely. Quants follow their ggml block sizes.
@@ -111,6 +120,7 @@ uint fa_block_elems(uint ty) {
         case FA_TYPE_Q5_0: return uint(QUANT_K_Q5_0);
         case FA_TYPE_Q5_1: return uint(QUANT_K_Q5_1);
         case FA_TYPE_Q8_0: return uint(QUANT_K_Q8_0);
+        case FA_TYPE_BF16: return 1u;
         case FA_TYPE_Q1_0: return uint(QUANT_K_Q1_0); // cm2-only, harmless elsewhere
         default:           return 1u;
     }
@@ -248,7 +258,7 @@ const float FATTN_KQ_MAX_OFFSET = 3.0f*0.6931f;
 
 // Store the output when doing grouped query attention.
 // Rows index by Q's dimension 2, and the first N rows are valid.
-void gqaStore(const in uint32_t r, const in uint32_t c, const in FLOAT_TYPEV4 elems, const in uint32_t o_offset, const in uint32_t iq2, const in uint32_t N)
+void gqaStore(const in uint32_t r, const in uint32_t c, const in O_TYPEV4 elems, const in uint32_t o_offset, const in uint32_t iq2, const in uint32_t N)
 {
     uint32_t offset = (iq2 + r) * HSV / 4 + c;
     data_ov4[o_offset + offset] = D_TYPEV4(elems);
diff --git a/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_cm1.comp b/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_cm1.comp
index bffcc095be3..23ae3833e52 100644
--- a/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_cm1.comp
+++ b/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_cm1.comp
@@ -6,6 +6,10 @@
 #extension GL_EXT_shader_explicit_arithmetic_types_float16 : require
 #extension GL_EXT_shader_explicit_arithmetic_types_int32 : require
 
+#if defined(BFLOAT16)
+#extension GL_EXT_bfloat16 : enable
+#endif
+
 #extension GL_KHR_shader_subgroup_basic : enable
 #extension GL_KHR_shader_subgroup_arithmetic : enable
 #extension GL_KHR_shader_subgroup_vote : enable
@@ -14,7 +18,9 @@
 
 #include "types.glsl"
 #include "flash_attn_base.glsl"
+#if !defined(BFLOAT16)
 #include "flash_attn_dequant.glsl"
+#endif
 
 // These need to be supported N,M values for a MatBc x MatBr x 16 coopmatmuladd
 const uint32_t MatBr = 16;
@@ -27,32 +33,32 @@ const uint32_t cols_per_thread = Bc / cols_per_iter;
 
 layout (binding = 0) readonly buffer Q {float data_q[];};
 layout (binding = 0) readonly buffer QV4 {vec4 data_qv4[];};
-layout (binding = 1) readonly buffer K {float16_t data_k[];};
-layout (binding = 1) readonly buffer KV4 {f16vec4 data_kv4[];};
-layout (binding = 2) readonly buffer V {float16_t data_v[];};
-layout (binding = 2) readonly buffer VV4 {f16vec4 data_vv4[];};
+layout (binding = 1) readonly buffer K {FLOAT_TYPE data_k[];};
+layout (binding = 1) readonly buffer KV4 {FLOAT_TYPEV4 data_kv4[];};
+layout (binding = 2) readonly buffer V {FLOAT_TYPE data_v[];};
+layout (binding = 2) readonly buffer VV4 {FLOAT_TYPEV4 data_vv4[];};
 layout (binding = 3) readonly buffer M {float16_t data_m[];};
 
 shared float tmpsh[row_split];
 
-const uint32_t qstride = HSK_pad / 4 + 2; // in units of f16vec4
-shared f16vec4 Qf[Br * qstride];
+const uint32_t qstride = HSK_pad / 4 + 2;
+shared FLOAT_TYPEV4 Qf[Br * qstride];
 
 const uint psh_stride = Br / 4 + 2;
-shared f16vec4 Psh[Bc * psh_stride];
+shared FLOAT_TYPEV4 Psh[Bc * psh_stride];
 
 // Avoid padding for hsk==256 to make it fit in 48KB shmem.
 const uint32_t sfshstride = (HSK <= 128) ? (Br / 4 + 2) : Br / 4;
 shared ACC_TYPEV4 sfsh[Bc * sfshstride];
 
 const uint32_t D_pad = HSK_pad > HSV_pad ? HSK_pad : HSV_pad;
-const uint32_t kvsh_stride = (SHMEM_STAGING != 0 ? D_pad : MatBr) / 4 + 2; // in units of f16vec4
+const uint32_t kvsh_stride = (SHMEM_STAGING != 0 ? D_pad : MatBr) / 4 + 2;
 const uint v_cols = MatBc / 4 * row_split; // total cols, 4 vec4s per MatBc * number of subgroups
 const uint vsh_stride = v_cols;
-shared f16vec4 kvsh[(kvsh_stride >= vsh_stride) ? (Bc * kvsh_stride) : (Bc * vsh_stride)];
+shared FLOAT_TYPEV4 kvsh[(kvsh_stride >= vsh_stride) ? (Bc * kvsh_stride) : (Bc * vsh_stride)];
 
 const uint32_t osh_stride = row_split * MatBr / 4;
-shared f16vec4 pvsh[MatBc * osh_stride];
+shared O_TYPEV4 pvsh[MatBc * osh_stride];
 
 shared ACC_TYPE slope[Br];
 
@@ -76,7 +82,7 @@ void main() {
     if ((HSK % 16) != 0) {
         [[unroll]] for (uint i = 0; i < Br * qstride; i += gl_WorkGroupSize.x) {
             if (i + tid < Br * qstride) {
-                Qf[i + tid] = f16vec4(0);
+                Qf[i + tid] = FLOAT_TYPEV4(0);
             }
         }
         barrier();
@@ -89,15 +95,15 @@ void main() {
         uint32_t r = (idx + tid) / (HSK / 4);
         if (r < Br && d < HSK / 4 &&
             i * Br + r < N) {
-            Qf[r * qstride + d] = f16vec4(data_qv4[q_offset / 4 + (i * Br + r) * q_stride / 4 + d] * p.scale);
+            Qf[r * qstride + d] = FLOAT_TYPEV4(data_qv4[q_offset / 4 + (i * Br + r) * q_stride / 4 + d] * p.scale);
         }
     }
     barrier();
 
-    f16vec4 Of[rows_per_thread][d_per_thread];
+    O_TYPEV4 Of[rows_per_thread][d_per_thread];
     [[unroll]] for (uint32_t r = 0; r < rows_per_thread; ++r) {
         [[unroll]] for (uint32_t d = 0; d < d_per_thread; ++d) {
-            Of[r][d] = f16vec4(0.0);
+            Of[r][d] = O_TYPEV4(0.0);
         }
     }
 
@@ -222,15 +228,18 @@ void main() {
                 uint32_t d = (idx + tid) % (HSK_pad / 4);
                 uint32_t c = (idx + tid) / (HSK_pad / 4);
                 if (idx + gl_WorkGroupSize.x <= Bc * HSK_pad / 4 || c < Bc) {
-                    f16vec4 K_Tf = f16vec4(0);
+                    FLOAT_TYPEV4 K_Tf = FLOAT_TYPEV4(0);
                     if ((!KV_bounds_check || j * Bc + c < KV) && (HSK == HSK_pad || d < HSK / 4)) {
+#if !defined(BFLOAT16)
                         if (USE_DECODE_K) {
                             uint coord = (j * Bc + c) * k_stride * BLOCK_SIZE_K + 4 * d;
                             uint ib = coord / BLOCK_SIZE_K;
                             uint iqs = (coord % BLOCK_SIZE_K);
                             K_Tf = dequantize4(ib, iqs, k_offset, BINDING_IDX_K);
-                        } else {
-                            K_Tf = f16vec4(data_kv4[k_offset / 4 + (j * Bc + c) * k_stride / 4 + d]);
+                        } else
+#endif
+                        {
+                            K_Tf = FLOAT_TYPEV4(data_kv4[k_offset / 4 + (j * Bc + c) * k_stride / 4 + d]);
                         }
                     }
 
@@ -244,16 +253,16 @@ void main() {
         // Bc split across workgroup (four subgroups), loop over HSK in chunks of 16: 16 x 16 * 16 x 16 -> 16 x 16
         // This is written transposed in order to allow for N being 8 if implementations need it
         coopmat<ACC_TYPE, gl_ScopeSubgroup, MatBc, MatBr, gl_MatrixUseAccumulator> SfMat = coopmat<ACC_TYPE, gl_ScopeSubgroup, MatBc, MatBr, gl_MatrixUseAccumulator>(0);
-        coopmat<float16_t, gl_ScopeSubgroup, MatBc, 16, gl_MatrixUseA> KMat;
-        coopmat<float16_t, gl_ScopeSubgroup, 16, MatBr, gl_MatrixUseB> QMat;
+        coopmat<FLOAT_TYPE, gl_ScopeSubgroup, MatBc, 16, gl_MatrixUseA> KMat;
+        coopmat<FLOAT_TYPE, gl_ScopeSubgroup, 16, MatBr, gl_MatrixUseB> QMat;
 
         [[unroll]] for (uint32_t d = 0; d < HSK_pad / 16; ++d) {
             // If SHMEM_STAGING is set, a Bc * HSK_pad size tile of K is loaded to shmem
-            // If not, f16 K is loaded directly from global memory if aligned, otherwise
+            // If not, K is loaded directly from global memory if aligned, otherwise
             // staged through a Bc * MatBr size staging buffer.
-            // If K is not type f16, then it is always staged for dequantization.
+            // If K is a quant type, then it is always staged for dequantization.
             if (SHMEM_STAGING == 0) {
-            // For quants we always need to dequant into kvsh; for f16 we can load
+            // For quants we always need to dequant into kvsh; for f16/bf16 we can load
             // directly from global memory when alignment / bounds allow it.
             const bool stage_k = USE_DECODE_K || KV_bounds_check || d * 16 + 16 > HSK;
             if (stage_k) {
@@ -262,15 +271,18 @@ void main() {
                     uint32_t col_vec = (idx + tid) % (MatBr / 4);
                     uint32_t row = (idx + tid) / (MatBr / 4);
                     if (idx + tid < Bc * MatBr / 4) {
-                        f16vec4 K_Tf = f16vec4(0);
+                        FLOAT_TYPEV4 K_Tf = FLOAT_TYPEV4(0);
                         if ((!KV_bounds_check || j * Bc + row < KV) && (HSK == HSK_pad || d * 16 + col_vec * 4 < HSK)) {
+#if !defined(BFLOAT16)
                             if (USE_DECODE_K) {
                                 uint coord = (j * Bc + row) * k_stride * BLOCK_SIZE_K + d * 16 + col_vec * 4;
                                 uint ib = coord / BLOCK_SIZE_K;
                                 uint iqs = (coord % BLOCK_SIZE_K);
                                 K_Tf = dequantize4(ib, iqs, k_offset, BINDING_IDX_K);
-                            } else {
-                                K_Tf = f16vec4(data_kv4[k_offset / 4 + (j * Bc + row) * k_stride / 4 + d * 16 / 4 + col_vec]);
+                            } else
+#endif
+                            {
+                                K_Tf = FLOAT_TYPEV4(data_kv4[k_offset / 4 + (j * Bc + row) * k_stride / 4 + d * 16 / 4 + col_vec]);
                             }
                         }
 
@@ -357,7 +369,7 @@ void main() {
         [[unroll]] for (uint32_t d0 = 0; d0 < HSV / 4; d0 += threads_per_rowgroup) {
             const uint d_local = d0 / threads_per_rowgroup;
             [[unroll]] for (uint32_t r = 0; r < rows_per_thread; ++r) {
-                Of[r][d_local] = float16_t(eMf[r]) * Of[r][d_local];
+                Of[r][d_local] = O_TYPE(eMf[r]) * Of[r][d_local];
             }
         }
 
@@ -368,10 +380,10 @@ void main() {
             [[unroll]] for (uint32_t r = 0; r < rows_per_thread; r += 4) {
                 const uint row = tile_row(r);
                 if (KV_bounds_check && j * Bc + col >= KV) {
-                    Psh[col * psh_stride + row / 4] = f16vec4(0.0f);
+                    Psh[col * psh_stride + row / 4] = FLOAT_TYPEV4(0.0f);
                 } else {
                     const vec4 mfvec = vec4(Mf[r], Mf[r + 1], Mf[r + 2], Mf[r + 3]);
-                    const f16vec4 Pf = f16vec4(exp(vec4(sfsh[row / 4 + col * sfshstride]) - mfvec));
+                    const FLOAT_TYPEV4 Pf = FLOAT_TYPEV4(exp(vec4(sfsh[row / 4 + col * sfshstride]) - mfvec));
                     [[unroll]] for (uint32_t vec_idx = 0; vec_idx < 4; ++vec_idx) {
                         Lf[r + vec_idx] += Pf[vec_idx];
                     }
@@ -385,15 +397,18 @@ void main() {
                 uint32_t d = (idx + tid) % (HSV_pad / 4);
                 uint32_t c = (idx + tid) / (HSV_pad / 4);
                 if (idx + gl_WorkGroupSize.x <= Bc * HSV_pad / 4 || c < Bc) {
-                    f16vec4 V_Tf = f16vec4(0);
+                    FLOAT_TYPEV4 V_Tf = FLOAT_TYPEV4(0);
                     if ((!KV_bounds_check || j * Bc + c < KV) && (HSV == HSV_pad || d < HSV / 4)) {
+#if !defined(BFLOAT16)
                         if (USE_DECODE_V) {
                             uint coord = (j * Bc + c) * v_stride * BLOCK_SIZE_V + 4 * d;
                             uint ib = coord / BLOCK_SIZE_V;
                             uint iqs = (coord % BLOCK_SIZE_V);
                             V_Tf = dequantize4(ib, iqs, v_offset, BINDING_IDX_V);
-                        } else {
-                            V_Tf = f16vec4(data_vv4[v_offset / 4 + (j * Bc + c) * v_stride / 4 + d]);
+                        } else
+#endif
+                        {
+                            V_Tf = FLOAT_TYPEV4(data_vv4[v_offset / 4 + (j * Bc + c) * v_stride / 4 + d]);
                         }
                     }
 
@@ -409,7 +424,7 @@ void main() {
         [[unroll]] for (uint32_t hsv_tile = 0; hsv_tile < num_hsv_tiles; ++hsv_tile) {
             const uint hsv_offset = (hsv_tile * row_split + gl_SubgroupID) * 16;
 
-            coopmat<float16_t, gl_ScopeSubgroup, MatBc, MatBr, gl_MatrixUseAccumulator> PVMat = coopmat<float16_t, gl_ScopeSubgroup, MatBc, MatBr, gl_MatrixUseAccumulator>(0);
+            coopmat<O_TYPE, gl_ScopeSubgroup, MatBc, MatBr, gl_MatrixUseAccumulator> PVMat = coopmat<O_TYPE, gl_ScopeSubgroup, MatBc, MatBr, gl_MatrixUseAccumulator>(0);
 
             // Preload V tiles for [Bc, 16 * num subgroups]
             const uint v_rows = Bc;
@@ -417,11 +432,11 @@ void main() {
             const uint v_loads_per_thread = v_total / gl_WorkGroupSize.x;
 
             // If SHMEM_STAGING is set, a Bc * HSV_pad size tile of V is loaded to shmem.
-            // If not, f16 V is loaded directly from global memory if aligned, otherwise
+            // If not, V is loaded directly from global memory if aligned, otherwise
             // staged through a Bc * MatBr size staging buffer.
-            // If V is not type f16, then it is always staged for dequantization.
+            // If V is a quant type, then it is always staged for dequantization.
             if (SHMEM_STAGING == 0) {
-            // For quants we always preload via kvsh. For f16 we only preload when
+            // For quants we always preload via kvsh. For f16/bf16 we only preload when
             // alignment / bounds force it (otherwise we coopMatLoad direct from data_vv4).
             const bool stage_v = USE_DECODE_V || KV_bounds_check;
             if (stage_v) {
@@ -438,13 +453,16 @@ void main() {
                     const uint iqs = coord % BLOCK_SIZE_V;
 
                     if (!KV_bounds_check || (v_row < KV && v_col < HSV)) {
+#if !defined(BFLOAT16)
                         if (USE_DECODE_V) {
                             kvsh[row * vsh_stride + col] = dequantize4(ib, iqs, v_offset, BINDING_IDX_V);
-                        } else {
+                        } else
+#endif
+                        {
                             kvsh[row * vsh_stride + col] = data_vv4[(v_offset + v_row * v_stride + v_col) / 4];
                         }
                     } else {
-                        kvsh[row * vsh_stride + col] = f16vec4(0.0f);
+                        kvsh[row * vsh_stride + col] = FLOAT_TYPEV4(0.0f);
                     }
                 }
             }
@@ -459,7 +477,7 @@ void main() {
 
                     if (SHMEM_STAGING == 0) {
                     if (!USE_DECODE_V && !KV_bounds_check) {
-                        // F16 values can be loaded directly from global memory
+                        // F16/BF16 values can be loaded directly from global memory
                         const uint v_tile_row = j * Bc + bc_chunk * MatBc;
                         const uint v_tile_offset = v_offset / 4 + v_tile_row * v_stride / 4 + hsv_offset / 4;
                         coopMatLoad(QMat, data_vv4, v_tile_offset, v_stride / 4, gl_CooperativeMatrixLayoutRowMajor);
@@ -573,7 +591,7 @@ void main() {
 
                 [[unroll]] for (uint32_t d0 = 0; d0 < HSV / 4; d0 += threads_per_rowgroup) {
                     const uint d_local = d0 / threads_per_rowgroup;
-                    Of[r][d_local] *= float16_t(ms);
+                    Of[r][d_local] *= O_TYPE(ms);
                 }
             } else {
                 vs = exp(sink - Mf[r]);
@@ -591,7 +609,7 @@ void main() {
     [[unroll]] for (uint32_t d0 = 0; d0 < HSV / 4; d0 += threads_per_rowgroup) {
         const uint d_local = d0 / threads_per_rowgroup;
         [[unroll]] for (uint32_t r = 0; r < rows_per_thread; ++r) {
-            Of[r][d_local] *= float16_t(Lfrcp[r]);
+            Of[r][d_local] *= O_TYPE(Lfrcp[r]);
 #if defined(FLOAT_TYPE_MAX)
             Of[r][d_local] = clamp(Of[r][d_local], -FLOAT_TYPE_MAX, FLOAT_TYPE_MAX);
 #endif
diff --git a/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_cm2.comp b/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_cm2.comp
index 6d45b4931df..b9c03fe499d 100644
--- a/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_cm2.comp
+++ b/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_cm2.comp
@@ -8,6 +8,10 @@
 #extension GL_EXT_shader_explicit_arithmetic_types_int32 : require
 #extension GL_EXT_shader_explicit_arithmetic_types_int16 : require
 
+#if defined(BFLOAT16)
+#extension GL_EXT_bfloat16 : enable
+#endif
+
 #extension GL_KHR_memory_scope_semantics : enable
 #extension GL_KHR_cooperative_matrix : enable
 #extension GL_NV_cooperative_matrix2 : enable
@@ -21,7 +25,9 @@
 
 #include "types.glsl"
 #include "flash_attn_base.glsl"
+#if !defined(BFLOAT16)
 #include "dequant_funcs_cm2.glsl"
+#endif
 
 // buffer_reference stride = sizeof(struct) = FaBlockBytesK/V.
 layout(buffer_reference, std430, buffer_reference_align = 1) buffer decodeBufFA_K {
@@ -31,6 +37,7 @@ layout(buffer_reference, std430, buffer_reference_align = 1) buffer decodeBufFA_
     uint8_t raw[FaBlockBytesV];
 };
 
+#if !defined(BFLOAT16)
 float16_t faDecodeK(const decodeBufFA_K bl_in, const uint blockCoords[2], const uint coordInBlock[2]) {
     switch (FaTypeK) {
         case FA_TYPE_F32:  return dequantFuncF32 (decodeBufF32 (bl_in), blockCoords, coordInBlock);
@@ -91,6 +98,7 @@ f16vec4 faDecodeVVector(const decodeBufFA_V bl_in, const uint blockCoords[2], co
 #define FADECODEK , faDecodeK
 #define FADECODEV , faDecodeV
 #endif
+#endif
 
 layout (binding = 0) readonly buffer Q {uint8_t data_q[];};
 layout (binding = 1) readonly buffer K {uint8_t data_k[];};
@@ -195,15 +203,15 @@ void main() {
     tensorLayoutV = setTensorLayoutStrideNV(tensorLayoutV, v_stride, 1);
 
     coopmat<Q_TYPE, gl_ScopeWorkgroup, Br, HSK_pad, gl_MatrixUseAccumulator> Q;
-    coopmat<float16_t, gl_ScopeWorkgroup, Br, HSK_pad, gl_MatrixUseA> Qf16;
+    coopmat<FLOAT_TYPE, gl_ScopeWorkgroup, Br, HSK_pad, gl_MatrixUseA> Qf16;
 
     uint32_t q_offset = gqa_iq1*p.nb01*4/*sizeof(float)*/ + iq2*p.nb02+iq3*p.nb03;
     coopMatLoadTensorNV(Q, data_q, q_offset, sliceTensorLayoutNV(tensorLayoutQ, i * Br, Br, 0, HSK_pad));
 
-    Qf16 = coopmat<float16_t, gl_ScopeWorkgroup, Br, HSK_pad, gl_MatrixUseA>(Q);
-    Qf16 *= float16_t(p.scale);
+    Q *= Q_TYPE(p.scale);
+    Qf16 = coopmat<FLOAT_TYPE, gl_ScopeWorkgroup, Br, HSK_pad, gl_MatrixUseA>(Q);
 
-    coopmat<float16_t, gl_ScopeWorkgroup, Br, HSV_pad, gl_MatrixUseAccumulator> O = coopmat<float16_t, gl_ScopeWorkgroup, Br, HSV_pad, gl_MatrixUseAccumulator>(0);
+    coopmat<O_TYPE, gl_ScopeWorkgroup, Br, HSV_pad, gl_MatrixUseAccumulator> O = coopmat<O_TYPE, gl_ScopeWorkgroup, Br, HSV_pad, gl_MatrixUseAccumulator>(0);
 
     coopmat<ACC_TYPE, gl_ScopeWorkgroup, Br, Bc, gl_MatrixUseAccumulator> L, M;
 
@@ -291,16 +299,20 @@ void main() {
 
         coopmat<ACC_TYPE, gl_ScopeWorkgroup, Br, Bc, gl_MatrixUseAccumulator> S = coopmat<ACC_TYPE, gl_ScopeWorkgroup, Br, Bc, gl_MatrixUseAccumulator>(0);
 
-        coopmat<float16_t, gl_ScopeWorkgroup, HSK_pad, Bc, gl_MatrixUseB> K_T;
+        coopmat<FLOAT_TYPE, gl_ScopeWorkgroup, HSK_pad, Bc, gl_MatrixUseB> K_T;
 
         uint32_t k_offset = ik2*p.nb12 + ik3*p.nb13;
         // F16: bs_k==1 (direct load). F32: bs_k==4 (vec4 / dequantFuncF32). Q4/Q8 family: bs_k==32. Q1_0: bs_k==128.
+#if defined(BFLOAT16)
+        coopMatLoadTensorNV(K_T, data_k, k_offset, sliceTensorLayoutNV(tensorLayoutK, j * Bc, Bc, 0, HSK_pad), tensorViewTranspose);
+#else
         const bool k_use_decode = (bs_k > 1u);
         if (k_use_decode) {
             coopMatLoadTensorNV(K_T, data_k, k_offset, sliceTensorLayoutNV(tensorLayoutK, j * Bc, Bc, 0, HSK_pad), tensorViewTranspose FADECODEK);
         } else {
             coopMatLoadTensorNV(K_T, data_k, k_offset, sliceTensorLayoutNV(tensorLayoutK, j * Bc, Bc, 0, HSK_pad), tensorViewTranspose);
         }
+#endif
         S = coopMatMulAdd(Qf16, K_T, S);
 
         if (LOGIT_SOFTCAP) {
@@ -351,22 +363,26 @@ void main() {
             coopMatPerElementNV(P, P, replacePadding, ACC_TYPE(0.0), R, C);
         }
 
-        coopmat<float16_t, gl_ScopeWorkgroup, Br, Bc, gl_MatrixUseA> P_A = coopmat<float16_t, gl_ScopeWorkgroup, Br, Bc, gl_MatrixUseA>(P);
+        coopmat<FLOAT_TYPE, gl_ScopeWorkgroup, Br, Bc, gl_MatrixUseA> P_A = coopmat<FLOAT_TYPE, gl_ScopeWorkgroup, Br, Bc, gl_MatrixUseA>(P);
 
         // compute rowsum by multiplying by matrix of all ones.
-        coopmat<float16_t, gl_ScopeWorkgroup, Bc, Bc, gl_MatrixUseB> One = coopmat<float16_t, gl_ScopeWorkgroup, Bc, Bc, gl_MatrixUseB>(1.0);
+        coopmat<FLOAT_TYPE, gl_ScopeWorkgroup, Bc, Bc, gl_MatrixUseB> One = coopmat<FLOAT_TYPE, gl_ScopeWorkgroup, Bc, Bc, gl_MatrixUseB>(1.0);
 
         rowsum = coopmat<ACC_TYPE, gl_ScopeWorkgroup, Br, Bc, gl_MatrixUseAccumulator>(0.0);
         rowsum = coopMatMulAdd(P_A, One, rowsum);
 
-        coopmat<float16_t, gl_ScopeWorkgroup, Bc, HSV_pad, gl_MatrixUseB> V;
+        coopmat<FLOAT_TYPE, gl_ScopeWorkgroup, Bc, HSV_pad, gl_MatrixUseB> V;
         uint32_t v_offset = iv2*p.nb22 + iv3*p.nb23;
+#if defined(BFLOAT16)
+        coopMatLoadTensorNV(V, data_v, v_offset, sliceTensorLayoutNV(tensorLayoutV, j * Bc, Bc, 0, HSV_pad));
+#else
         const bool v_use_decode = (bs_v > 1u);
         if (v_use_decode) {
             coopMatLoadTensorNV(V, data_v, v_offset, sliceTensorLayoutNV(tensorLayoutV, j * Bc, Bc, 0, HSV_pad) FADECODEV);
         } else {
             coopMatLoadTensorNV(V, data_v, v_offset, sliceTensorLayoutNV(tensorLayoutV, j * Bc, Bc, 0, HSV_pad));
         }
+#endif
 
         L = eM*L + rowsum;
 
@@ -378,7 +394,7 @@ void main() {
         // resize eM by using smear/reduce
         coopMatReduceNV(eMdiag, eM, gl_CooperativeMatrixReduceRowNV, smearReduce);
 
-        O *= coopmat<float16_t, gl_ScopeWorkgroup, Br, HSV_pad, gl_MatrixUseAccumulator>(eMdiag);
+        O *= coopmat<O_TYPE, gl_ScopeWorkgroup, Br, HSV_pad, gl_MatrixUseAccumulator>(eMdiag);
         O = coopMatMulAdd(P_A, V, O);
     }
 
@@ -427,7 +443,7 @@ void main() {
             if (sink > Mr[i]) {
                 ms = exp(Mr[i] - sink);
 
-                O[i] *= float16_t(ms);
+                O[i] *= O_TYPE(ms);
             } else {
                 vs = exp(sink - Mr[i]);
             }
diff --git a/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_dequant.glsl b/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_dequant.glsl
index 02106f33cbe..8704479d960 100644
--- a/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_dequant.glsl
+++ b/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_dequant.glsl
@@ -28,6 +28,9 @@ layout (binding = 2) readonly buffer V_PACKED_Q5_1 { block_q5_1_packed16 data[];
 layout (binding = 1) readonly buffer K_PACKED_Q8_0 { block_q8_0_packed16 data[]; } k_packed_q8_0;
 layout (binding = 2) readonly buffer V_PACKED_Q8_0 { block_q8_0_packed16 data[]; } v_packed_q8_0;
 
+layout (binding = 1) readonly buffer K_PACKED_BF16 { u16vec4 data[]; } k_packed_bf16;
+layout (binding = 2) readonly buffer V_PACKED_BF16 { u16vec4 data[]; } v_packed_bf16;
+
 // Q4_1 and Q5_1 packed32 views: aliased to the same memory as the packed16
 // views, used by the MMQ K-side hot path for fast 4-uint loads.
 layout (binding = 1) readonly buffer K_PACKED_Q4_1_P32 { block_q4_1_packed32 data[]; } k_packed_q4_1_p32;
@@ -99,6 +102,9 @@ layout (binding = 1) readonly buffer K_PACKED_Q5_1_P32 { block_q5_1_packed32 dat
     return FLOAT_TYPE(BUF.data[a_offset + ib].d) * FLOAT_TYPEV4(v0.x, v0.y, v1.x, v1.y);          \
 }
 
+#define FA_DEQUANT4_BF16(BUF) \
+    return FLOAT_TYPEV4(bf16_to_fp32(uvec4(BUF.data[(a_offset + ib) / 4])));
+
 FLOAT_TYPEV4 dequantize4(uint ib, uint iqs, uint a_offset, uint binding_idx) {
     if (binding_idx == BINDING_IDX_K) {
         switch (FaTypeK) {
@@ -108,6 +114,7 @@ FLOAT_TYPEV4 dequantize4(uint ib, uint iqs, uint a_offset, uint binding_idx) {
             case FA_TYPE_Q5_0: FA_DEQUANT4_Q5_0(k_packed_q5_0)
             case FA_TYPE_Q5_1: FA_DEQUANT4_Q5_1(k_packed_q5_1)
             case FA_TYPE_Q8_0: FA_DEQUANT4_Q8_0(k_packed_q8_0)
+            case FA_TYPE_BF16: FA_DEQUANT4_BF16(k_packed_bf16)
         }
     } else {
         switch (FaTypeV) {
@@ -117,6 +124,7 @@ FLOAT_TYPEV4 dequantize4(uint ib, uint iqs, uint a_offset, uint binding_idx) {
             case FA_TYPE_Q5_0: FA_DEQUANT4_Q5_0(v_packed_q5_0)
             case FA_TYPE_Q5_1: FA_DEQUANT4_Q5_1(v_packed_q5_1)
             case FA_TYPE_Q8_0: FA_DEQUANT4_Q8_0(v_packed_q8_0)
+            case FA_TYPE_BF16: FA_DEQUANT4_BF16(v_packed_bf16)
         }
     }
     return FLOAT_TYPEV4(0);
diff --git a/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp b/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp
index fa9b938e4f7..de7dbec2c63 100644
--- a/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp
+++ b/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp
@@ -662,6 +662,28 @@ void process_shaders() {
         }
     }
 
+    const std::map<std::string, std::string> fa_bf16_dict = {
+        {"FLOAT_TYPE",   "bfloat16_t"},
+        {"FLOAT_TYPEV2", "bf16vec2"},
+        {"FLOAT_TYPEV4", "bf16vec4"},
+        {"ACC_TYPE",     "float"},
+        {"ACC_TYPEV2",   "vec2"},
+        {"ACC_TYPEV4",   "vec4"},
+        {"BFLOAT16",     "1"},
+    };
+
+#if defined(GGML_VULKAN_BFLOAT16_GLSLC_SUPPORT) && defined(GGML_VULKAN_COOPMAT_GLSLC_SUPPORT)
+    string_to_spv("flash_attn_f32_f16_bf16", "flash_attn_cm1.comp",
+        merge_maps(fa_bf16_dict, {{"Q_TYPE", "float"}, {"D_TYPE", "float"}, {"D_TYPEV4", "vec4"}, {"COOPMAT", "1"}}),
+        true, true, false, false);
+#endif
+
+#if defined(GGML_VULKAN_BFLOAT16_GLSLC_SUPPORT) && defined(GGML_VULKAN_COOPMAT2_GLSLC_SUPPORT)
+    string_to_spv("flash_attn_f32_f16_bf16", "flash_attn_cm2.comp",
+        merge_maps(fa_bf16_dict, {{"Q_TYPE", "float"}, {"D_TYPE", "float"}, {"D_TYPEV4", "vec4"}}),
+        true, false, true, false);
+#endif
+
     std::map<std::string, std::string> base_dict = {{"FLOAT_TYPE", "float"}, {"FLOAT_TYPEV2", "vec2"}};
 
     for (const auto& tname : type_names) {

From d48a56effbba99944a677938d4beb63a0065ecdf Mon Sep 17 00:00:00 2001
From: Jinyang He <hejinyang@loongson.cn>
Date: Sat, 30 May 2026 16:53:26 +0800
Subject: [PATCH 46/50] ggml : add some lsx support (#23798)

* loongarch : optimize LSX fp16 load/store with native intrinsics

Use __lsx_vfcvtl_s_h and __lsx_vfcvt_h_s instead of scalar loops in
__lsx_f16x4_load and __lsx_f16x4_store.

* loongarch : add LSX implementation for q8_0 dot product

* loongarch : add LSX implementation for q6_K dot product

* loongarch : add LSX implementation for iq4_xs dot product

* Improve reduce ops when summing int16 pairs to int32
---
 ggml/src/ggml-cpu/arch/loongarch/quants.c | 151 ++++++++++++++++++++++
 ggml/src/ggml-cpu/simd-mappings.h         |  19 +--
 2 files changed, 154 insertions(+), 16 deletions(-)

diff --git a/ggml/src/ggml-cpu/arch/loongarch/quants.c b/ggml/src/ggml-cpu/arch/loongarch/quants.c
index 74e0c086c6d..9c43da6cf89 100644
--- a/ggml/src/ggml-cpu/arch/loongarch/quants.c
+++ b/ggml/src/ggml-cpu/arch/loongarch/quants.c
@@ -977,6 +977,35 @@ void ggml_vec_dot_q8_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const voi
     sumf = hsum_float_8(acc);
 
     *s = sumf;
+
+#elif defined(__loongarch_sx)
+
+    __m128 acc = (__m128)__lsx_vldi(0);
+
+    for (; ib < nb; ++ib) {
+        const float d = GGML_CPU_FP16_TO_FP32(x[ib].d) * GGML_CPU_FP16_TO_FP32(y[ib].d);
+        const __m128i qx_0 = __lsx_vld((const __m128i *)x[ib].qs, 0);
+        const __m128i qx_1 = __lsx_vld((const __m128i *)x[ib].qs + 1, 0);
+        const __m128i qy_0 = __lsx_vld((const __m128i *)y[ib].qs, 0);
+        const __m128i qy_1 = __lsx_vld((const __m128i *)y[ib].qs + 1, 0);
+
+        const __m128i p16_0 = lsx_maddubs_h(qx_0, qy_0);
+        const __m128i p16_1 = lsx_maddubs_h(qx_1, qy_1);
+
+        // Sum int16 pairs → int32
+        const __m128i s_0 = __lsx_vaddwev_w_h(p16_0, p16_1);
+        const __m128i s_1 = __lsx_vaddwod_w_h(p16_0, p16_1);
+
+        const __m128 q = __lsx_vffint_s_w(__lsx_vadd_w(s_0, s_1));
+        acc = __lsx_vfmadd_s(__lsx_vreplfr2vr_s(d), q, acc);
+    }
+
+    __m128 res = lsx_hadd_s(acc, acc);
+    res = lsx_hadd_s(res, res);
+    sumf = ((v4f32)res)[0];
+
+    *s = sumf;
+
 #else
     UNUSED(nb);
     UNUSED(ib);
@@ -1443,6 +1472,99 @@ void ggml_vec_dot_q6_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi
 
     *s = hsum_float_8(acc);
 
+#elif defined(__loongarch_sx)
+
+    const __m128i m32s = __lsx_vreplgr2vr_b(32);
+
+    __m128 acc_0 = (__m128)__lsx_vldi(0);
+    __m128 acc_1 = (__m128)__lsx_vldi(0);
+
+    for (int i = 0; i < nb; ++i) {
+
+        const float d = y[i].d * GGML_CPU_FP16_TO_FP32(x[i].d);
+
+        const uint8_t * GGML_RESTRICT q4 = x[i].ql;
+        const uint8_t * GGML_RESTRICT qh = x[i].qh;
+        const int8_t  * GGML_RESTRICT q8 = y[i].qs;
+
+        const __m128i scale_i8 = __lsx_vld(x[i].scales, 0);
+        const __m128i scales_lo = __lsx_vsllwil_h_b(scale_i8, 0);
+        const __m128i scales_hi = __lsx_vsllwil_h_b(__lsx_vbsrl_v(scale_i8, 8), 0);
+
+        __m128i sumi_0 = __lsx_vldi(0);
+        __m128i sumi_1 = __lsx_vldi(0);
+
+        for (int j = 0; j < QK_K/128; ++j) {
+
+            const __m128i q4bitsH_0 = __lsx_vld((const __m128i*)qh, 0); qh += 16;
+            const __m128i q4bitsH_1 = __lsx_vld((const __m128i*)qh, 0); qh += 16;
+
+            const __m128i q4h_0 = __lsx_vslli_b(__lsx_vandi_b(q4bitsH_0, 3), 4);
+            const __m128i q4h_1 = __lsx_vslli_b(__lsx_vandi_b(q4bitsH_1, 3), 4);
+            const __m128i q4h_2 = __lsx_vslli_b(__lsx_vandi_b(q4bitsH_0, 3 << 2), 2);
+            const __m128i q4h_3 = __lsx_vslli_b(__lsx_vandi_b(q4bitsH_1, 3 << 2), 2);
+            const __m128i q4h_4 = __lsx_vandi_b(q4bitsH_0, 3 << 4);
+            const __m128i q4h_5 = __lsx_vandi_b(q4bitsH_1, 3 << 4);
+            const __m128i q4h_6 = __lsx_vsrli_b(__lsx_vandi_b(q4bitsH_0, 3 << 6), 2);
+            const __m128i q4h_7 = __lsx_vsrli_b(__lsx_vandi_b(q4bitsH_1, 3 << 6), 2);
+
+            const __m128i q4bits1_0 = __lsx_vld((const __m128i*)q4, 0); q4 += 16;
+            const __m128i q4bits1_1 = __lsx_vld((const __m128i*)q4, 0); q4 += 16;
+            const __m128i q4bits2_0 = __lsx_vld((const __m128i*)q4, 0); q4 += 16;
+            const __m128i q4bits2_1 = __lsx_vld((const __m128i*)q4, 0); q4 += 16;
+
+            const __m128i q4_0 = __lsx_vor_v(__lsx_vandi_b(q4bits1_0, 0xf), q4h_0);
+            const __m128i q4_1 = __lsx_vor_v(__lsx_vandi_b(q4bits1_1, 0xf), q4h_1);
+            const __m128i q4_2 = __lsx_vor_v(__lsx_vandi_b(q4bits2_0, 0xf), q4h_2);
+            const __m128i q4_3 = __lsx_vor_v(__lsx_vandi_b(q4bits2_1, 0xf), q4h_3);
+            const __m128i q4_4 = __lsx_vor_v(__lsx_vsrli_b(q4bits1_0, 4), q4h_4);
+            const __m128i q4_5 = __lsx_vor_v(__lsx_vsrli_b(q4bits1_1, 4), q4h_5);
+            const __m128i q4_6 = __lsx_vor_v(__lsx_vsrli_b(q4bits2_0, 4), q4h_6);
+            const __m128i q4_7 = __lsx_vor_v(__lsx_vsrli_b(q4bits2_1, 4), q4h_7);
+
+            const __m128i q8_0 = __lsx_vld((const __m128i*)q8, 0); q8 += 16;
+            const __m128i q8_1 = __lsx_vld((const __m128i*)q8, 0); q8 += 16;
+            const __m128i q8_2 = __lsx_vld((const __m128i*)q8, 0); q8 += 16;
+            const __m128i q8_3 = __lsx_vld((const __m128i*)q8, 0); q8 += 16;
+            const __m128i q8_4 = __lsx_vld((const __m128i*)q8, 0); q8 += 16;
+            const __m128i q8_5 = __lsx_vld((const __m128i*)q8, 0); q8 += 16;
+            const __m128i q8_6 = __lsx_vld((const __m128i*)q8, 0); q8 += 16;
+            const __m128i q8_7 = __lsx_vld((const __m128i*)q8, 0); q8 += 16;
+
+            __m128i p16_0 = lsx_maddubs_h(__lsx_vsub_b(q4_0, m32s), q8_0);
+            __m128i p16_1 = lsx_maddubs_h(__lsx_vsub_b(q4_1, m32s), q8_1);
+            __m128i p16_2 = lsx_maddubs_h(__lsx_vsub_b(q4_2, m32s), q8_2);
+            __m128i p16_3 = lsx_maddubs_h(__lsx_vsub_b(q4_3, m32s), q8_3);
+            __m128i p16_4 = lsx_maddubs_h(__lsx_vsub_b(q4_4, m32s), q8_4);
+            __m128i p16_5 = lsx_maddubs_h(__lsx_vsub_b(q4_5, m32s), q8_5);
+            __m128i p16_6 = lsx_maddubs_h(__lsx_vsub_b(q4_6, m32s), q8_6);
+            __m128i p16_7 = lsx_maddubs_h(__lsx_vsub_b(q4_7, m32s), q8_7);
+
+            const __m128i sc_vec = j == 0 ? scales_lo : scales_hi;
+
+            p16_0 = lsx_madd_h(__lsx_vreplvei_h(sc_vec, 0), p16_0);
+            p16_1 = lsx_madd_h(__lsx_vreplvei_h(sc_vec, 1), p16_1);
+            p16_2 = lsx_madd_h(__lsx_vreplvei_h(sc_vec, 2), p16_2);
+            p16_3 = lsx_madd_h(__lsx_vreplvei_h(sc_vec, 3), p16_3);
+            p16_4 = lsx_madd_h(__lsx_vreplvei_h(sc_vec, 4), p16_4);
+            p16_5 = lsx_madd_h(__lsx_vreplvei_h(sc_vec, 5), p16_5);
+            p16_6 = lsx_madd_h(__lsx_vreplvei_h(sc_vec, 6), p16_6);
+            p16_7 = lsx_madd_h(__lsx_vreplvei_h(sc_vec, 7), p16_7);
+
+            sumi_0 = __lsx_vadd_w(sumi_0, __lsx_vadd_w(p16_0, p16_2));
+            sumi_1 = __lsx_vadd_w(sumi_1, __lsx_vadd_w(p16_1, p16_3));
+            sumi_0 = __lsx_vadd_w(sumi_0, __lsx_vadd_w(p16_4, p16_6));
+            sumi_1 = __lsx_vadd_w(sumi_1, __lsx_vadd_w(p16_5, p16_7));
+        }
+
+        __m128 p_0 = __lsx_vfmul_s(__lsx_vreplfr2vr_s(d), __lsx_vffint_s_w(sumi_0));
+        __m128 p_1 = __lsx_vfmul_s(__lsx_vreplfr2vr_s(d), __lsx_vffint_s_w(sumi_1));
+        acc_0 = __lsx_vfadd_s(p_0, acc_0);
+        acc_1 = __lsx_vfadd_s(p_1, acc_1);
+    }
+
+    *s = hsum_float_4x4(acc_0, acc_1, (__m128)__lsx_vldi(0), (__m128)__lsx_vldi(0));
+
 #else
     UNUSED(x);
     UNUSED(y);
@@ -2149,6 +2271,35 @@ void ggml_vec_dot_iq4_xs_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const v
 
     *s = hsum_float_8(accum);
 
+#elif defined(__loongarch_sx)
+
+    const __m128i values128 = __lsx_vld((const __m128i*)kvalues_iq4nl, 0);
+
+    __m128 accum = (__m128)__lsx_vldi(0);
+    for (int ibl = 0; ibl < nb; ++ibl) {
+        const uint8_t * qs = x[ibl].qs;
+        const int8_t  * q8 = y[ibl].qs;
+        uint16_t sh = x[ibl].scales_h;
+        __m128i sumi = __lsx_vldi(0);
+        for (int ib = 0; ib < QK_K/32; ++ib) {
+            const __m128i q4bits = __lsx_vld((const __m128i*)qs, 0); qs += 16;
+            const __m128i q8b_0 = __lsx_vld((const __m128i*)q8, 0); q8 += 16;
+            const __m128i q8b_1 = __lsx_vld((const __m128i*)q8, 0); q8 += 16;
+            const __m128i q4b_0 = __lsx_vshuf_b(values128, values128, __lsx_vandi_b(q4bits, 0xf));
+            const __m128i q4b_1 = __lsx_vshuf_b(values128, values128, __lsx_vsrli_b(q4bits, 4));
+            const __m128i p16_0 = lsx_maddubs_h(q4b_0, q8b_0);
+            const __m128i p16_1 = lsx_maddubs_h(q4b_1, q8b_1);
+            const int16_t ls = (((x[ibl].scales_l[ib/2] >> ((ib & 1) * 4)) & 0xf) | ((sh & 0x3) << 4)) - 32;
+            sh >>= 2;
+            sumi = __lsx_vadd_w(lsx_madd_h(p16_0, __lsx_vreplgr2vr_h(ls)), sumi);
+            sumi = __lsx_vadd_w(lsx_madd_h(p16_1, __lsx_vreplgr2vr_h(ls)), sumi);
+        }
+        const float ds = GGML_CPU_FP16_TO_FP32(x[ibl].d) * y[ibl].d;
+        accum = __lsx_vfadd_s(__lsx_vfmul_s(__lsx_vreplfr2vr_s(ds), __lsx_vffint_s_w(sumi)), accum);
+    }
+
+    *s = ((v4f32)lsx_hadd_s(lsx_hadd_s(accum, accum), lsx_hadd_s(accum, accum)))[0];
+
 #else
     UNUSED(x);
     UNUSED(y);
diff --git a/ggml/src/ggml-cpu/simd-mappings.h b/ggml/src/ggml-cpu/simd-mappings.h
index 0deda930985..62e687201ef 100644
--- a/ggml/src/ggml-cpu/simd-mappings.h
+++ b/ggml/src/ggml-cpu/simd-mappings.h
@@ -1125,25 +1125,12 @@ static inline void __lasx_f32cx8_store(ggml_fp16_t * x, __m256 y) {
 #define GGML_F16_EPR  4
 
 static inline __m128 __lsx_f16x4_load(const ggml_fp16_t * x) {
-    float tmp[4];
-
-    tmp[0] = GGML_CPU_FP16_TO_FP32(x[0]);
-    tmp[1] = GGML_CPU_FP16_TO_FP32(x[1]);
-    tmp[2] = GGML_CPU_FP16_TO_FP32(x[2]);
-    tmp[3] = GGML_CPU_FP16_TO_FP32(x[3]);
-
-    return (__m128)__lsx_vld(tmp, 0);
+    return __lsx_vfcvtl_s_h(__lsx_vldrepl_d((const void *)x, 0)); // 64-bit load: reads only x[0..3] (a 128-bit vld would also touch x[4..7]); the low four halfs are then widened to f32
 }
 
 static inline void __lsx_f16x4_store(ggml_fp16_t * x, __m128 y) {
-    float arr[4];
-
-    __lsx_vst(y, arr, 0);
-
-    x[0] = GGML_CPU_FP32_TO_FP16(arr[0]);
-    x[1] = GGML_CPU_FP32_TO_FP16(arr[1]);
-    x[2] = GGML_CPU_FP32_TO_FP16(arr[2]);
-    x[3] = GGML_CPU_FP32_TO_FP16(arr[3]);
+    __m128i a = __lsx_vfcvt_h_s(y, y); // f32 -> f16 conversion of y; y is supplied for both source operands
+    memcpy(x, &a, sizeof(ggml_fp16_t) * 4); // copy out exactly four f16 values so nothing past x[3] is written
 }
 
 #define GGML_F32Cx4             __m128

From 4c4e91b799c206fddaa56d89f0b4e61f6a263a4e Mon Sep 17 00:00:00 2001
From: Georgi Gerganov <ggerganov@gmail.com>
Date: Sat, 30 May 2026 13:21:46 +0300
Subject: [PATCH 47/50] ci : update ios-xcode release job to macos-26 (#23906)

* ci : disable libcommon build from xcframework

* ocd : fix name

* ci : ios-xcode change to macos-26

* cont : pin xcode

* cont : pin xcode to minor version
---
 .github/workflows/release.yml | 58 +++++++++++++++++------------------
 build-xcframework.sh          |  2 ++
 2 files changed, 30 insertions(+), 30 deletions(-)

diff --git a/.github/workflows/release.yml b/.github/workflows/release.yml
index 9ca34ad8645..a1642fc2229 100644
--- a/.github/workflows/release.yml
+++ b/.github/workflows/release.yml
@@ -38,7 +38,7 @@ concurrency:
   queue: max
 
 jobs:
-  check_release:
+  check-release:
     runs-on: ubuntu-slim
 
     outputs:
@@ -60,8 +60,8 @@ jobs:
           fi
 
   macos-cpu:
-    needs: [check_release]
-    if: ${{ needs.check_release.outputs.should_release == 'true' }}
+    needs: [check-release]
+    if: ${{ needs.check-release.outputs.should_release == 'true' }}
     strategy:
       matrix:
         include:
@@ -141,8 +141,8 @@ jobs:
           name: llama-bin-macos-${{ matrix.build }}.tar.gz
 
   ubuntu-cpu:
-    needs: [check_release]
-    if: ${{ needs.check_release.outputs.should_release == 'true' }}
+    needs: [check-release]
+    if: ${{ needs.check-release.outputs.should_release == 'true' }}
     strategy:
       matrix:
         include:
@@ -227,8 +227,8 @@ jobs:
           name: llama-bin-ubuntu-${{ matrix.build }}.tar.gz
 
   ubuntu-vulkan:
-    needs: [check_release]
-    if: ${{ needs.check_release.outputs.should_release == 'true' }}
+    needs: [check-release]
+    if: ${{ needs.check-release.outputs.should_release == 'true' }}
 
     strategy:
       matrix:
@@ -312,8 +312,8 @@ jobs:
           name: llama-bin-ubuntu-vulkan-${{ matrix.build }}.tar.gz
 
   android-arm64:
-    needs: [check_release]
-    if: ${{ needs.check_release.outputs.should_release == 'true' }}
+    needs: [check-release]
+    if: ${{ needs.check-release.outputs.should_release == 'true' }}
 
     runs-on: ubuntu-latest
 
@@ -404,8 +404,8 @@ jobs:
           name: llama-bin-android-arm64.tar.gz
 
   ubuntu-24-openvino:
-    needs: [check_release]
-    if: ${{ needs.check_release.outputs.should_release == 'true' }}
+    needs: [check-release]
+    if: ${{ needs.check-release.outputs.should_release == 'true' }}
 
     runs-on: ubuntu-24.04
 
@@ -501,8 +501,8 @@ jobs:
           name: llama-bin-ubuntu-openvino-${{ env.OPENVINO_VERSION_MAJOR }}-x64.tar.gz
 
   windows-cpu:
-    needs: [check_release]
-    if: ${{ needs.check_release.outputs.should_release == 'true' }}
+    needs: [check-release]
+    if: ${{ needs.check-release.outputs.should_release == 'true' }}
 
     runs-on: windows-2025
 
@@ -569,8 +569,8 @@ jobs:
           name: llama-bin-win-cpu-${{ matrix.arch }}.zip
 
   windows:
-    needs: [check_release]
-    if: ${{ needs.check_release.outputs.should_release == 'true' }}
+    needs: [check-release]
+    if: ${{ needs.check-release.outputs.should_release == 'true' }}
 
     runs-on: windows-2025
 
@@ -667,8 +667,8 @@ jobs:
           name: llama-bin-win-${{ matrix.backend }}-${{ matrix.arch }}.zip
 
   windows-cuda:
-    needs: [check_release]
-    if: ${{ needs.check_release.outputs.should_release == 'true' }}
+    needs: [check-release]
+    if: ${{ needs.check-release.outputs.should_release == 'true' }}
 
     runs-on: windows-2022
 
@@ -959,8 +959,8 @@ jobs:
 #          name: llama-bin-ubuntu-sycl-${{ matrix.build }}-x64.tar.gz
 
   ubuntu-22-rocm:
-    needs: [check_release]
-    if: ${{ needs.check_release.outputs.should_release == 'true' }}
+    needs: [check-release]
+    if: ${{ needs.check-release.outputs.should_release == 'true' }}
 
     runs-on: ubuntu-22.04
 
@@ -1079,8 +1079,8 @@ jobs:
           name: llama-bin-ubuntu-rocm-${{ env.ROCM_VERSION_SHORT }}-${{ matrix.build }}.tar.gz
 
   windows-hip:
-    needs: [check_release]
-    if: ${{ needs.check_release.outputs.should_release == 'true' }}
+    needs: [check-release]
+    if: ${{ needs.check-release.outputs.should_release == 'true' }}
 
     runs-on: windows-2022
 
@@ -1202,11 +1202,9 @@ jobs:
           name: llama-bin-win-hip-${{ matrix.name }}-x64.zip
 
   ios-xcode:
-    needs: [check_release]
-    if: ${{ needs.check_release.outputs.should_release == 'true' }}
-    # TODO: figure out how to make this work with macos-26
-    #       https://github.com/ggml-org/llama.cpp/actions/runs/26652714555/job/78604869474
-    runs-on: macos-15
+    needs: [check-release]
+    if: ${{ needs.check-release.outputs.should_release == 'true' }}
+    runs-on: macos-26
 
     steps:
       - name: Checkout code
@@ -1216,7 +1214,7 @@ jobs:
 
       - name: Setup Xcode
         run: |
-          sudo xcode-select -s /Applications/Xcode_16.4.app
+          sudo xcode-select -s /Applications/Xcode_26.4.app
 
       - name: Build
         id: cmake_build
@@ -1232,7 +1230,7 @@ jobs:
             -DLLAMA_BUILD_TESTS=OFF \
             -DLLAMA_BUILD_SERVER=OFF \
             -DCMAKE_SYSTEM_NAME=iOS \
-            -DCMAKE_OSX_DEPLOYMENT_TARGET=14.0 \
+            -DCMAKE_OSX_DEPLOYMENT_TARGET=16.0 \
             -DCMAKE_XCODE_ATTRIBUTE_DEVELOPMENT_TEAM=ggml
           cmake --build build --config Release -j $(sysctl -n hw.logicalcpu) -- CODE_SIGNING_ALLOWED=NO
 
@@ -1354,8 +1352,8 @@ jobs:
 #          name: llama-bin-${{ matrix.chip_type }}-openEuler-${{ matrix.arch }}${{ matrix.use_acl_graph == 'on' && '-aclgraph' || '' }}.tar.gz
 
   ui:
-    needs: [check_release]
-    if: ${{ needs.check_release.outputs.should_release == 'true' }}
+    needs: [check-release]
+    if: ${{ needs.check-release.outputs.should_release == 'true' }}
     uses: ./.github/workflows/ui-build.yml
 
   release:
diff --git a/build-xcframework.sh b/build-xcframework.sh
index 1da7b9bda9b..5d289922a84 100755
--- a/build-xcframework.sh
+++ b/build-xcframework.sh
@@ -8,6 +8,7 @@ TVOS_MIN_OS_VERSION=16.4
 
 BUILD_SHARED_LIBS=OFF
 LLAMA_BUILD_APP=OFF
+LLAMA_BUILD_COMMON=OFF
 LLAMA_BUILD_EXAMPLES=OFF
 LLAMA_BUILD_TOOLS=OFF
 LLAMA_BUILD_TESTS=OFF
@@ -33,6 +34,7 @@ COMMON_CMAKE_ARGS=(
     -DCMAKE_XCODE_ATTRIBUTE_DEVELOPMENT_TEAM=ggml
     -DBUILD_SHARED_LIBS=${BUILD_SHARED_LIBS}
     -DLLAMA_BUILD_APP=${LLAMA_BUILD_APP}
+    -DLLAMA_BUILD_COMMON=${LLAMA_BUILD_COMMON}
     -DLLAMA_BUILD_EXAMPLES=${LLAMA_BUILD_EXAMPLES}
     -DLLAMA_BUILD_TOOLS=${LLAMA_BUILD_TOOLS}
     -DLLAMA_BUILD_TESTS=${LLAMA_BUILD_TESTS}

From e674b1279bc9170c870266a11fc2905ef8c8487c Mon Sep 17 00:00:00 2001
From: Xuan-Son Nguyen <son@huggingface.co>
Date: Sat, 30 May 2026 12:22:38 +0200
Subject: [PATCH 48/50] test: (test-llama-archs) log the config name first
 (#23885)

---
 tests/test-llama-archs.cpp | 15 +++++++++++----
 1 file changed, 11 insertions(+), 4 deletions(-)

diff --git a/tests/test-llama-archs.cpp b/tests/test-llama-archs.cpp
index 3714eaedb0b..1def7faff60 100644
--- a/tests/test-llama-archs.cpp
+++ b/tests/test-llama-archs.cpp
@@ -526,8 +526,9 @@ static int test_backends(const llm_arch target_arch, const size_t seed, const gg
         max_arch_name_length = std::max(max_arch_name_length, strlen(llm_arch_name(arch)));
     }
 
-    const std::string template_header = std::string("|%" + std::to_string(max_arch_name_length) + "s|%") + std::to_string(max_device_label_length) + "s|%6s|%15s|%9s|\n";
-    const std::string template_row    = std::string("|%" + std::to_string(max_arch_name_length) + "s|%") + std::to_string(max_device_label_length) + "s|%6s|%15s %10s|%20s|\n";
+    const std::string template_header  = std::string("|%" + std::to_string(max_arch_name_length) + "s|%") + std::to_string(max_device_label_length) + "s|%6s|%15s|%9s|\n";
+    const std::string template_row_cfg = std::string("|%" + std::to_string(max_arch_name_length) + "s|%") + std::to_string(max_device_label_length) + "s|%6s|";
+    const std::string template_row_res = "%15s %10s|%20s|\n";
 
     bool all_ok = true;
     common_log_flush(common_log_main());
@@ -565,6 +566,11 @@ static int test_backends(const llm_arch target_arch, const size_t seed, const gg
             std::pair<llama_model_ptr, llama_context_ptr> model_and_ctx_cpu;
             std::vector<float> logits_cpu;
             for (device_config & dc : dev_configs) {
+                // print test config first; should anything fail during model loading or inference, at least we know which test case caused it
+                printf(template_row_cfg.c_str(),
+                    llm_arch_name(arch), dc.label.c_str(), config_name.c_str());
+                fflush(stdout);
+
                 std::pair<llama_model_ptr, llama_context_ptr> model_and_ctx_dev;
                 std::vector<float> logits_dev;
                 std::string status_nmse      = "\033[1;33mSKIP\033[0m";
@@ -617,8 +623,9 @@ static int test_backends(const llm_arch target_arch, const size_t seed, const gg
                     }
                 }
 
-                printf(template_row.c_str(), llm_arch_name(arch), dc.label.c_str(),
-                    config_name.c_str(), status_nmse.c_str(), nmse_str, status_roundtrip.c_str());
+                // log the results for this test case
+                printf(template_row_res.c_str(),
+                    status_nmse.c_str(), nmse_str, status_roundtrip.c_str());
             }
         }
     }

From 2d9b7c8e98e1fc0b768f46ec176756a47738e460 Mon Sep 17 00:00:00 2001
From: Georgi Gerganov <ggerganov@gmail.com>
Date: Sat, 30 May 2026 15:26:13 +0300
Subject: [PATCH 49/50] metal : restore im2col implementation for large kernels
 (#23901)

---
 ggml/src/ggml-metal/ggml-metal-device.cpp |   8 +-
 ggml/src/ggml-metal/ggml-metal-ops.cpp    |  24 +++--
 ggml/src/ggml-metal/ggml-metal.metal      | 106 +++++++++++-----------
 tests/test-backend-ops.cpp                |   2 +
 4 files changed, 79 insertions(+), 61 deletions(-)

diff --git a/ggml/src/ggml-metal/ggml-metal-device.cpp b/ggml/src/ggml-metal/ggml-metal-device.cpp
index ba006d9b31a..5d4b10d34b9 100644
--- a/ggml/src/ggml-metal/ggml-metal-device.cpp
+++ b/ggml/src/ggml-metal/ggml-metal-device.cpp
@@ -1732,6 +1732,8 @@ ggml_metal_pipeline_with_params ggml_metal_library_get_pipeline_rope(ggml_metal_
 ggml_metal_pipeline_with_params ggml_metal_library_get_pipeline_im2col(ggml_metal_library_t lib, const ggml_tensor * op) {
     assert(op->op == GGML_OP_IM2COL);
 
+    GGML_TENSOR_LOCALS(int64_t, ne0, op->src[0], ne);
+
     GGML_ASSERT(ggml_is_contiguous(op->src[1]));
     GGML_ASSERT(op->src[1]->type == GGML_TYPE_F32);
     GGML_ASSERT(op->type         == GGML_TYPE_F16 || op->type == GGML_TYPE_F32);
@@ -1739,7 +1741,11 @@ ggml_metal_pipeline_with_params ggml_metal_library_get_pipeline_im2col(ggml_meta
     char base[256];
     char name[256];
 
-    snprintf(base, 256, "kernel_im2col_%s", ggml_type_name(op->type));
+    if (ne00*ne01 <= 1024) {
+        snprintf(base, 256, "kernel_im2col_%s", ggml_type_name(op->type));
+    } else {
+        snprintf(base, 256, "kernel_im2col_ext_%s", ggml_type_name(op->type));
+    }
     snprintf(name, 256, "%s", base);
 
     ggml_metal_pipeline_with_params res = ggml_metal_library_get_pipeline(lib, name);
diff --git a/ggml/src/ggml-metal/ggml-metal-ops.cpp b/ggml/src/ggml-metal/ggml-metal-ops.cpp
index 206af227a2c..e2ce56e9e28 100644
--- a/ggml/src/ggml-metal/ggml-metal-ops.cpp
+++ b/ggml/src/ggml-metal/ggml-metal-ops.cpp
@@ -3635,16 +3635,26 @@ int ggml_metal_op_im2col(ggml_metal_op_t ctx, int idx) {
 
     auto pipeline = ggml_metal_library_get_pipeline_im2col(lib, op);
 
-    GGML_ASSERT(KH*KW <= ggml_metal_pipeline_max_theads_per_threadgroup(pipeline));
+    if (KH*KW <= ggml_metal_pipeline_max_theads_per_threadgroup(pipeline)) {
+        const uint64_t ntptg0 = std::min(ggml_metal_pipeline_max_theads_per_threadgroup(pipeline)/(KH*KW), N);
 
-    const uint64_t ntptg0 = std::min(ggml_metal_pipeline_max_theads_per_threadgroup(pipeline)/(KH*KW), N);
+        ggml_metal_encoder_set_pipeline(enc, pipeline);
+        ggml_metal_encoder_set_bytes   (enc, &args, sizeof(args), 0);
+        ggml_metal_encoder_set_buffer  (enc, ggml_metal_get_buffer_id(op->src[1]), 1);
+        ggml_metal_encoder_set_buffer  (enc, ggml_metal_get_buffer_id(op),         2);
 
-    ggml_metal_encoder_set_pipeline(enc, pipeline);
-    ggml_metal_encoder_set_bytes   (enc, &args, sizeof(args), 0);
-    ggml_metal_encoder_set_buffer  (enc, ggml_metal_get_buffer_id(op->src[1]), 1);
-    ggml_metal_encoder_set_buffer  (enc, ggml_metal_get_buffer_id(op),         2);
+        ggml_metal_encoder_dispatch_threadgroups(enc, IC, OH, OW, ntptg0, KH, KW);
+    } else {
+        const uint64_t n_threads = std::min(ggml_metal_pipeline_max_theads_per_threadgroup(pipeline), N);
+        const int64_t  quotient  = N / n_threads + (N % n_threads > 0 ? 1 : 0);
+
+        ggml_metal_encoder_set_pipeline(enc, pipeline);
+        ggml_metal_encoder_set_bytes   (enc, &args, sizeof(args), 0);
+        ggml_metal_encoder_set_buffer  (enc, ggml_metal_get_buffer_id(op->src[1]), 1);
+        ggml_metal_encoder_set_buffer  (enc, ggml_metal_get_buffer_id(op),         2);
 
-    ggml_metal_encoder_dispatch_threadgroups(enc, IC, OH, OW, ntptg0, KH, KW);
+        ggml_metal_encoder_dispatch_threadgroups(enc, quotient * CHW, OH, OW, n_threads, 1, 1);
+    }
 
     return 1;
 }
diff --git a/ggml/src/ggml-metal/ggml-metal.metal b/ggml/src/ggml-metal/ggml-metal.metal
index e772664ba91..4adf4614acb 100644
--- a/ggml/src/ggml-metal/ggml-metal.metal
+++ b/ggml/src/ggml-metal/ggml-metal.metal
@@ -4696,59 +4696,59 @@ kernel void kernel_im2col(
 template [[host_name("kernel_im2col_f32")]] kernel im2col_t kernel_im2col<float>;
 template [[host_name("kernel_im2col_f16")]] kernel im2col_t kernel_im2col<half>;
 
-// TODO: obsolete -- remove
-//typedef void (im2col_ext_t)(
-//        constant ggml_metal_kargs_im2col & args,
-//        device const float * x,
-//        device        char * dst,
-//        uint3 tgpig[[threadgroup_position_in_grid]],
-//        uint3  tgpg[[threadgroups_per_grid]],
-//        uint3 tpitg[[thread_position_in_threadgroup]],
-//        uint3   ntg[[threads_per_threadgroup]]);
-//
-//template <typename T>
-//kernel void kernel_im2col_ext(
-//        constant ggml_metal_kargs_im2col & args,
-//        device const float * x,
-//        device        char * dst,
-//        uint3 tgpig[[threadgroup_position_in_grid]],
-//        uint3  tgpg[[threadgroups_per_grid]],      // tgpg[0] = D x IC x KH x KW, CHW = IC x KH x KW
-//        uint3 tpitg[[thread_position_in_threadgroup]],
-//        uint3   ntg[[threads_per_threadgroup]]) {  // [M, 1, 1]
-//    const int64_t KHW = (int64_t)args.KHW;
-//
-//    const int64_t d   = tgpig[0] / args.CHW;
-//    const int64_t chw = tgpig[0] % args.CHW;
-//    const int64_t tgpig_0 = chw / KHW;  // 0 ~ (IC - 1)
-//    const int64_t HW = tgpig[0] % KHW;
-//
-//    const int64_t tpitg_0 = (d * ntg[0]) + tpitg[0];
-//    if (tpitg_0 >= args.N) {
-//        return;
-//    }
-//
-//    const int64_t tpitg_1 = HW / args.KW;
-//    const int64_t tpitg_2 = HW % args.KW;
-//
-//    const int64_t iiw = tgpig[2] * args.s0 + tpitg_2 * args.d0 - args.p0;
-//    const int64_t iih = tgpig[1] * args.s1 + tpitg_1 * args.d1 - args.p1;
-//
-//    const int64_t offset_dst =
-//        (tpitg_0 * tgpg[1] * tgpg[2] + tgpig[1] * tgpg[2] + tgpig[2]) * args.CHW +
-//        (tgpig_0 * KHW + tpitg_1 * args.KW + tpitg_2);
-//
-//    device T * pdst = (device T *) (dst);
-//
-//    if (iih < 0 || iih >= args.IH || iiw < 0 || iiw >= args.IW) {
-//        pdst[offset_dst] = 0.0f;
-//    } else {
-//        const int64_t offset_src = tpitg_0 * args.ofs0 + tgpig_0 * args.ofs1;
-//        pdst[offset_dst] = x[offset_src + iih * args.IW + iiw];
-//    }
-//}
-//
-//template [[host_name("kernel_im2col_ext_f32")]] kernel im2col_ext_t kernel_im2col_ext<float>;
-//template [[host_name("kernel_im2col_ext_f16")]] kernel im2col_ext_t kernel_im2col_ext<half>;
+// TODO: optimize
+typedef void (im2col_ext_t)(
+        constant ggml_metal_kargs_im2col & args,
+        device const float * x,
+        device        char * dst,
+        uint3 tgpig[[threadgroup_position_in_grid]],
+        uint3  tgpg[[threadgroups_per_grid]],
+        uint3 tpitg[[thread_position_in_threadgroup]],
+        uint3   ntg[[threads_per_threadgroup]]);
+
+template <typename T>
+kernel void kernel_im2col_ext( // im2col variant that takes the (kh, kw) window position from the threadgroup index rather than the thread index
+        constant ggml_metal_kargs_im2col & args,
+        device const float * x,
+        device        char * dst,
+        uint3 tgpig[[threadgroup_position_in_grid]],
+        uint3  tgpg[[threadgroups_per_grid]],      // tgpg[0] = D x IC x KH x KW, CHW = IC x KH x KW
+        uint3 tpitg[[thread_position_in_threadgroup]],
+        uint3   ntg[[threads_per_threadgroup]]) {  // [M, 1, 1]
+    const int64_t KHW = (int64_t)args.KHW;
+
+    const int64_t d   = tgpig[0] / args.CHW; // which block of ntg[0] consecutive entries along N this threadgroup covers
+    const int64_t chw = tgpig[0] % args.CHW; // flattened (channel, kh, kw) position
+    const int64_t tgpig_0 = chw / KHW;  // 0 ~ (IC - 1)
+    const int64_t HW = tgpig[0] % KHW; // flattened (kh, kw); equals chw % KHW as long as CHW is a multiple of KHW
+
+    const int64_t tpitg_0 = (d * ntg[0]) + tpitg[0]; // index along the N dimension
+    if (tpitg_0 >= args.N) { // the last block along N may be only partially populated
+        return;
+    }
+
+    const int64_t tpitg_1 = HW / args.KW; // kernel row
+    const int64_t tpitg_2 = HW % args.KW; // kernel column
+
+    const int64_t iiw = tgpig[2] * args.s0 + tpitg_2 * args.d0 - args.p0; // source x coordinate for this output column / kernel column
+    const int64_t iih = tgpig[1] * args.s1 + tpitg_1 * args.d1 - args.p1; // source y coordinate for this output row / kernel row
+
+    const int64_t offset_dst =
+        (tpitg_0 * tgpg[1] * tgpg[2] + tgpig[1] * tgpg[2] + tgpig[2]) * args.CHW +
+        (tgpig_0 * KHW + tpitg_1 * args.KW + tpitg_2); // element (not byte) offset: dst is reinterpreted as T below
+
+    device T * pdst = (device T *) (dst);
+
+    if (iih < 0 || iih >= args.IH || iiw < 0 || iiw >= args.IW) { // coordinates outside [0, IH) x [0, IW): write zero
+        pdst[offset_dst] = 0.0f;
+    } else {
+        const int64_t offset_src = tpitg_0 * args.ofs0 + tgpig_0 * args.ofs1;
+        pdst[offset_dst] = x[offset_src + iih * args.IW + iiw];
+    }
+}
+
+template [[host_name("kernel_im2col_ext_f32")]] kernel im2col_ext_t kernel_im2col_ext<float>;
+template [[host_name("kernel_im2col_ext_f16")]] kernel im2col_ext_t kernel_im2col_ext<half>;
 
 template <typename TK>
 kernel void kernel_conv_2d(
diff --git a/tests/test-backend-ops.cpp b/tests/test-backend-ops.cpp
index 0176599459f..58c5fdd10db 100644
--- a/tests/test-backend-ops.cpp
+++ b/tests/test-backend-ops.cpp
@@ -7812,6 +7812,8 @@ static std::vector<std::unique_ptr<test_case>> make_test_cases_eval() {
     test_cases.emplace_back(new test_im2col(GGML_TYPE_F32, GGML_TYPE_F16, GGML_TYPE_F16, {12, 12, 2, 2560}, {3, 3, 2, 2560}, 1, 1, 1, 1, 1, 1, true));
     test_cases.emplace_back(new test_im2col(GGML_TYPE_F32, GGML_TYPE_F16, GGML_TYPE_F16, {5, 5, 1, 32}, {3, 4, 1, 32}, 1, 1, 0, 0, 1, 1, true));
     test_cases.emplace_back(new test_im2col(GGML_TYPE_F32, GGML_TYPE_F32, GGML_TYPE_F32, {2, 2, 1536, 729}, {2, 2, 1536, 4096}, 1, 1, 0, 0, 1, 1, true));
+    test_cases.emplace_back(new test_im2col(GGML_TYPE_F32, GGML_TYPE_F16, GGML_TYPE_F16, {128, 128, 1, 2}, {32, 33, 1, 2}, 1, 1, 1, 1, 1, 1, true));
+    test_cases.emplace_back(new test_im2col(GGML_TYPE_F32, GGML_TYPE_F16, GGML_TYPE_F16, {128, 128, 2, 1}, {33, 34, 2, 1}, 1, 1, 1, 1, 1, 1, true));
 
     // im2col 3D
     test_cases.emplace_back(new test_im2col_3d(GGML_TYPE_F32, GGML_TYPE_F32, GGML_TYPE_F32));

From 8b0e0db60645a634f5e686cb4cd3a8ec3f900554 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Johannes=20G=C3=A4=C3=9Fler?= <johannesg@5d6.de>
Date: Sat, 30 May 2026 15:48:00 +0200
Subject: [PATCH 50/50] TP: fix granularity for Qwen 3.5/3.6 + 3 GPUs (#23843)

* TP: fix granularity for Qwen 3.5/3.6 + 3 GPUs

* fix afmoe TP
---
 src/llama-model.cpp | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/src/llama-model.cpp b/src/llama-model.cpp
index a8323c8fb1e..914fc423b1f 100644
--- a/src/llama-model.cpp
+++ b/src/llama-model.cpp
@@ -410,16 +410,16 @@ struct ggml_backend_meta_split_state llama_meta_device_get_split_state(const str
     auto get_tensor_config = [&]() -> tensor_config {
         // standard attention
         if (std::regex_match(tensor_name, pattern_q_weight) || std::regex_match(tensor_name, pattern_kv_weight)) {
-            return get_tensor_config_impl(GGML_BACKEND_SPLIT_AXIS_1, "attn_output.weight");
+            return get_tensor_config_impl(GGML_BACKEND_SPLIT_AXIS_1, "attn_output.weight", "ssm_out.weight");
         }
         if (std::regex_match(tensor_name, pattern_q_bias) || std::regex_match(tensor_name, pattern_kv_bias)) {
-            return get_tensor_config_impl(GGML_BACKEND_SPLIT_AXIS_0, "attn_output.weight");
+            return get_tensor_config_impl(GGML_BACKEND_SPLIT_AXIS_0, "attn_output.weight", "ssm_out.weight");
         }
         if (std::regex_match(tensor_name, pattern_qkv_weight)) {
-            return get_tensor_config_impl(GGML_BACKEND_SPLIT_AXIS_1);
+            return get_tensor_config_impl(GGML_BACKEND_SPLIT_AXIS_1, "attn_output.weight", "ssm_out.weight");
         }
         if ( std::regex_match(tensor_name, pattern_qkv_bias)) {
-            return get_tensor_config_impl(GGML_BACKEND_SPLIT_AXIS_0);
+            return get_tensor_config_impl(GGML_BACKEND_SPLIT_AXIS_0, "attn_output.weight", "ssm_out.weight");
         }
         if (std::regex_match(tensor_name, pattern_qk_norm)) {
             return get_tensor_config_impl(tensor->ne[1] == 1 ? GGML_BACKEND_SPLIT_AXIS_MIRRORED : GGML_BACKEND_SPLIT_AXIS_1, "attn_output.weight");
@@ -435,7 +435,7 @@ struct ggml_backend_meta_split_state llama_meta_device_get_split_state(const str
         }
 
         if (std::regex_match(tensor_name, pattern_attn_gate_weight)) {
-            return get_tensor_config_impl(GGML_BACKEND_SPLIT_AXIS_1);
+            return get_tensor_config_impl(GGML_BACKEND_SPLIT_AXIS_1, "attn_output.weight", "ssm_out.weight");
         }
         if (std::regex_match(tensor_name, pattern_ssm_dt) || std::regex_match(tensor_name, pattern_ssm_a)) {
             return get_tensor_config_impl(GGML_BACKEND_SPLIT_AXIS_0, "ssm_out.weight");