diff --git a/cpp/common/build-info.cpp b/cpp/common/build-info.cpp
index adbb3486..d0859d50 100644
--- a/cpp/common/build-info.cpp
+++ b/cpp/common/build-info.cpp
@@ -3,8 +3,8 @@
 #include <cstdio>
 #include <string>
 
-int LLAMA_BUILD_NUMBER = 9254;
-char const * LLAMA_COMMIT = "e947228";
+int LLAMA_BUILD_NUMBER = 9297;
+char const * LLAMA_COMMIT = "b0df4c0";
 char const * LLAMA_COMPILER = "unknown";
 char const * LLAMA_BUILD_TARGET = "unknown";
 
diff --git a/cpp/common/chat.h b/cpp/common/chat.h
index 92e5544f..497d08c6 100644
--- a/cpp/common/chat.h
+++ b/cpp/common/chat.h
@@ -222,6 +222,7 @@ struct common_chat_parser_params {
     bool                    reasoning_in_content = false;
     std::string             generation_prompt;
     bool                    parse_tool_calls     = true;
+    bool                    is_continuation      = false;
     bool                    echo                 = false;  // Include assistant prefilled msg in output
     bool                    debug                = false;  // Enable debug output for PEG parser
     common_peg_arena        parser               = {};
diff --git a/cpp/common/common.h b/cpp/common/common.h
index 5ab439cd..7218c76c 100644
--- a/cpp/common/common.h
+++ b/cpp/common/common.h
@@ -306,6 +306,8 @@ struct common_params_speculative_draft {
     float p_split = 0.1f; // speculative decoding split probability
     float p_min   = 0.0f; // minimum speculative decoding probability (greedy)
 
+    bool backend_sampling = true; // offload draft sampling to the backend (default: on)
+
     common_params_model mparams;
 
     llama_context * ctx_tgt = nullptr;
diff --git a/cpp/common/speculative.cpp b/cpp/common/speculative.cpp
index de2a124a..e2bb4918 100644
--- a/cpp/common/speculative.cpp
+++ b/cpp/common/speculative.cpp
@@ -33,16 +33,15 @@ const std::map<std::string, common_speculative_type> common_speculative_type_fro
 };
 
 static std::string common_speculative_get_devices_str(const std::vector<lm_ggml_backend_dev_t> & devices) {
-    if (devices.empty()) {
-        return "default";
-    }
-
     std::string result;
     for (size_t i = 0; i < devices.size(); i++) {
-        if (i > 0) result += ", ";
+        if (devices[i] == nullptr) {
+            continue;
+        }
+        if (!result.empty()) result += ", ";
         result += lm_ggml_backend_dev_name(devices[i]);
     }
-    return result;
+    return result.empty() ? "default" : result;
 }
 
 struct common_speculative_config {
@@ -414,6 +413,9 @@ struct common_speculative_impl_draft_mtp : public common_speculative_impl {
 
     std::vector<common_sampler_ptr> smpls;
 
+    // backend sampler chain per seq, attached to ctx_dft
+    std::vector<llama_sampler *> backend_chains;
+
     int32_t n_embd = 0;
 
     // Per-sequence cross-batch carryover: pair (h_p, x_{p+1}) at MTP pos p+1.
@@ -445,7 +447,7 @@ struct common_speculative_impl_draft_mtp : public common_speculative_impl {
         n_embd = llama_model_n_embd(llama_get_model(ctx_dft));
 
         LOG_INF("%s: adding speculative implementation 'draft-mtp'\n", __func__);
-        LOG_INF("%s: - n_max=%d, n_min=%d, p_min=%.2f, n_embd=%d\n", __func__, this->params.n_max, this->params.n_min, this->params.p_min, n_embd);
+        LOG_INF("%s: - n_max=%d, n_min=%d, p_min=%.2f, n_embd=%d, backend_sampling=%d\n", __func__, this->params.n_max, this->params.n_min, this->params.p_min, n_embd, (int) this->params.backend_sampling);
         LOG_INF("%s: - gpu_layers=%d, cache_k=%s, cache_v=%s, ctx_tgt=%s, ctx_dft=%s, devices=[%s]\n", __func__,
                 this->params.n_gpu_layers,
                 lm_ggml_type_name(this->params.cache_type_k),
@@ -469,6 +471,22 @@ struct common_speculative_impl_draft_mtp : public common_speculative_impl {
             s.reset(common_sampler_init(llama_get_model(ctx_dft), sparams));
         }
 
+        // offload draft sampling to the backend
+        backend_chains.assign(n_seq, nullptr);
+        if (this->params.backend_sampling) {
+            for (llama_seq_id seq_id = 0; seq_id < (llama_seq_id) n_seq; ++seq_id) {
+                llama_sampler * chain = llama_sampler_chain_init(llama_sampler_chain_default_params());
+                llama_sampler_chain_add(chain, llama_sampler_init_top_k(10));
+
+                if (!llama_set_sampler(ctx_dft, seq_id, chain)) {
+                    LOG_WRN("%s: backend offload failed for seq_id=%d; using CPU sampler\n", __func__, (int) seq_id);
+                    llama_sampler_free(chain);
+                    chain = nullptr;
+                }
+                backend_chains[seq_id] = chain;
+            }
+        }
+
         llama_set_embeddings_pre_norm(ctx_tgt, true, /*masked*/ false);
         llama_set_embeddings_pre_norm(ctx_dft, true, /*masked*/ true);
 
@@ -484,6 +502,18 @@ struct common_speculative_impl_draft_mtp : public common_speculative_impl {
     }
 
     ~common_speculative_impl_draft_mtp() override {
+        auto * ctx_dft = this->params.ctx_dft;
+        for (llama_seq_id seq_id = 0; seq_id < (llama_seq_id) backend_chains.size(); ++seq_id) {
+            if (backend_chains[seq_id] == nullptr) {
+                continue;
+            }
+            if (ctx_dft) {
+                llama_set_sampler(ctx_dft, seq_id, nullptr);
+            }
+            llama_sampler_free(backend_chains[seq_id]);
+        }
+        backend_chains.clear();
+
         if (batch.token != nullptr) {
             free(batch.token);
             batch.token = nullptr;
diff --git a/cpp/ggml-backend.cpp b/cpp/ggml-backend.cpp
index 432b1422..f008d798 100644
--- a/cpp/ggml-backend.cpp
+++ b/cpp/ggml-backend.cpp
@@ -306,7 +306,7 @@ void lm_ggml_backend_tensor_get_2d_async(lm_ggml_backend_t backend, const struct
     LM_GGML_ASSERT(tensor);
     LM_GGML_ASSERT(tensor->data != NULL && "tensor not allocated");
 
-    if (n_copies <= 1 || backend->iface.set_tensor_2d_async == NULL) {
+    if (n_copies <= 1 || backend->iface.get_tensor_2d_async == NULL) {
         for (size_t i = 0; i < n_copies; i++) {
             lm_ggml_backend_tensor_get_async(backend, tensor, (char *) data + i*stride_data, offset + i*stride_tensor, size);
         }
@@ -317,7 +317,7 @@ void lm_ggml_backend_tensor_get_2d_async(lm_ggml_backend_t backend, const struct
     }
 
     LM_GGML_ASSERT(tensor->data != NULL && "tensor not allocated");
-    LM_GGML_ASSERT(offset + (n_copies-1)*stride_tensor + size <= lm_ggml_nbytes(tensor) && "tensor write out of bounds");
+    LM_GGML_ASSERT(offset + (n_copies-1)*stride_tensor + size <= lm_ggml_nbytes(tensor) && "tensor read out of bounds");
     backend->iface.get_tensor_2d_async(backend, tensor, data, offset, size, n_copies, stride_tensor, stride_data);
 }
 
@@ -379,7 +379,7 @@ void lm_ggml_backend_tensor_get_2d(const struct lm_ggml_tensor * tensor, void *
     lm_ggml_backend_buffer_t buf = tensor->view_src ? tensor->view_src->buffer : tensor->buffer;
     LM_GGML_ASSERT(buf != NULL && "tensor buffer not set");
 
-    if (n_copies <= 1 || buf->iface.set_tensor_2d == NULL) {
+    if (n_copies <= 1 || buf->iface.get_tensor_2d == NULL) {
         for (size_t i = 0; i < n_copies; i++) {
             lm_ggml_backend_tensor_get(tensor, (char *) data + i*stride_data, offset + i*stride_tensor, size);
         }
diff --git a/cpp/ggml-hexagon/ggml-hexagon.cpp b/cpp/ggml-hexagon/ggml-hexagon.cpp
index 1182363d..e788e92e 100644
--- a/cpp/ggml-hexagon/ggml-hexagon.cpp
+++ b/cpp/ggml-hexagon/ggml-hexagon.cpp
@@ -2735,9 +2735,10 @@ static bool lm_ggml_hexagon_supported_ssm_conv(const struct lm_ggml_hexagon_sess
     if (dst->ne[0] != d_inner || dst->ne[1] != n_t || dst->ne[2] != n_s) {
         return false;
     }
-
-    // TODO: add support for non-contiguous tensors
-    if (!lm_ggml_is_contiguous(src0) || !lm_ggml_is_contiguous(src1) || !lm_ggml_is_contiguous(dst)) {
+    if (src0->nb[0] != sizeof(float) || src1->nb[0] != sizeof(float) || dst->nb[0] != sizeof(float)) {
+        return false;
+    }
+    if (src0->nb[1] != src0->ne[0] * sizeof(float) || src1->nb[1] != src1->ne[0] * sizeof(float)) {
         return false;
     }
 
diff --git a/cpp/ggml-hexagon/htp/hmx-matmul-ops.c b/cpp/ggml-hexagon/htp/hmx-matmul-ops.c
index ca3be4e6..fcb35279 100644
--- a/cpp/ggml-hexagon/htp/hmx-matmul-ops.c
+++ b/cpp/ggml-hexagon/htp/hmx-matmul-ops.c
@@ -201,11 +201,10 @@ static inline HVX_Vector dequantize_x4x2_q4_0_group_hvx(const uint8_t *packed_32
 
 // Batch-dequantize 4 contiguous x4x2 Q4_0 groups (4x32 = 128 packed bytes) using
 // full HVX vector width.  One vmemu + one vlut16 replaces 4 separate calls.
-// Output: out[0..3] each hold 32 FP16 values in the first 64 bytes.
-static inline void dequantize_x4x2_q4_0_x4groups_hvx(
+// Output: vector_x2 each hold 32 FP16 values in the first 64 bytes.
+static inline HVX_Vector_x2 dequantize_x4x2_q4_0_x4groups_hvx(
             const uint8_t *packed_128, bool upper_nibbles,
-            const __fp16 *scales_4, const HVX_Vector vlut_cvt,
-            HVX_Vector out[4]) {
+            const __fp16 *scales_4, const HVX_Vector vlut_cvt) {
     // Load all 128 packed bytes (4 contiguous 32-byte groups)
     HVX_Vector vq = hvx_vmemu(packed_128);
     const HVX_Vector mask_h4 = Q6_Vb_vsplat_R(0x0F);
@@ -221,8 +220,7 @@ static inline void dequantize_x4x2_q4_0_x4groups_hvx(
     HVX_Vector v_hi = Q6_V_hi_W(vp);  // [group2: 32 fp16 | group3: 32 fp16]
 
     // Build per-group scale vectors: first 64 bytes use scale_a, last 64 use scale_b
-    volatile HVX_Vector vscale = hvx_vmemu(scales_4);
-
+    HVX_Vector vscale = hvx_vmemu(scales_4);
     HVX_Vector v_sc01 = hvx_vec_repl_2x_f16(vscale);
     HVX_Vector v_sc23 = hvx_vec_repl_2x_f16(Q6_V_vror_VR(vscale, 4));
 
@@ -230,8 +228,9 @@ static inline void dequantize_x4x2_q4_0_x4groups_hvx(
     v_hi = Q6_Vhf_equals_Vqf16(Q6_Vqf16_vmpy_VhfVhf(v_hi, v_sc23));
 
     // Extract individual groups: scatter uses q_mask64 so only first 64 bytes matter
-    out[0] = v_lo; // group0 already in [0:63]
-    out[1] = v_hi; // group2 already in [0:63]
+    HVX_Vector_x2 r = { v_lo,/* group1 already in [0:63] */
+                        v_hi /* group2 already in [0:63] */ };
+    return r;
 }
 
 // Dequantize one x4x2 Q8_0 group (32 int8 quants) -> 32 FP16 in first 64 bytes.
@@ -292,12 +291,11 @@ static inline HVX_Vector dequantize_x4x2_mxfp4_group_hvx(const uint8_t *  packed
 }
 
 // Batch-dequantize 4 contiguous x4x2 MXFP4 groups (4x32 = 128 packed bytes).
-static inline void dequantize_x4x2_mxfp4_x4groups_hvx(const uint8_t *  packed_128,
+static inline HVX_Vector_x4 dequantize_x4x2_mxfp4_x4groups_hvx(const uint8_t *  packed_128,
                                                       bool             upper_nibbles,
                                                       int              sub_blk_base,
                                                       const HVX_Vector vlut_cvt,
-                                                      mxfp4_scales_t   scales,
-                                                      HVX_Vector       out[4]) {
+                                                      mxfp4_scales_t   scales) {
     HVX_Vector       vq       = hvx_vmemu(packed_128);
     const HVX_Vector mask_h4  = Q6_Vb_vsplat_R(0x0F);
     HVX_Vector       v_quants = upper_nibbles ? Q6_Vub_vlsr_VubR(vq, 4) : vq;
@@ -318,10 +316,8 @@ static inline void dequantize_x4x2_mxfp4_x4groups_hvx(const uint8_t *  packed_12
     v_lo = Q6_Vhf_equals_Vqf16(Q6_Vqf16_vmpy_VhfVhf(v_lo, v_sc01));
     v_hi = Q6_Vhf_equals_Vqf16(Q6_Vqf16_vmpy_VhfVhf(v_hi, v_sc23));
 
-    out[0] = v_lo;
-    out[1] = Q6_V_vror_VR(v_lo, 64);
-    out[2] = v_hi;
-    out[3] = Q6_V_vror_VR(v_hi, 64);
+    HVX_Vector_x4 r = { v_lo, Q6_V_vror_VR(v_lo, 64), v_hi, Q6_V_vror_VR(v_hi, 64) };
+    return r;
 }
 
 // Dequantize a tile range from x4x2 weight data (already in VTCM) to tile-major FP16.
@@ -372,18 +368,18 @@ static void dequantize_x4x2_weight_to_fp16_tiles_task(
             unsigned row1 = ct * HMX_FP16_TILE_N_COLS + 1;
 
             for (int r = 0; r < HMX_FP16_TILE_N_ROWS; r += 2, row1 += 2) {
-                HVX_Vector v0[2];
                 const uint8_t *r0 = vtcm_src + row_offset; row_offset += row_stride;
-                dequantize_x4x2_q4_0_x4groups_hvx(r0 + packed_off, upper, (const __fp16 *)(r0 + scale_off), vlut_cvt, v0);
-                Q6_vscatter_RMVwV((size_t)tile_bases[0], 2 * HMX_FP16_TILE_SIZE - 1, v_off, v0[0]);
-                Q6_vscatter_RMVwV((size_t)tile_bases[2], 2 * HMX_FP16_TILE_SIZE - 1, v_off, v0[1]);
-                v_off = Q6_Vw_vadd_VwVw(v_off, v_scat_step);
+                const uint8_t *r1 = vtcm_src + row_offset; row_offset += row_stride;
 
+                HVX_Vector_x2 dv0 = dequantize_x4x2_q4_0_x4groups_hvx(r0 + packed_off, upper, (const __fp16 *)(r0 + scale_off), vlut_cvt);
+                HVX_Vector_x2 dv1 = dequantize_x4x2_q4_0_x4groups_hvx(r1 + packed_off, upper, (const __fp16 *)(r1 + scale_off), vlut_cvt);
+
+                Q6_vscatter_RMVwV((size_t)tile_bases[0], 2 * HMX_FP16_TILE_SIZE - 1, v_off, dv0.v[0]);
+                Q6_vscatter_RMVwV((size_t)tile_bases[2], 2 * HMX_FP16_TILE_SIZE - 1, v_off, dv0.v[1]);
+                v_off = Q6_Vw_vadd_VwVw(v_off, v_scat_step);
 
-                r0 = vtcm_src + row_offset; row_offset += row_stride;
-                dequantize_x4x2_q4_0_x4groups_hvx(r0 + packed_off, upper, (const __fp16 *)(r0 + scale_off), vlut_cvt, v0);
-                Q6_vscatter_RMVwV((size_t)tile_bases[0], 2 * HMX_FP16_TILE_SIZE - 1, v_off, v0[0]);
-                Q6_vscatter_RMVwV((size_t)tile_bases[2], 2 * HMX_FP16_TILE_SIZE - 1, v_off, v0[1]);
+                Q6_vscatter_RMVwV((size_t)tile_bases[0], 2 * HMX_FP16_TILE_SIZE - 1, v_off, dv1.v[0]);
+                Q6_vscatter_RMVwV((size_t)tile_bases[2], 2 * HMX_FP16_TILE_SIZE - 1, v_off, dv1.v[1]);
                 v_off = Q6_Vw_vadd_VwVw(v_off, v_scat_step);
             }
 
@@ -415,21 +411,21 @@ static void dequantize_x4x2_weight_to_fp16_tiles_task(
                 // Batch-convert all 8 E8M0 scales once per row (stays in HVX register)
                 mxfp4_scales_t r0_e8 = mxfp4_convert_scales(r0 + e8m0_blk_off);
 
-                HVX_Vector v0[4], v1[4];
-                dequantize_x4x2_mxfp4_x4groups_hvx(r0 + packed_off, upper, sub_blk_base, vlut_cvt, r0_e8, v0);
+                HVX_Vector_x4 dv0, dv1;
+                dv0 = dequantize_x4x2_mxfp4_x4groups_hvx(r0 + packed_off, upper, sub_blk_base, vlut_cvt, r0_e8);
                 if (row1 < n_cols) {
                     mxfp4_scales_t r1_e8 = mxfp4_convert_scales(r1 + e8m0_blk_off);
-                    dequantize_x4x2_mxfp4_x4groups_hvx(r1 + packed_off, upper, sub_blk_base, vlut_cvt, r1_e8, v1);
+                    dv1 = dequantize_x4x2_mxfp4_x4groups_hvx(r1 + packed_off, upper, sub_blk_base, vlut_cvt, r1_e8);
                 } else {
-                    v1[0] = v1[1] = v1[2] = v1[3] = Q6_V_vzero();
+                    dv1.v[0] = dv1.v[1] = dv1.v[2] = dv1.v[3] = Q6_V_vzero();
                 }
 
                 for (int g = 0; g < 4; g++) {
-                    Q6_vscatter_QRMVwV(q_mask64, (size_t) tile_bases[g], HMX_FP16_TILE_SIZE - 1, v_off, v0[g]);
+                    Q6_vscatter_QRMVwV(q_mask64, (size_t) tile_bases[g], HMX_FP16_TILE_SIZE - 1, v_off, dv0.v[g]);
                 }
                 v_off = Q6_Vw_vadd_VwVw(v_off, v_scat_step);
                 for (int g = 0; g < 4; g++) {
-                    Q6_vscatter_QRMVwV(q_mask64, (size_t) tile_bases[g], HMX_FP16_TILE_SIZE - 1, v_off, v1[g]);
+                    Q6_vscatter_QRMVwV(q_mask64, (size_t) tile_bases[g], HMX_FP16_TILE_SIZE - 1, v_off, dv1.v[g]);
                 }
                 v_off = Q6_Vw_vadd_VwVw(v_off, v_scat_step);
             }
@@ -612,11 +608,13 @@ static void core_dot_chunk_fp16(__fp16 *restrict output, const __fp16 *restrict
             const __fp16 *row_tiles = activation + r * n_dot_tiles * HMX_FP16_TILE_N_ELMS;
             const __fp16 *col_tiles = weight + c * n_dot_tiles * HMX_FP16_TILE_N_ELMS;
 
-            for (int k = 0; k < n_dot_tiles; ++k) {
-                Q6_activation_hf_mxmem_RR((unsigned int)row_tiles, 2047);
-                Q6_weight_hf_mxmem_RR((unsigned int)col_tiles, 2047);
-                row_tiles += HMX_FP16_TILE_N_ELMS;
-                col_tiles += HMX_FP16_TILE_N_ELMS;
+            for (int k = 0, k_block; k < n_dot_tiles; k += k_block) {
+                k_block = hex_smin(n_dot_tiles - k, 32);
+                const uint32_t range = 2048u * (uint32_t)k_block - 1;
+                Q6_activation_hf_mxmem_RR_deep((unsigned int)row_tiles, range);
+                Q6_weight_hf_mxmem_RR((unsigned int)col_tiles, range);
+                row_tiles += k_block * HMX_FP16_TILE_N_ELMS;
+                col_tiles += k_block * HMX_FP16_TILE_N_ELMS;
             }
 
             __fp16 *out_tile = output + (r * n_col_tiles + c) * HMX_FP16_TILE_N_ELMS;
@@ -832,10 +830,6 @@ static void transfer_activation_chunk_threaded(struct htp_context *ctx, __fp16 *
     worker_pool_run_func(ctx->worker_pool, transfer_activation_chunk_worker_fn, &state, ctx->n_threads);
 }
 
-//
-
-#define FALLBACK_TO_STANDARD 1
-
 // C += AB
 static void core_mma_chunk_fp16(__fp16 *restrict c, const __fp16 *restrict a, const __fp16 *restrict b,
                                 const __fp16 *restrict col_scales, const __fp16 *restrict eye_tile,
@@ -861,314 +855,80 @@ static void core_mma_chunk_fp16(__fp16 *restrict c, const __fp16 *restrict a, co
                 Q6_weight_hf_mxmem_RR((unsigned int)eye_tile, 2047);
             }
 
-            for (int k = 0; k < n_dot_tiles; ++k) {
-                Q6_activation_hf_mxmem_RR((unsigned int)row_tiles, 2047);
-                Q6_weight_hf_mxmem_RR((unsigned int)col_tiles, 2047);
-                row_tiles += HMX_FP16_TILE_N_ELMS;
-                col_tiles += HMX_FP16_TILE_N_ELMS;
-            }
-            Q6_mxmem_AR_after_hf(accum_tile, 0);
-        }
-    }
-}
-
-static __attribute__((noinline)) int mat_mul_qk_0_d16a32_out_stationary(struct htp_context *ctx,
-                                       float *restrict out, const float *restrict x, const uint8_t *restrict w,
-                                       int m, int k, int n, int weight_type) {
-    // assume k % 32 == 0 && n % 32 == 0
-    const size_t row_stride = get_x4x2_row_stride(weight_type, k);
-    if (row_stride == 0) {
-        return -1;
-    }
-
-    const size_t vtcm_budget = ctx->vtcm_size;
-
-    const size_t K_BLOCK_SIZE = 1024;
-
-    // Fallback: if k doesn't need K-blocking, out-stationary has no advantage
-    const size_t k_iters_check = (k + K_BLOCK_SIZE - 1) / K_BLOCK_SIZE;
-    if (k_iters_check <= 1) {
-        FARF(HIGH, "%s: K_BLK=%zu >= k=%d, fallback to standard path", __func__, K_BLOCK_SIZE, k);
-        return FALLBACK_TO_STANDARD;
-    }
-
-    // Dynamic M,N search via hmx_compute_chunks
-    const size_t sub_row_stride_alloc = get_x4x2_row_stride(weight_type, K_BLOCK_SIZE);
-    const size_t per_m  = K_BLOCK_SIZE * sizeof(float)   // scratch1: M×K×4 (act DMA staging F32)
-                        + K_BLOCK_SIZE * sizeof(__fp16); // activation: M×K×2 (F16 tiles)
-    const size_t per_n  = sub_row_stride_alloc           // scratch0: N×sub_row(K) (packed quant)
-                        + K_BLOCK_SIZE * sizeof(__fp16); // weight: N×K×2 (F16 tiles)
-    const size_t per_mn = sizeof(__fp16);                // output: M×N×2 (out-stationary)
-
-    // Alignment margin: hex_align_up can add up to 2047 bytes per buffer;
-    // scratch1 (mc×6144) is naturally 2048-aligned, remaining 4 buffers need margin
-    const size_t align_margin = 4 * HMX_FP16_TILE_SIZE;
-    const size_t overhead     = HMX_FP16_TILE_SIZE + 256 + align_margin;  // eye_tile + scales + alignment
-
-    size_t       M_BLOCK_SIZE, N_BLOCK_SIZE, vtcm_used;
-    // Cost-based search: minimize ceil(m/mc)*m_block_cost + ceil(n/nc)*n_block_cost.
-    // From profiling: wt_dequant per element ≈ 1.5× activation load per element.
-    // m_block_cost = n*3: each extra M-block re-dequants all N×K weight (expensive).
-    // n_block_cost = m*2: each extra N-block re-loads all M×K activation (cheaper).
-    const size_t m_block_cost = (size_t) n * 3;
-    const size_t n_block_cost = (size_t) m * 2;
-    if (hmx_compute_chunks(vtcm_budget, overhead, per_n, per_m, per_mn,
-                           hex_align_up(m, HMX_FP16_TILE_N_ROWS), n,
-                           m_block_cost, n_block_cost, &M_BLOCK_SIZE,
-                           &N_BLOCK_SIZE, &vtcm_used) != 0) {
-        FARF(HIGH, "%s: VTCM too small (m=%d k=%d n=%d budget=%zu)", __func__, m, k, n, vtcm_budget);
-        return -1;
-    }
-
-    // Compute precise buffer sizes from searched M,N and fixed K
-    const size_t weight_size  = hex_align_up(N_BLOCK_SIZE * K_BLOCK_SIZE * sizeof(__fp16), HMX_FP16_TILE_SIZE);
-    const size_t act_size     = hex_align_up(M_BLOCK_SIZE * K_BLOCK_SIZE * sizeof(__fp16), HMX_FP16_TILE_SIZE);
-    const size_t out_size     = hex_align_up(M_BLOCK_SIZE * N_BLOCK_SIZE * sizeof(__fp16), HMX_FP16_TILE_SIZE);
-    const size_t scratch0_sz  = hex_align_up(N_BLOCK_SIZE * sub_row_stride_alloc, HMX_FP16_TILE_SIZE);
-    const size_t scratch1_sz  = hex_align_up(M_BLOCK_SIZE * K_BLOCK_SIZE * sizeof(float), HMX_FP16_TILE_SIZE);
-
-    const size_t total_vtcm = weight_size + act_size + out_size + scratch0_sz + scratch1_sz + HMX_FP16_TILE_SIZE + 256;
-    if (total_vtcm > vtcm_budget) {
-        FARF(HIGH, "%s: VTCM overflow after search: need %zu have %zu (M=%zu N=%zu K=%zu)", __func__, total_vtcm,
-                    vtcm_budget, M_BLOCK_SIZE, N_BLOCK_SIZE, K_BLOCK_SIZE);
-        return -1;
-    }
-
-    uint8_t *vtcm_ptr        = (uint8_t *) ctx->vtcm_base;
-    __fp16  *vtcm_weight     = (__fp16 *) vtcm_seq_alloc(&vtcm_ptr, weight_size);
-    __fp16  *vtcm_activation = (__fp16 *) vtcm_seq_alloc(&vtcm_ptr, act_size);
-    __fp16  *vtcm_output     = (__fp16 *) vtcm_seq_alloc(&vtcm_ptr, out_size);
-    uint8_t *vtcm_scratch0   = vtcm_seq_alloc(&vtcm_ptr, scratch0_sz);
-    uint8_t *vtcm_scratch1   = vtcm_seq_alloc(&vtcm_ptr, scratch1_sz);
-    __fp16  *vtcm_eye_tile   = (__fp16 *) vtcm_seq_alloc(&vtcm_ptr, HMX_FP16_TILE_SIZE);
-    __fp16  *vtcm_scales     = (__fp16 *) vtcm_seq_alloc(&vtcm_ptr, 256);
-    assert((size_t)(vtcm_ptr - (uint8_t *)ctx->vtcm_base) <= vtcm_budget);
-
-    FARF(HIGH, "hmx-mm: m=%d k=%d n=%d wtype=%d block M=%zu N=%zu K=%zu vtcm=%zu/%zu", m, k, n, weight_type,
-         M_BLOCK_SIZE, N_BLOCK_SIZE, K_BLOCK_SIZE, (size_t) (vtcm_ptr - (uint8_t *) ctx->vtcm_base), vtcm_budget);
-
-    // initialize eye tile (32x32 identity matrix)
-    {
-        HVX_Vector v;
-        v = Q6_V_vzero();
-        v = Q6_Vw_vinsert_VwR(v, 0x3c000000);
-        v = Q6_V_vror_VR(v, VLEN - 4);
-        v = Q6_Vw_vinsert_VwR(v, 0x00003c00);
-        for (int i = 0; i < 16; ++i) {
-            ((HVX_Vector *) vtcm_eye_tile)[i] = v;
-            v = Q6_V_vror_VR(v, VLEN - 8);
-        }
-    }
-    hmx_init_column_scales(vtcm_scales, Q6_V_vsplat_R(0x3c00));  // scale: 1.0, bias: 0.0 in FP16
-
-    TIMER_DEFINE(fetch);
-    TIMER_DEFINE(act_load);
-    TIMER_DEFINE(wt_dequant);
-    TIMER_DEFINE(core);
-
-    HAP_compute_res_hmx_lock(ctx->vtcm_rctx);
-
-    for (size_t mr = 0; mr < m; mr += M_BLOCK_SIZE) {
-        size_t m_blk_sz = hex_smin(m - mr, M_BLOCK_SIZE);
-        for (size_t nc = 0; nc < n; nc += N_BLOCK_SIZE) {
-            size_t n_blk_sz = hex_smin(n - nc, N_BLOCK_SIZE);
-
-            const int n_row_tiles = hmx_ceil_div(m_blk_sz, HMX_FP16_TILE_N_ROWS);
-            const int n_col_tiles = hmx_ceil_div(n_blk_sz, HMX_FP16_TILE_N_COLS);
-
-            for (size_t kk = 0; kk < k; kk += K_BLOCK_SIZE) {
-                const size_t k_blk_sz = hex_smin(k - kk, K_BLOCK_SIZE);
-
-                TIMER_START(fetch);
-                // fetch activation block into VTCM
-                {
-                    const float *activation_block = x + mr * k + kk;
-
-                    dma_queue_push(ctx->dma[0],
-                                     dma_make_ptr(vtcm_scratch1, activation_block),
-                                     k_blk_sz * sizeof(float),
-                                     k * sizeof(float),
-                                     k_blk_sz * sizeof(float),
-                                     m_blk_sz);
-                }
-
-                // fetch weight block into VTCM (x4x2 sub-block: quants + scales)
-                const size_t sub_row_stride = get_x4x2_row_stride(weight_type, k_blk_sz);
-                {
-                    const int blk_start       = kk / QK_Q4_0x4x2;
-                    const int nb_sub          = (k_blk_sz + QK_Q4_0x4x2 - 1) / QK_Q4_0x4x2;
-                    const int  full_qrow      = (weight_type == HTP_TYPE_Q8_0) ? k : (k / 2);
-                    const int  scale_blk_size = (weight_type == HTP_TYPE_MXFP4) ? HMX_X4X2_MXFP4_EBLK_SIZE : HMX_X4X2_DBLK_SIZE;
-                    uint8_t       *dst        = vtcm_scratch0;
-                    const uint8_t *src        = w + nc * row_stride;
-                    const size_t  n_rows      = n_blk_sz;
-                    const size_t  src_stride  = row_stride;
-                    const size_t  dst_stride  = sub_row_stride;
-                    const size_t  quant_off   = (weight_type == HTP_TYPE_Q8_0) ? (blk_start * QK_Q8_0x4x2) : (blk_start * (QK_Q4_0x4x2 / 2));
-                    const size_t  quant_width = (weight_type == HTP_TYPE_Q8_0) ? (nb_sub    * QK_Q8_0x4x2) : (nb_sub    * (QK_Q4_0x4x2 / 2));
-                    const size_t  scale_off   = full_qrow + blk_start * scale_blk_size;
-                    const size_t  scale_width = nb_sub * scale_blk_size;
-
-                    // 2D DMA: quants sub-range
-                    dma_queue_push(ctx->dma[0], dma_make_ptr(dst, src + quant_off), dst_stride, src_stride, quant_width, n_rows);
-                    // 2D DMA: scales sub-range
-                    dma_queue_push(ctx->dma[0], dma_make_ptr(dst + quant_width, src + scale_off), dst_stride, src_stride, scale_width, n_rows);
-                }
-                TIMER_STOP(fetch);
-
-                TIMER_START(act_load);
-                // load activation block
-                {
-                    dma_queue_pop(ctx->dma[0]); // wait for act DNA
-                    transfer_activation_chunk_threaded(ctx, vtcm_activation, (float *) vtcm_scratch1, m_blk_sz, k_blk_sz, k_blk_sz);
-                }
-                TIMER_STOP(act_load);
-
-                TIMER_START(wt_dequant);
-                // dequantize weight block
-                {
-                    dma_queue_pop(ctx->dma[0]);
-                    dma_queue_pop(ctx->dma[0]);
-                    // vtcm_scratch0 is used to store the qweight chunk
-                    // worker_pool_run_func already returned, so fetch is done
-                    dequantize_x4x2_weight_chunk_to_fp16_tiles(ctx, vtcm_weight, vtcm_scratch0,
-                                                                n_blk_sz, k_blk_sz, sub_row_stride, weight_type);
-                }
-                TIMER_STOP(wt_dequant);
-
-                // core mma
-                TIMER_START(core);
-                {
-                    core_mma_chunk_fp16(vtcm_output, vtcm_activation, vtcm_weight, vtcm_scales, vtcm_eye_tile, n_row_tiles,
-                                        n_col_tiles, k_blk_sz / HMX_FP16_TILE_N_COLS, kk == 0);
-                }
-                TIMER_STOP(core);
+            for (int k = 0, k_block; k < n_dot_tiles; k += k_block) {
+                k_block = hex_smin(n_dot_tiles - k, 32);
+                const uint32_t range = 2048u * (uint32_t)k_block - 1;
+                Q6_activation_hf_mxmem_RR_deep((unsigned int)row_tiles, range);
+                Q6_weight_hf_mxmem_RR((unsigned int)col_tiles, range);
+                row_tiles += k_block * HMX_FP16_TILE_N_ELMS;
+                col_tiles += k_block * HMX_FP16_TILE_N_ELMS;
             }
 
-            // store output block
-            {
-                float *output_block = out + (mr * n + nc);
-                transfer_output_chunk_threaded(ctx, output_block, vtcm_output, m_blk_sz, n_blk_sz, n);
-            }
+            Q6_mxmem_AR_after_hf(accum_tile, 0);
         }
     }
-
-    HAP_compute_res_hmx_unlock(ctx->vtcm_rctx);
-
-#if defined(ENABLE_PROFILE_TIMERS)
-    FARF(HIGH, "fetch: %lld us, act_load: %lld us, wt_dequant: %lld us, core: %lld us",
-         TIMER_US(fetch), TIMER_US(act_load), TIMER_US(wt_dequant), TIMER_US(core));
-#endif
-    return 0;
 }
 
-int hmx_mat_mul_permuted_qk_0_d16a32(struct htp_context *ctx, float *restrict dst, const float *restrict activation,
+int hmx_matmul_q_f32(struct htp_context *ctx, float *restrict dst, const float *restrict activation,
                                      const uint8_t *restrict permuted_weight, int m, int k, int n,
                                      int weight_type) {
-    if (!dst || !activation || !permuted_weight || !m || !n || !k) { return -1; }
     if (k % 32 != 0 || n % 32 != 0) { return -1; }
 
     if (!hex_is_aligned(dst, VLEN) || !hex_is_aligned(activation, VLEN) || !hex_is_aligned(permuted_weight, VLEN)) {
         return -1;
     }
 
-    // for large m, k (e.g. prefill FFN Down), use out-stationary version
-    if (m >= 128 && k > n && n > 1024) {
-        int rc = mat_mul_qk_0_d16a32_out_stationary(ctx, dst, activation, permuted_weight, m, k, n, weight_type);
-        if (rc != FALLBACK_TO_STANDARD) {
-            return rc;  // 0 success, -1 error
-        }
-        FARF(HIGH, "hmx_matmul_qk: out-stationary fallback to standard m=%d k=%d n=%d", m, k, n);
-        // fall through to standard path
-    }
-
     size_t row_stride = get_x4x2_row_stride(weight_type, k);
     if (row_stride == 0) {
         return -1;
     }
 
-    FARF(HIGH, "hmx_matmul_qk: STANDARD path m=%d k=%d n=%d type=%d", m, k, n, weight_type);
-
     // --- Dynamic VTCM layout ---
-    const size_t vtcm_budget   = ctx->vtcm_size;
-    const size_t vec_dot_size  = k * sizeof(__fp16);
+    const size_t vec_dot_size = k * sizeof(__fp16);
+    const size_t vtcm_budget  = ctx->vtcm_size;
+    size_t vtcm_used = 0;
 
     // Pipeline = 4-stage DMA→dequant→HMX→store with HMX worker overlap.
-    // Only pays off when the chunker yields >=2 n-chunks, so the main loop can
-    // overlap HMX (C) with HVX (B/D); with a single n-chunk the extra VTCM for
-    // double-buffered output and the worker-dispatch overhead are pure loss.
-    // Try pipeline costs first; fall back to sequential if the layout collapses
-    // to one n-chunk. m >= 128 floor keeps HMX utilization reasonable.
-    const size_t pipe_per_n  = row_stride + 2 * vec_dot_size;  // Q + S0 + S1 (dequant bufs)
-    const size_t pipe_per_mn = 2 * sizeof(__fp16);             // O x 2 (output double buffer)
-    const size_t seq_per_n   = vec_dot_size + 2 * row_stride;  // W + S0 + S1 (x4x2 DMA bufs)
-    const size_t seq_per_mn  = sizeof(__fp16);                 // O x 1
-
-    size_t m_chunk_n_rows = 0, n_chunk_n_cols = 0, vtcm_used = 0;
-    bool   use_pipeline = false;
-
-    if (m >= 128) {
-        size_t mc = 0, nc = 0, used = 0;
-        if (hmx_compute_chunks(vtcm_budget, /*overhead=*/256, pipe_per_n, /*per_m=*/vec_dot_size, pipe_per_mn,
-                               hex_align_up(m, HMX_FP16_TILE_N_ROWS), n,
-                               /*m_block_cost=*/(size_t) n * 3,
-                               /*n_block_cost=*/(size_t) m * 2, &mc, &nc, &used) == 0 &&
-            hmx_ceil_div((size_t) n, nc) >= 2) {
-            m_chunk_n_rows = mc;
-            n_chunk_n_cols = nc;
-            vtcm_used      = used;
-            use_pipeline   = true;
-        }
-    }
+    const size_t size_per_n  = row_stride + 2 * vec_dot_size;  // Q + S0 + S1 (dequant bufs)
+    const size_t size_per_mn = 2 * sizeof(__fp16);             // O x 2 (output double buffer)
 
-    if (!use_pipeline) {
-        if (hmx_compute_chunks(vtcm_budget, /*overhead=*/256, seq_per_n, /*per_m=*/vec_dot_size, seq_per_mn,
-                               hex_align_up(m, HMX_FP16_TILE_N_ROWS), n,
-                               /*m_block_cost=*/(size_t) n * 3,
-                               /*n_block_cost=*/(size_t) m * 2, &m_chunk_n_rows, &n_chunk_n_cols, &vtcm_used) != 0) {
-            FARF(HIGH, "%s: VTCM too small (m=%d k=%d n=%d budget=%zu)", __func__, m, k, n, vtcm_budget);
-            return -1;
-        }
+    size_t m_chunk_n_rows = 0, n_chunk_n_cols = 0;
+    if (hmx_compute_chunks(vtcm_budget, /*overhead=*/256, size_per_n, /*per_m=*/vec_dot_size, size_per_mn,
+                           hex_align_up(m, HMX_FP16_TILE_N_ROWS), n,
+                           /*m_block_cost=*/(size_t) n * 3,
+                           /*n_block_cost=*/(size_t) m * 2, &m_chunk_n_rows, &n_chunk_n_cols, &vtcm_used)) {
+        FARF(HIGH, "hmx-mm-q: VTCM too small : m %d k %d n %d budget %zu", m, k, n, vtcm_budget);
+        return -1;
     }
 
-    // Compute precise buffer sizes per execution path
-    const size_t weight_area_size = hex_align_up(
-        n_chunk_n_cols * (use_pipeline ? row_stride : vec_dot_size), HMX_FP16_TILE_SIZE);
-    const size_t activation_area_size = hex_align_up(m_chunk_n_rows * vec_dot_size, HMX_FP16_TILE_SIZE);
-    const size_t output_area_size = hex_align_up(
-        m_chunk_n_rows * n_chunk_n_cols * sizeof(__fp16), HMX_FP16_TILE_SIZE);
+    const size_t weight_area_size = hex_align_up(n_chunk_n_cols * row_stride,   HMX_FP16_TILE_SIZE);
+    const size_t act_area_size    = hex_align_up(m_chunk_n_rows * vec_dot_size, HMX_FP16_TILE_SIZE);
+    const size_t output_area_size = hex_align_up(m_chunk_n_rows * n_chunk_n_cols * sizeof(__fp16), HMX_FP16_TILE_SIZE);
 
     size_t scratch0_size, scratch1_size, scratch2_size;
-    if (use_pipeline) {
-        scratch0_size = hex_align_up(n_chunk_n_cols * vec_dot_size, HMX_FP16_TILE_SIZE);  // dequant buf 0
-        scratch1_size = scratch0_size;                                                    // dequant buf 1
-        scratch2_size = output_area_size;                                                 // output buf 1
-    } else {
-        scratch0_size = hex_align_up(n_chunk_n_cols * row_stride, HMX_FP16_TILE_SIZE);    // x4x2 DMA buf 0
-        scratch1_size = scratch0_size;                                                    // x4x2 DMA buf 1
-        scratch2_size = 0;                                                                // unused
-    }
+    scratch0_size = hex_align_up(n_chunk_n_cols * vec_dot_size, HMX_FP16_TILE_SIZE);  // dequant buf 0
+    scratch1_size = scratch0_size;                                                    // dequant buf 1
+    scratch2_size = output_area_size;                                                 // output  buf 1
 
     uint8_t *vtcm_ptr        = (uint8_t *) ctx->vtcm_base;
     __fp16  *vtcm_weight     = (__fp16 *) vtcm_seq_alloc(&vtcm_ptr, weight_area_size);
-    __fp16  *vtcm_activation = (__fp16 *) vtcm_seq_alloc(&vtcm_ptr, activation_area_size);
+    __fp16  *vtcm_activation = (__fp16 *) vtcm_seq_alloc(&vtcm_ptr, act_area_size);
     __fp16  *vtcm_output     = (__fp16 *) vtcm_seq_alloc(&vtcm_ptr, output_area_size);
     void    *vtcm_scratch0   = vtcm_seq_alloc(&vtcm_ptr, scratch0_size);
     void    *vtcm_scratch1   = vtcm_seq_alloc(&vtcm_ptr, scratch1_size);
     void    *vtcm_scratch2   = scratch2_size ? vtcm_seq_alloc(&vtcm_ptr, scratch2_size) : NULL;
     __fp16  *vtcm_scales     = (__fp16 *) vtcm_seq_alloc(&vtcm_ptr, 256);
-    if ((size_t)(vtcm_ptr - (uint8_t *)ctx->vtcm_base) > vtcm_budget) {
-        FARF(ERROR, "%s: vtcm overflow: used=%zu limit=%zu", __func__,
-             (size_t)(vtcm_ptr - (uint8_t *)ctx->vtcm_base), vtcm_budget);
+
+    vtcm_used = vtcm_ptr - (uint8_t *) ctx->vtcm_base;
+    if (vtcm_used > vtcm_budget) {
+        FARF(ERROR, "hmx-mm-q: VTCM overflow: used %zu budget %zu", vtcm_used, vtcm_budget);
         return -1;
     }
 
     hmx_init_column_scales(vtcm_scales, Q6_V_vsplat_R(0x3c00));  // scale: 1.0, bias: 0.0 in FP16
 
-    FARF(HIGH, "%s: m=%d k=%d n=%d wtype=%d pipe=%d mc=%zu nc=%zu vtcm=%zu/%zu",
-         __func__, m, k, n, weight_type, use_pipeline,
-         m_chunk_n_rows, n_chunk_n_cols,
-         (size_t)(vtcm_ptr - (uint8_t *)ctx->vtcm_base), vtcm_budget);
+    FARF(HIGH, "hmx-mm-q: standard : m %d k %d n %d wtype %d mc %zu nc %zu vtcm %zu/%zu",
+         m, k, n, weight_type, m_chunk_n_rows, n_chunk_n_cols, vtcm_used, vtcm_budget);
 
     TIMER_DEFINE(activation_load);
     TIMER_DEFINE(weight_load);
@@ -1178,184 +938,115 @@ int hmx_mat_mul_permuted_qk_0_d16a32(struct htp_context *ctx, float *restrict ds
     TIMER_DEFINE(total);
     TIMER_START(total);
 
-    FARF(HIGH, "hmx_matmul_qk: %s mc=%zu nc=%zu vtcm=%zu/%zu",
-         use_pipeline ? "PIPELINE" : "SEQUENTIAL", m_chunk_n_rows, n_chunk_n_cols,
-         (size_t)(vtcm_ptr - (uint8_t *)ctx->vtcm_base), vtcm_budget);
+    // 4-stage pipeline: DMA load (A), dequantize (B), HMX matmul (C), store (D)
+    // HMX compute (C) runs on dedicated worker thread, overlapping with HVX stages (B, D).
 
-    if (!use_pipeline) {
-        HAP_compute_res_hmx_lock(ctx->vtcm_rctx);
-        for (size_t mr = 0; mr < m; mr += m_chunk_n_rows) {
-            // transfer activation matrix chunk into VTCM
-            const size_t n_rows = hex_smin(m - mr, m_chunk_n_rows);
-            const size_t n_row_tiles = hmx_ceil_div(n_rows, HMX_FP16_TILE_N_ROWS);
+    // A --> B: vtcm_qweight, 1 buffer
+    // B --> C: vtcm_weight0/vtcm_weight1, 2 buffers
+    // C --> D: vtcm_output0/vtcm_output1, 2 buffers
 
-            TIMER_START(activation_load);
-            {
-                const float *activation_chunk = activation + mr * k;
-                transfer_activation_chunk_threaded(ctx, vtcm_activation, activation_chunk, n_rows, k, k);
-            }
-            TIMER_STOP(activation_load);
+    // Async timeline (C overlaps B+D):
+    //   main+HVX:   [A0][Act][B0][A1][sub C0][B1‖C0][A2][wait,sub C1][D0+B2‖C1][wait,sub C2][D1‖C2][wait][D2]
+    //   HMX queue:                   [████ C0 ████████][████ C1 ████████████][████ C2 ████████]
 
-            void *buf_curr = vtcm_scratch0;
-            void *buf_next = vtcm_scratch1;
-
-            {
-                const size_t n_cols_first = hex_smin(n, n_chunk_n_cols);
-                dma_queue_push(ctx->dma[0], dma_make_ptr(buf_curr, permuted_weight), row_stride, row_stride, row_stride, n_cols_first);
-            }
-
-            for (size_t nc = 0; nc < n; nc += n_chunk_n_cols) {
-                const size_t n_cols = hex_smin(n - nc, n_chunk_n_cols);
-                const size_t n_col_tiles = hmx_ceil_div(n_cols, HMX_FP16_TILE_N_COLS);
-
-                TIMER_START(weight_load);
-                {
-                    dma_queue_pop(ctx->dma[0]);  // wait until current weight chunk become ready
-
-                    const size_t nc_next = nc + n_chunk_n_cols;
-                    if (nc_next < n) {
-                        const size_t n_cols_next = hex_smin(n - nc_next, n_chunk_n_cols);
-
-                        const uint8_t *next_weight_chunk = permuted_weight + nc_next * row_stride;
-
-                        dma_queue_push(ctx->dma[0], dma_make_ptr(buf_next, next_weight_chunk), row_stride, row_stride, row_stride, n_cols_next);
-                    }
+    int n_chunk_cnt = hmx_ceil_div(n, n_chunk_n_cols);
+    hmx_matmul_job_t job_slots[2];  // persistent double-buffered job descriptors
 
-                    // Dequant + vscatter writes directly to [K, N] transposed tiles.
-                    // HMX computes C = A x B, where A=[M,K] activation, B=[K,N] weight.
-                    dequantize_x4x2_weight_chunk_to_fp16_tiles(ctx, vtcm_weight, buf_curr, n_cols, k, row_stride, weight_type);
+    for (size_t mr = 0; mr < m; mr += m_chunk_n_rows) {
+        const size_t n_rows = hex_smin(m - mr, m_chunk_n_rows);
 
-                    hex_swap_ptr(&buf_curr, &buf_next);
-                }
-                TIMER_STOP(weight_load);
+        void *vtcm_qweight        = vtcm_weight;
+        void *vtcm_weight_bufs[2] = { vtcm_scratch0, vtcm_scratch1 };
+        void *vtcm_output_bufs[2] = { vtcm_output,   vtcm_scratch2 };
 
-                TIMER_START(hmx_core);
-                {
-                    core_dot_chunk_fp16(vtcm_output, vtcm_activation, vtcm_weight, vtcm_scales, n_row_tiles, n_col_tiles, k / 32);
-                }
-                TIMER_STOP(hmx_core);
-
-                TIMER_START(output_store);
-                {
-                    float *output = dst + (mr * n + nc);
-                    transfer_output_chunk_threaded(ctx, output, vtcm_output, n_rows, n_cols, n);
-                }
-                TIMER_STOP(output_store);
-            }
+        // prologue: A0
+        const size_t n_cols_A0 = hex_smin(n - 0 * n_chunk_n_cols, n_chunk_n_cols);
+        {
+            const uint8_t *qweight_chunk_A0 = permuted_weight;
+            dma_queue_push(ctx->dma[0], dma_make_ptr(vtcm_qweight, qweight_chunk_A0), row_stride, row_stride, row_stride, n_cols_A0);
         }
-        HAP_compute_res_hmx_unlock(ctx->vtcm_rctx);
-    } else {
-        // 4-stage pipeline: DMA load (A), dequantize (B), HMX matmul (C), store (D)
-        // HMX compute (C) runs on dedicated worker thread, overlapping with HVX stages (B, D).
-
-        // A --> B: vtcm_qweight, 1 buffer
-        // B --> C: vtcm_weight0/vtcm_weight1, 2 buffers
-        // C --> D: vtcm_output0/vtcm_output1, 2 buffers
-
-        // Async timeline (C overlaps B+D):
-        //   main+HVX:   [A0][Act][B0][A1][sub C0][B1‖C0][A2][wait,sub C1][D0+B2‖C1][wait,sub C2][D1‖C2][wait][D2]
-        //   HMX queue:                   [████ C0 ████████][████ C1 ████████████][████ C2 ████████]
-
-        int n_chunk_cnt = hmx_ceil_div(n, n_chunk_n_cols);
-        hmx_matmul_job_t job_slots[2];  // persistent double-buffered job descriptors
 
-        for (size_t mr = 0; mr < m; mr += m_chunk_n_rows) {
-            const size_t n_rows = hex_smin(m - mr, m_chunk_n_rows);
-
-            void *vtcm_qweight        = vtcm_weight;
-            void *vtcm_weight_bufs[2] = { vtcm_scratch0, vtcm_scratch1 };
-            void *vtcm_output_bufs[2] = { vtcm_output, vtcm_scratch2 };
+        {
+            const float *activation_chunk = activation + mr * k;
+            transfer_activation_chunk_threaded(ctx, vtcm_activation, activation_chunk, n_rows, k, k);
+        }
 
-            // prologue: A0
-            const size_t n_cols_A0 = hex_smin(n - 0 * n_chunk_n_cols, n_chunk_n_cols);
-            {
-                // Use 2D DMA (n_cols rows x row_stride) to avoid 16-bit roiwidth overflow.
-                const uint8_t *qweight_chunk_A0 = permuted_weight;
-                dma_queue_push(ctx->dma[0], dma_make_ptr(vtcm_qweight, qweight_chunk_A0), row_stride, row_stride, row_stride, n_cols_A0);
+        // prologue: B0, A1, submit C0 (async), B1 (overlaps C0)
+        {
+            // B0: wait for DMA, dequant weight chunk 0
+            dma_queue_pop(ctx->dma[0]);
+            dequantize_x4x2_weight_chunk_to_fp16_tiles(ctx, vtcm_weight_bufs[0], vtcm_qweight, n_cols_A0, k, row_stride, weight_type);
+
+            // A1: issue DMA for weight chunk 1
+            const size_t n_cols_A1 = hex_smin(n - 1 * n_chunk_n_cols, n_chunk_n_cols);
+            if (1 < n_chunk_cnt) {
+                const uint8_t *qweight_chunk_A1 = permuted_weight + n_chunk_n_cols * row_stride;
+                dma_queue_push(ctx->dma[0], dma_make_ptr(vtcm_qweight, qweight_chunk_A1), row_stride, row_stride, row_stride, n_cols_A1);
             }
 
-            {
-                const float *activation_chunk = activation + mr * k;
-                transfer_activation_chunk_threaded(ctx, vtcm_activation, activation_chunk, n_rows, k, k);
-            }
+            // submit C0 (non-blocking — HMX worker executes in parallel)
+            hmx_matmul_job_init(&job_slots[0], (__fp16 *) vtcm_output_bufs[0], (__fp16 *) vtcm_activation,
+                                (__fp16 *) vtcm_weight_bufs[0], vtcm_scales,
+                                hmx_ceil_div(n_rows, HMX_FP16_TILE_N_ROWS),
+                                hmx_ceil_div(n_cols_A0, HMX_FP16_TILE_N_COLS), k / HMX_FP16_TILE_N_ROWS);
+            hmx_queue_push(ctx->hmx_queue, hmx_queue_make_desc(hmx_matmul_worker_fn, &job_slots[0]));
 
-            // prologue: B0, A1, submit C0 (async), B1 (overlaps C0)
-            {
-                // B0: wait for DMA, dequant weight chunk 0
+            // B1: DMA pop + dequant (runs in parallel with C0 on HMX worker)
+            if (1 < n_chunk_cnt) {
                 dma_queue_pop(ctx->dma[0]);
-                dequantize_x4x2_weight_chunk_to_fp16_tiles(ctx, vtcm_weight_bufs[0], vtcm_qweight, n_cols_A0, k, row_stride, weight_type);
-
-                // A1: issue DMA for weight chunk 1
-                const size_t n_cols_A1 = hex_smin(n - 1 * n_chunk_n_cols, n_chunk_n_cols);
-                if (1 < n_chunk_cnt) {
-                    const uint8_t *qweight_chunk_A1 = permuted_weight + n_chunk_n_cols * row_stride;
-                    dma_queue_push(ctx->dma[0], dma_make_ptr(vtcm_qweight, qweight_chunk_A1), row_stride, row_stride, row_stride, n_cols_A1);
-                }
-
-                // submit C0 (non-blocking — HMX worker executes in parallel)
-                hmx_matmul_job_init(&job_slots[0], (__fp16 *) vtcm_output_bufs[0], (__fp16 *) vtcm_activation,
-                                    (__fp16 *) vtcm_weight_bufs[0], vtcm_scales,
-                                    hmx_ceil_div(n_rows, HMX_FP16_TILE_N_ROWS),
-                                    hmx_ceil_div(n_cols_A0, HMX_FP16_TILE_N_COLS), k / HMX_FP16_TILE_N_ROWS);
-                hmx_queue_push(ctx->hmx_queue, hmx_queue_make_desc(hmx_matmul_worker_fn, &job_slots[0]));
-
-                // B1: DMA pop + dequant (runs in parallel with C0 on HMX worker)
-                if (1 < n_chunk_cnt) {
-                    dma_queue_pop(ctx->dma[0]);
-                    dequantize_x4x2_weight_chunk_to_fp16_tiles(ctx, vtcm_weight_bufs[1], vtcm_qweight, n_cols_A1, k, row_stride, weight_type);
-                }
+                dequantize_x4x2_weight_chunk_to_fp16_tiles(ctx, vtcm_weight_bufs[1], vtcm_qweight, n_cols_A1, k, row_stride, weight_type);
             }
+        }
 
-            // main loop: wait C_i → submit C_{i+1} → D_i + B_{i+2} (parallel with C_{i+1})
-            for (int i = 0; i < n_chunk_cnt; ++i) {
-                const size_t nc    = i * n_chunk_n_cols;
-                const size_t nc_p1 = nc + 1 * n_chunk_n_cols;
-                const size_t nc_p2 = nc + 2 * n_chunk_n_cols;
+        // main loop: wait C_i → submit C_{i+1} → D_i + B_{i+2} (parallel with C_{i+1})
+        for (int i = 0; i < n_chunk_cnt; ++i) {
+            const size_t nc    = i * n_chunk_n_cols;
+            const size_t nc_p1 = nc + 1 * n_chunk_n_cols;
+            const size_t nc_p2 = nc + 2 * n_chunk_n_cols;
 
-                const size_t n_cols    = hex_smin(n - nc, n_chunk_n_cols);
-                const size_t n_cols_p1 = hex_smin(n - nc_p1, n_chunk_n_cols);
-                const size_t n_cols_p2 = hex_smin(n - nc_p2, n_chunk_n_cols);
+            const size_t n_cols    = hex_smin(n - nc, n_chunk_n_cols);
+            const size_t n_cols_p1 = hex_smin(n - nc_p1, n_chunk_n_cols);
+            const size_t n_cols_p2 = hex_smin(n - nc_p2, n_chunk_n_cols);
 
-                // issue A_{i+2}: DMA push (non-blocking)
-                if (i + 2 < n_chunk_cnt) {
-                    const uint8_t *qweight_chunk_p2 = permuted_weight + nc_p2 * row_stride;
-                    dma_queue_push(ctx->dma[0], dma_make_ptr(vtcm_qweight, qweight_chunk_p2), row_stride, row_stride, row_stride, n_cols_p2);
-                }
+            // issue A_{i+2}: DMA push (non-blocking)
+            if (i + 2 < n_chunk_cnt) {
+                const uint8_t *qweight_chunk_p2 = permuted_weight + nc_p2 * row_stride;
+                dma_queue_push(ctx->dma[0], dma_make_ptr(vtcm_qweight, qweight_chunk_p2), row_stride, row_stride, row_stride, n_cols_p2);
+            }
 
-                // wait C_i: block until prologue/previous C completes
-                hmx_queue_pop(ctx->hmx_queue);
-
-                // submit C_{i+1} (non-blocking, overlaps with D_i + B_{i+2} below)
-                // job_slots[(i+1)%2] is safe: C_i just completed, freeing slot i%2's
-                // counterpart — and (i+1)%2 was last used by C_{i-1} which completed
-                // before C_i was submitted.
-                if (i + 1 < n_chunk_cnt) {
-                    hmx_matmul_job_init(&job_slots[(i + 1) % 2], (__fp16 *) vtcm_output_bufs[(i + 1) % 2],
-                                        (__fp16 *) vtcm_activation, (__fp16 *) vtcm_weight_bufs[(i + 1) % 2],
-                                        vtcm_scales, hmx_ceil_div(n_rows, HMX_FP16_TILE_N_ROWS),
-                                        hmx_ceil_div(n_cols_p1, HMX_FP16_TILE_N_COLS), k / HMX_FP16_TILE_N_ROWS);
-                    hmx_queue_push(ctx->hmx_queue, hmx_queue_make_desc(hmx_matmul_worker_fn, &job_slots[(i + 1) % 2]));
-                }
+            // wait C_i: block until prologue/previous C completes
+            hmx_queue_pop(ctx->hmx_queue);
+
+            // submit C_{i+1} (non-blocking, overlaps with D_i + B_{i+2} below)
+            // job_slots[(i+1)%2] is safe: C_i just completed, freeing slot i%2's
+            // counterpart — and (i+1)%2 was last used by C_{i-1} which completed
+            // before C_i was submitted.
+            if (i + 1 < n_chunk_cnt) {
+                hmx_matmul_job_init(&job_slots[(i + 1) % 2], (__fp16 *) vtcm_output_bufs[(i + 1) % 2],
+                                    (__fp16 *) vtcm_activation, (__fp16 *) vtcm_weight_bufs[(i + 1) % 2],
+                                    vtcm_scales, hmx_ceil_div(n_rows, HMX_FP16_TILE_N_ROWS),
+                                    hmx_ceil_div(n_cols_p1, HMX_FP16_TILE_N_COLS), k / HMX_FP16_TILE_N_ROWS);
+                hmx_queue_push(ctx->hmx_queue, hmx_queue_make_desc(hmx_matmul_worker_fn, &job_slots[(i + 1) % 2]));
+            }
 
-                // D_i: store output (multi-thread HVX, parallel with C_{i+1})
-                float *output_chunk = dst + (mr * n + nc);
-                transfer_output_chunk_threaded(ctx, output_chunk, vtcm_output_bufs[i % 2], n_rows, n_cols, n);
+            // D_i: store output (multi-thread HVX, parallel with C_{i+1})
+            float *output_chunk = dst + (mr * n + nc);
+            transfer_output_chunk_threaded(ctx, output_chunk, vtcm_output_bufs[i % 2], n_rows, n_cols, n);
 
-                // B_{i+2}: DMA pop + dequant (multi-thread HVX, parallel with C_{i+1})
-                if (i + 2 < n_chunk_cnt) {
-                    dma_queue_pop(ctx->dma[0]);
-                    dequantize_x4x2_weight_chunk_to_fp16_tiles(ctx, vtcm_weight_bufs[(i + 2) % 2], vtcm_qweight, n_cols_p2, k, row_stride, weight_type);
-                }
+            // B_{i+2}: DMA pop + dequant (multi-thread HVX, parallel with C_{i+1})
+            if (i + 2 < n_chunk_cnt) {
+                dma_queue_pop(ctx->dma[0]);
+                dequantize_x4x2_weight_chunk_to_fp16_tiles(ctx, vtcm_weight_bufs[(i + 2) % 2], vtcm_qweight, n_cols_p2, k, row_stride, weight_type);
             }
         }
-
-        hmx_queue_suspend(ctx->hmx_queue);
     }
 
+    hmx_queue_suspend(ctx->hmx_queue);
+
     TIMER_STOP(total);
 
 #if defined(ENABLE_PROFILE_TIMERS)
-    FARF(HIGH, "%s: %lld us, m=%d k=%d n=%d pipeline=%d", __func__, TIMER_US(total), m, k, n, use_pipeline);
+    FARF(HIGH, "hex-mm-q: %lld us : m %d k %d n %d", TIMER_US(total), m, k, n);
     if (!use_pipeline) {
         FARF(HIGH, "  activation_load: %lld us, weight_load: %lld us, hmx_core: %lld us, output_store: %lld us",
              TIMER_US(activation_load), TIMER_US(weight_load), TIMER_US(hmx_core), TIMER_US(output_store));
@@ -1370,15 +1061,15 @@ int hmx_mat_mul_permuted_qk_0_d16a32(struct htp_context *ctx, float *restrict ds
 
 //
 
-static inline int hmx_matmul_batch_r2(const hmx_matmul_w16a32_batched_params_t *params) {
+static inline int hmx_matmul_batch_r2(const hmx_matmul_f16_f32_batched_params_t *params) {
     return params->ne02 > 0 ? params->ne12 / params->ne02 : 1;
 }
 
-static inline int hmx_matmul_batch_r3(const hmx_matmul_w16a32_batched_params_t *params) {
+static inline int hmx_matmul_batch_r3(const hmx_matmul_f16_f32_batched_params_t *params) {
     return params->ne03 > 0 ? params->ne13 / params->ne03 : 1;
 }
 
-static inline const __fp16 *hmx_matmul_weight_batch_ptr(const hmx_matmul_w16a32_batched_params_t *params,
+static inline const __fp16 *hmx_matmul_weight_batch_ptr(const hmx_matmul_f16_f32_batched_params_t *params,
                                                         int dst_b2, int dst_b3) {
     const int r2 = hmx_matmul_batch_r2(params);
     const int r3 = hmx_matmul_batch_r3(params);
@@ -1387,37 +1078,36 @@ static inline const __fp16 *hmx_matmul_weight_batch_ptr(const hmx_matmul_w16a32_
                              (size_t) (dst_b3 / r3) * params->src0_nb3);
 }
 
-static inline const float *hmx_matmul_activation_batch_ptr(const hmx_matmul_w16a32_batched_params_t *params,
+static inline const float *hmx_matmul_activation_batch_ptr(const hmx_matmul_f16_f32_batched_params_t *params,
                                                            int dst_b2, int dst_b3) {
     return (const float *) ((const uint8_t *) params->activation +
                             (size_t) dst_b2 * params->src1_nb2 +
                             (size_t) dst_b3 * params->src1_nb3);
 }
 
-static inline float *hmx_matmul_dst_batch_ptr(const hmx_matmul_w16a32_batched_params_t *params,
+static inline float *hmx_matmul_dst_batch_ptr(const hmx_matmul_f16_f32_batched_params_t *params,
                                               int dst_b2, int dst_b3) {
     return (float *) ((uint8_t *) params->dst +
                       (size_t) dst_b2 * params->dst_nb2 +
                       (size_t) dst_b3 * params->dst_nb3);
 }
 
-static int hmx_mat_mul_permuted_w16a32_batched_legacy(struct htp_context *ctx,
-                                                      const hmx_matmul_w16a32_batched_params_t *params) {
+static int hmx_matmul_f16_f32_batched_legacy(struct htp_context *ctx,
+                                                      const hmx_matmul_f16_f32_batched_params_t *params) {
     int ret = 0;
     for (int b3 = 0; b3 < params->ne13 && ret == 0; ++b3) {
         for (int b2 = 0; b2 < params->ne12 && ret == 0; ++b2) {
-            ret = hmx_mat_mul_permuted_w16a32(ctx,
-                                              hmx_matmul_dst_batch_ptr(params, b2, b3),
-                                              hmx_matmul_activation_batch_ptr(params, b2, b3),
-                                              hmx_matmul_weight_batch_ptr(params, b2, b3),
-                                              params->m, params->k, params->n,
-                                              params->act_stride, params->weight_stride);
+            ret = hmx_matmul_f16_f32(ctx, hmx_matmul_dst_batch_ptr(params, b2, b3),
+                                           hmx_matmul_activation_batch_ptr(params, b2, b3),
+                                           hmx_matmul_weight_batch_ptr(params, b2, b3),
+                                           params->m, params->k, params->n,
+                                           params->act_stride, params->weight_stride);
         }
     }
     return ret;
 }
 
-int hmx_mat_mul_permuted_w16a32_batched(struct htp_context *ctx, const hmx_matmul_w16a32_batched_params_t *params) {
+int hmx_matmul_f16_f32_batched(struct htp_context *ctx, const hmx_matmul_f16_f32_batched_params_t *params) {
     if (!ctx || !params || !params->dst || !params->activation || !params->permuted_weight) { return -1; }
     if (!params->m || !params->k || !params->n) { return -1; }
     if (params->act_stride < params->k || params->weight_stride < params->k || params->dst_stride < params->n) { return -1; }
@@ -1435,7 +1125,7 @@ int hmx_mat_mul_permuted_w16a32_batched(struct htp_context *ctx, const hmx_matmu
 
     if (group_size <= 1) {
         FARF(HIGH, "%s: no dim2 GQA reuse (group=%d), using legacy batched loop", __func__, group_size);
-        return hmx_mat_mul_permuted_w16a32_batched_legacy(ctx, params);
+        return hmx_matmul_f16_f32_batched_legacy(ctx, params);
     }
 
     // Grouped path: reuse interleaved weight across all q_heads sharing a
@@ -1464,7 +1154,7 @@ int hmx_mat_mul_permuted_w16a32_batched(struct htp_context *ctx, const hmx_matmu
                            /*m_block_cost=*/(size_t) params->n,
                            /*n_block_cost=*/(size_t) params->m, &m_chunk_n_rows, &n_chunk_n_cols, &vtcm_used) != 0) {
         FARF(HIGH, "%s: grouped path does not fit VTCM, falling back to legacy batched loop", __func__);
-        return hmx_mat_mul_permuted_w16a32_batched_legacy(ctx, params);
+        return hmx_matmul_f16_f32_batched_legacy(ctx, params);
     }
 
     const size_t act_head_stride      = m_chunk_n_rows * (size_t) params->k;  // fp16 elements between heads
@@ -1486,7 +1176,7 @@ int hmx_mat_mul_permuted_w16a32_batched(struct htp_context *ctx, const hmx_matmu
 
     if ((size_t) (vtcm_ptr - (uint8_t *) ctx->vtcm_base) > vtcm_budget) {
         FARF(HIGH, "%s: grouped layout overflowed VTCM, falling back to legacy batched loop", __func__);
-        return hmx_mat_mul_permuted_w16a32_batched_legacy(ctx, params);
+        return hmx_matmul_f16_f32_batched_legacy(ctx, params);
     }
 
     hmx_init_column_scales(vtcm_scales, Q6_V_vsplat_R(0x3c00));  // scale: 1.0, bias: 0.0 in FP16
@@ -1614,7 +1304,7 @@ int hmx_mat_mul_permuted_w16a32_batched(struct htp_context *ctx, const hmx_matmu
 
 //
 
-int hmx_mat_mul_permuted_w16a32(struct htp_context *ctx, float *restrict dst, const float *restrict activation,
+int hmx_matmul_f16_f32(struct htp_context *ctx, float *restrict dst, const float *restrict activation,
                                 const __fp16 *restrict permuted_weight, int m, int k, int n,
                                 int act_stride, int weight_stride) {
     if (!dst || !activation || !permuted_weight || !m || !n || !k) { return -1; }
diff --git a/cpp/ggml-hexagon/htp/hmx-ops.h b/cpp/ggml-hexagon/htp/hmx-ops.h
index 21c8b524..6fc53a13 100644
--- a/cpp/ggml-hexagon/htp/hmx-ops.h
+++ b/cpp/ggml-hexagon/htp/hmx-ops.h
@@ -33,14 +33,14 @@ typedef struct {
     size_t        src1_nb3;
     size_t        dst_nb2;
     size_t        dst_nb3;
-} hmx_matmul_w16a32_batched_params_t;
+} hmx_matmul_f16_f32_batched_params_t;
 
 // HMX matrix multiplication — tile-permuted FP16 weights, FP32 activation/output
 // act_stride: activation row stride in elements (= k for contiguous, or
 //             nb[1]/sizeof(float) for permuted tensors like attention Q).
 // weight_stride: weight row stride in elements (= k for compact weights, or
 //                nb[1]/sizeof(__fp16) for permuted KV-cache views used by QK).
-int hmx_mat_mul_permuted_w16a32(struct htp_context *ctx,
+int hmx_matmul_f16_f32(struct htp_context *ctx,
                                 float *restrict dst,
                                 const float *activation,
                                 const __fp16 *permuted_weight,
@@ -48,13 +48,12 @@ int hmx_mat_mul_permuted_w16a32(struct htp_context *ctx,
                                 int act_stride,
                                 int weight_stride);
 
-// Batched F16 wrapper over hmx_mat_mul_permuted_w16a32.
+// Batched F16 wrapper over hmx_mat_mul_f16_f32.
 // Batch semantics match lm_ggml_mul_mat(): src0 broadcasts to src1 in dims 2/3.
-int hmx_mat_mul_permuted_w16a32_batched(struct htp_context *ctx,
-                                        const hmx_matmul_w16a32_batched_params_t *params);
+int hmx_matmul_f16_f32_batched(struct htp_context *ctx, const hmx_matmul_f16_f32_batched_params_t *params);
 
-// HMX matrix multiplication — tile-permuted quantised weights (Q4_0/Q8_0/IQ4_NL)
-int hmx_mat_mul_permuted_qk_0_d16a32(struct htp_context *ctx,
+// HMX matrix multiplication — quantised weights (Q4_0/Q8_0/IQ4_NL/MXFP4)
+int hmx_matmul_q_f32(struct htp_context *ctx,
                                       float *restrict dst,
                                       const float *activation,
                                       const uint8_t *permuted_weight,
diff --git a/cpp/ggml-hexagon/htp/main.c b/cpp/ggml-hexagon/htp/main.c
index f80de9bb..2a2cab8f 100644
--- a/cpp/ggml-hexagon/htp/main.c
+++ b/cpp/ggml-hexagon/htp/main.c
@@ -87,35 +87,37 @@ AEEResult htp_iface_open(const char * uri, remote_handle64 * handle) {
         }
     }
 
+#if __HVX_ARCH__ >= 75
     {
-        // Power on HMX
+        // Power on HMX and set HMX clock
         HAP_power_request_t request;
         memset(&request, 0, sizeof(HAP_power_request_t));
-        request.type         = HAP_power_set_HMX;
-        request.hmx.power_up = TRUE;
-        FARF(ALWAYS, "Powering HMX on\n");
-        err = HAP_power_set((void *) &ctx, &request);
+        request.type = HAP_power_set_HMX_v2;
+        request.hmx_v2.set_power     = TRUE;
+        request.hmx_v2.power_up      = TRUE;
+        request.hmx_v2.set_clock     = TRUE;
+        request.hmx_v2.target_corner = HAP_DCVS_EXP_VCORNER_MAX;
+        request.hmx_v2.min_corner    = HAP_DCVS_EXP_VCORNER_MAX;
+        request.hmx_v2.max_corner    = HAP_DCVS_EXP_VCORNER_MAX;
+        request.hmx_v2.perf_mode     = HAP_CLK_PERF_HIGH;
+        FARF(ALWAYS, "Setting HMX clock\n");
+        err = HAP_power_set((void *) ctx, &request);
         if (err != AEE_SUCCESS) {
-            FARF(ERROR, "Error powering on HMX.");
+            FARF(ERROR, "Error setting HMX clock.");
             return err;
         }
     }
-
-#if __HVX_ARCH__ >= 75
+#else
     {
-        // Set HMX clock
+        // Power on HMX
         HAP_power_request_t request;
         memset(&request, 0, sizeof(HAP_power_request_t));
-        request.type = HAP_power_set_HMX_v2;
-        request.hmx_v2.set_clock = TRUE;
-        request.hmx_v2.target_corner = HAP_DCVS_EXP_VCORNER_MAX;
-        request.hmx_v2.min_corner = HAP_DCVS_EXP_VCORNER_MAX;
-        request.hmx_v2.max_corner = HAP_DCVS_EXP_VCORNER_MAX;
-        request.hmx_v2.perf_mode = HAP_CLK_PERF_HIGH;
-        FARF(ALWAYS, "Setting HMX clock\n");
-        err = HAP_power_set((void *) &ctx, &request);
+        request.type         = HAP_power_set_HMX;
+        request.hmx.power_up = TRUE;
+        FARF(ALWAYS, "Powering HMX on\n");
+        err = HAP_power_set((void *) ctx, &request);
         if (err != AEE_SUCCESS) {
-            FARF(ERROR, "Error setting HMX clock.");
+            FARF(ERROR, "Error powering on HMX.");
             return err;
         }
     }
diff --git a/cpp/ggml-hexagon/htp/matmul-ops.c b/cpp/ggml-hexagon/htp/matmul-ops.c
index f0f06774..32b4269e 100644
--- a/cpp/ggml-hexagon/htp/matmul-ops.c
+++ b/cpp/ggml-hexagon/htp/matmul-ops.c
@@ -2995,7 +2995,6 @@ int op_matmul(struct htp_ops_context * octx) {
     //  is handled by HMX itself; when M < 32  fall back to HVX.
     const int m_total = (int) src1->ne[1];
     const int m_hmx   = m_total & ~31;   // 0 when M < 32
-
     if (m_hmx == 0) {
         return op_matmul_hvx(octx);
     }
@@ -3020,7 +3019,7 @@ int op_matmul(struct htp_ops_context * octx) {
 
     if (src0->type == HTP_TYPE_F16) {
         if (is_batched) {
-            hmx_matmul_w16a32_batched_params_t batch_params = {
+            hmx_matmul_f16_f32_batched_params_t batch_params = {
                 .dst             = (float *) dst->data,
                 .activation      = (float *) src1->data,
                 .permuted_weight = (const __fp16 *) src0->data,
@@ -3041,15 +3040,14 @@ int op_matmul(struct htp_ops_context * octx) {
                 .dst_nb2         = dst->nb[2],
                 .dst_nb3         = dst->nb[3],
             };
-            ret = hmx_mat_mul_permuted_w16a32_batched(octx->ctx, &batch_params);
+            ret = hmx_matmul_f16_f32_batched(octx->ctx, &batch_params);
         } else {
-            ret = hmx_mat_mul_permuted_w16a32(octx->ctx,
+            ret = hmx_matmul_f16_f32(octx->ctx,
                     (float*) dst->data, (float*) src1->data, (const __fp16 *) src0->data,
                     m_total, k, n, act_stride, wgt_stride);
         }
     } else {
-        ret = hmx_mat_mul_permuted_qk_0_d16a32(octx->ctx,
-                    (float*) dst->data, (float*) src1->data, (const uint8_t *) src0->data,
+        ret = hmx_matmul_q_f32(octx->ctx, (float*) dst->data, (float*) src1->data, (const uint8_t *) src0->data,
                     m_total, k, n, (int) src0->type);
     }
 
diff --git a/cpp/ggml-hexagon/htp/rope-ops.c b/cpp/ggml-hexagon/htp/rope-ops.c
index dc93b25c..0948a96e 100644
--- a/cpp/ggml-hexagon/htp/rope-ops.c
+++ b/cpp/ggml-hexagon/htp/rope-ops.c
@@ -107,7 +107,7 @@ static inline void rope_yarn_one(float theta, float freq_scale, float * corr_dim
     cache[i0 + 1] = sinf(theta_final) * mscale_final;
 }
 
-static void rope_cache_init(const float    theta_base,
+static __attribute__((noinline)) void rope_cache_init(const float    theta_base,
                             const float    freq_scale,
                             const float *  freq_factors,
                             float *        corr_dims,
@@ -129,7 +129,7 @@ static void rope_cache_init(const float    theta_base,
 
 // pos_t/h/w/e: the four position ids for this sequence step (t=time, h=height, w=width, e=extra).
 // sections[4]: number of head dims assigned to each position component.
-static void mrope_cache_init(const float    pos_t,
+static __attribute__((noinline)) void mrope_cache_init(const float    pos_t,
                              const float    pos_h,
                              const float    pos_w,
                              const float    pos_e,
diff --git a/cpp/ggml-hexagon/htp/ssm-conv.c b/cpp/ggml-hexagon/htp/ssm-conv.c
index 6d1b6a42..cebd5c0a 100644
--- a/cpp/ggml-hexagon/htp/ssm-conv.c
+++ b/cpp/ggml-hexagon/htp/ssm-conv.c
@@ -20,55 +20,56 @@
 #include "htp-ops.h"
 #include "hvx-utils.h"
 
-#define htp_ssm_conv_tensors_preamble                          \
-    const struct htp_tensor * restrict src0    = octx->src[0]; \
-    const struct htp_tensor * restrict src1    = octx->src[1]; \
-    const struct htp_tensor * restrict dst     = octx->dst;    \
-    struct htp_spad * restrict src0_spad = &octx->src0_spad; \
-    struct htp_spad * restrict src1_spad = &octx->src1_spad; \
-    struct htp_spad * restrict dst_spad  = &octx->dst_spad;  \
-                                                             \
-    const uint32_t ne00 = src0->ne[0];                       \
-    const uint32_t ne01 = src0->ne[1];                       \
-    const uint32_t ne02 = src0->ne[2];                       \
-    const uint32_t ne03 = src0->ne[3];                       \
-                                                             \
-    const uint32_t ne10 = src1->ne[0];                       \
-    const uint32_t ne11 = src1->ne[1];                       \
-    const uint32_t ne12 = src1->ne[2];                       \
-    const uint32_t ne13 = src1->ne[3];                       \
-                                                             \
-    const uint32_t ne0 = dst->ne[0];                         \
-    const uint32_t ne1 = dst->ne[1];                         \
-    const uint32_t ne2 = dst->ne[2];                         \
-    const uint32_t ne3 = dst->ne[3];                         \
-                                                             \
-    const uint32_t nb00 = src0->nb[0];                       \
-    const uint32_t nb01 = src0->nb[1];                       \
-    const uint32_t nb02 = src0->nb[2];                       \
-    const uint32_t nb03 = src0->nb[3];                       \
-                                                             \
-    const uint32_t nb10 = src1->nb[0];                       \
-    const uint32_t nb11 = src1->nb[1];                       \
-    const uint32_t nb12 = src1->nb[2];                       \
-    const uint32_t nb13 = src1->nb[3];                       \
-                                                             \
-    const uint32_t nb0 = dst->nb[0];                         \
-    const uint32_t nb1 = dst->nb[1];                         \
-    const uint32_t nb2 = dst->nb[2];                         \
+#define htp_ssm_conv_tensors_preamble                           \
+    const struct htp_tensor * restrict src0 = octx->src[0];     \
+    const struct htp_tensor * restrict src1 = octx->src[1];     \
+    const struct htp_tensor * restrict dst  = octx->dst;        \
+    struct htp_spad * restrict src0_spad    = &octx->src0_spad; \
+    struct htp_spad * restrict src1_spad    = &octx->src1_spad; \
+    struct htp_spad * restrict dst_spad     = &octx->dst_spad;  \
+                                                                \
+    const uint32_t ne00 = src0->ne[0];                          \
+    const uint32_t ne01 = src0->ne[1];                          \
+    const uint32_t ne02 = src0->ne[2];                          \
+    const uint32_t ne03 = src0->ne[3];                          \
+                                                                \
+    const uint32_t ne10 = src1->ne[0];                          \
+    const uint32_t ne11 = src1->ne[1];                          \
+    const uint32_t ne12 = src1->ne[2];                          \
+    const uint32_t ne13 = src1->ne[3];                          \
+                                                                \
+    const uint32_t ne0 = dst->ne[0];                            \
+    const uint32_t ne1 = dst->ne[1];                            \
+    const uint32_t ne2 = dst->ne[2];                            \
+    const uint32_t ne3 = dst->ne[3];                            \
+                                                                \
+    const uint32_t nb00 = src0->nb[0];                          \
+    const uint32_t nb01 = src0->nb[1];                          \
+    const uint32_t nb02 = src0->nb[2];                          \
+    const uint32_t nb03 = src0->nb[3];                          \
+                                                                \
+    const uint32_t nb10 = src1->nb[0];                          \
+    const uint32_t nb11 = src1->nb[1];                          \
+    const uint32_t nb12 = src1->nb[2];                          \
+    const uint32_t nb13 = src1->nb[3];                          \
+                                                                \
+    const uint32_t nb0 = dst->nb[0];                            \
+    const uint32_t nb1 = dst->nb[1];                            \
+    const uint32_t nb2 = dst->nb[2];                            \
     const uint32_t nb3 = dst->nb[3];
 
 struct htp_ssm_conv_context {
     struct htp_ops_context * octx;
     uint32_t nrows_per_thread;
+    uint32_t d_inner_tile;
     uint64_t t_start;
 };
 
-#define htp_ssm_conv_preamble                            \
+#define htp_ssm_conv_preamble                                                   \
     struct htp_ssm_conv_context * scctx = (struct htp_ssm_conv_context *) data; \
-    struct htp_ops_context * octx = scctx->octx;         \
-    htp_ssm_conv_tensors_preamble;                       \
-    dma_queue * dma_queue         = octx->ctx->dma[ith];
+    struct htp_ops_context *      octx  = scctx->octx;                          \
+    htp_ssm_conv_tensors_preamble;                                              \
+    dma_queue * dma_queue = octx->ctx->dma[ith];
 
 // Scalar FP32 SSM_CONV implementation
 static void ssm_conv_thread_f32_f32(unsigned int nth, unsigned int ith, void *data) {
@@ -128,118 +129,211 @@ static void ssm_conv_thread_f32_f32(unsigned int nth, unsigned int ith, void *da
          dst->ne[2], dst->ne[3], (unsigned) HAP_perf_qtimer_count_to_us(t2 - t1));
 }
 
-// HVX FP32 SSM_CONV implementation - vectorizes across d_inner dimension
-static void ssm_conv_thread_f32_f32_hvx(unsigned int nth, unsigned int ith, void *data) {
-    htp_ssm_conv_preamble;
-
-    uint64_t t1, t2;
-    t1 = HAP_perf_get_qtimer_count();
 
-    const int nc  = src1->ne[0]; // d_conv
-    const int ncs = src0->ne[0]; // d_conv - 1 + n_t
+// In-register 32x32 fp32 transpose using std 5-stage HVX vshuff butterfly.
+static inline void hvx_transpose_32x32_f32(HVX_Vector m[32]) {
+    HVX_Vector tmp[32];
 
-    const uint32_t d_conv  = src1->ne[0];
-    const uint32_t d_inner = src0->ne[1];
-    const uint32_t n_t     = dst->ne[1];
-    const uint32_t n_s     = dst->ne[2];
+    // Stage 0 (R = -4): pair (2i, 2i+1) for i = 0..15. m -> tmp.
+    for (int i = 0; i < 16; ++i) {
+        HVX_VectorPair p = Q6_W_vshuff_VVR(m[2*i + 1], m[2*i], -4);
+        tmp[2*i + 0] = Q6_V_lo_W(p);
+        tmp[2*i + 1] = Q6_V_hi_W(p);
+    }
 
-    const float * src0_data = (const float *) src0->data;
-    const float * src1_data = (const float *) src1->data;
-    float *       dst_data  = (float *) dst->data;
+    // Stage 1 (R = -8): per block of 4, pair (b+0, b+2) and (b+1, b+3). tmp -> m.
+    for (int b = 0; b < 32; b += 4) {
+        HVX_VectorPair p0 = Q6_W_vshuff_VVR(tmp[b + 2], tmp[b + 0], -8);
+        HVX_VectorPair p1 = Q6_W_vshuff_VVR(tmp[b + 3], tmp[b + 1], -8);
+        m[b + 0] = Q6_V_lo_W(p0); m[b + 1] = Q6_V_hi_W(p0);
+        m[b + 2] = Q6_V_lo_W(p1); m[b + 3] = Q6_V_hi_W(p1);
+    }
 
-    // Calculate row range for this thread
-    const int dr = scctx->nrows_per_thread;
-    const uint32_t ir0 = dr * ith;
-    const uint32_t ir1 = MIN(ir0 + dr, d_inner);
-    const uint32_t ir  = ir1 - ir0;
+    // Stage 2 (R = -16): per block of 8, pair (b+i, b+i+4) for i = 0..3. m -> tmp.
+    for (int b = 0; b < 32; b += 8) {
+        for (int i = 0; i < 4; ++i) {
+            HVX_VectorPair p = Q6_W_vshuff_VVR(m[b + i + 4], m[b + i], -16);
+            tmp[b + 2*i + 0] = Q6_V_lo_W(p);
+            tmp[b + 2*i + 1] = Q6_V_hi_W(p);
+        }
+    }
 
-    if (ir0 >= ir1) {
-        return;  // No work for this thread
+    // Stage 3 (R = -32): per block of 16, pair (b+i, b+i+8) for i = 0..7. tmp -> m.
+    for (int b = 0; b < 32; b += 16) {
+        for (int i = 0; i < 8; ++i) {
+            HVX_VectorPair p = Q6_W_vshuff_VVR(tmp[b + i + 8], tmp[b + i], -32);
+            m[b + 2*i + 0] = Q6_V_lo_W(p);
+            m[b + 2*i + 1] = Q6_V_hi_W(p);
+        }
     }
 
-    // src0 and src1 gather offsets
-    uint32_t __attribute__((aligned(VLEN))) src0_offsets[VLEN_FP32] = { 0 };
-    uint32_t __attribute__((aligned(VLEN))) src1_offsets[VLEN_FP32] = { 0 };
+    // Stage 4 (R = -64): pair (i, i+16) for i = 0..15. m -> tmp -> m.
+    for (int i = 0; i < 16; ++i) {
+        HVX_VectorPair p = Q6_W_vshuff_VVR(m[i + 16], m[i], -64);
+        tmp[2 * i + 0]   = Q6_V_lo_W(p);
+        tmp[2 * i + 1]   = Q6_V_hi_W(p);
+    }
 
-    for (uint32_t i = 0; i < VLEN_FP32; ++i) {
-        src0_offsets[i] = i * (ncs)    * sizeof(float);
-        src1_offsets[i] = i * (d_conv) * sizeof(float);
+    for (int i = 0; i < 32; ++i) {
+        m[i] = tmp[i];
     }
+}
 
-    const uint32_t src0_gather_len = VLEN * ncs;
-    const uint32_t src1_gather_len = VLEN * d_conv;
+// HVX FP32 SSM_CONV implementation - channel-vectorized HVX kernel with src0/src1
+// transposed into VTCM.
+//
+// VTCM layouts (per thread):
+//   src1_T : {d_inner_per_thread, d_conv}   — staged once per launch (small).
+//   src0_T : {d_inner_tile,     ncs}        — staged per d_inner-tile.
+//
+// d_inner_tile is chosen so that per-thread VTCM stays under the budget.
+// Each thread iterates ceil(d_inner_per_thread d_inner_tile) tiles serially.
+#define HTP_SSM_CONV_VTCM_BUDGET (1u << 20) // 1 MiB per thread
+
+// Scalar transpose: src1 {d_conv, d_inner} (DDR) -> {d_inner_per_thread, d_conv} (VTCM)
+static inline void transpose_src1(const float * src1_data,
+                                  uint32_t      src1_stride_inner,
+                                  uint32_t      i1_off,
+                                  uint32_t      d_inner_per_thread,
+                                  uint32_t      d_conv,
+                                  float *       src1_T) {
+    for (uint32_t i = 0; i < d_inner_per_thread; ++i) {
+        const float * src_row = src1_data + (i1_off + i) * src1_stride_inner;
+        for (uint32_t j = 0; j < d_conv; ++j) {
+            src1_T[j * d_inner_per_thread + i] = src_row[j];
+        }
+    }
+}
 
-    // gather scratchpads
-    HVX_Vector * src0_vec = (HVX_Vector *) (octx->ctx->vtcm_base + ith * VLEN*2 + 0);
-    HVX_Vector * src1_vec = (HVX_Vector *) (octx->ctx->vtcm_base + ith * VLEN*2 + VLEN);
+// HVX 32x32 src0 transpose: src0 {ncs, d_inner} (DDR) -> src0_T {d_inner_tile, ncs} (VTCM)
+static inline void transpose_src0_block(const float * src0_block,
+                                        uint32_t      ncs,
+                                        uint32_t      cb_n,
+                                        uint32_t      d_inner_tile,
+                                        float *       src0_T_block_dst,
+                                        uint32_t      cb /* dst column offset */) {
+    const uint32_t T_TILE = VLEN_FP32;
+
+    HVX_Vector __attribute__((aligned(VLEN))) sub[32];
+
+    for (uint32_t t0 = 0; t0 < ncs; t0 += T_TILE) {
+        const uint32_t t_n = MIN(T_TILE, ncs - t0);
+
+        // Load 32 rows (channels) of T_TILE samples; pad missing channels with zeros.
+        for (uint32_t r = 0; r < cb_n; ++r) {
+            const float * src_row = src0_block + r * ncs + t0;
+            if (t_n == T_TILE) {
+                sub[r] = *(const HVX_UVector *) src_row;
+            } else {
+                HVX_Vector v = hvx_vec_splat_f32(0.0f);
+                hvx_vec_store_u(&v, t_n * sizeof(float), hvx_vec_splat_f32(0.0f));
+
+                float __attribute__((aligned(VLEN))) tmp[VLEN_FP32] = { 0 };
+                for (uint32_t k = 0; k < t_n; ++k) tmp[k] = src_row[k];
+                v = *(const HVX_Vector *) tmp;
+                sub[r] = v;
+            }
+        }
+        for (uint32_t r = cb_n; r < T_TILE; ++r) {
+            sub[r] = hvx_vec_splat_f32(0.0f);
+        }
 
-    float * data_src0 = (float *) ((char *) src0->data + ir0 * src0->nb[1]);
-    float * data_src1 = (float *) ((char *) src1->data + ir0 * src1->nb[1]);
+        hvx_transpose_32x32_f32(sub);
 
-    uint8_t * spad_src0 = octx->src0_spad.data + ith * octx->src0_spad.size_per_thread;
-    uint8_t * spad_src1 = octx->src1_spad.data + ith * octx->src1_spad.size_per_thread;
+        // Store transposed sub-tile to src0_T at offsets (t0 + j) * d_inner_tile + cb.
+        // Only write the valid t_n rows of the transposed result.
+        for (uint32_t r = 0; r < t_n; ++r) {
+            float * dst = src0_T_block_dst + (t0 + r) * d_inner_tile + cb;
+            if (cb_n == T_TILE) {
+                *(HVX_UVector *) dst = sub[r];
+            } else {
+                hvx_vec_store_u(dst, cb_n * sizeof(float), sub[r]);
+            }
+        }
+    }
+}
 
-    // copy src1 workload to VTCM
-    dma_queue_push_ddr_to_vtcm(dma_queue, dma_make_ptr(spad_src1, data_src1), nb11, nb11, ir);
+static void ssm_conv_thread_f32_f32_hvx(unsigned int nth, unsigned int ith, void *data) {
+    htp_ssm_conv_preamble;
 
-    // FARF(HIGH, "ssm-conv-src1-fetch %d: ir0 %u size %u\n", ith, ir0, nb11 * ir);
+    uint64_t t1, t2;
+    t1 = HAP_perf_get_qtimer_count();
 
-    for (uint32_t i3 = 0; i3 < n_s; ++i3) {
-        float * src0_data_ptr = (float *) ((char *) data_src0 + i3 * (src0->nb[2]));
+    const uint32_t d_conv  = src1->ne[0];
+    const uint32_t d_inner = src0->ne[1];
+    const uint32_t n_t     = dst->ne[1];
+    const uint32_t n_s     = dst->ne[2];
+    const uint32_t ncs     = src0->ne[0];
 
-        // copy src0 workload to VTCM
-        dma_queue_push_ddr_to_vtcm(dma_queue, dma_make_ptr(spad_src0, src0_data_ptr), nb01, nb01, ir);
+    const uint32_t src0_stride_inner = src0->nb[1] / sizeof(float);
+    const uint32_t src0_stride_seq   = src0->nb[2] / sizeof(float);
+    const uint32_t src1_stride_inner = src1->nb[1] / sizeof(float);
+    const uint32_t dst_stride_token  = dst->nb[1]  / sizeof(float);
+    const uint32_t dst_stride_seq    = dst->nb[2]  / sizeof(float);
 
-        // FARF(HIGH, "ssm-conv-src0-fetch %d: ir0 %u i3 %u size %u\n", ith, ir0, i3, nb01 * ir);
+    const uint32_t dr  = scctx->nrows_per_thread;
+    const uint32_t ir0 = dr * ith;
+    const uint32_t ir1 = MIN(ir0 + dr, d_inner);
 
-        dma_queue_flush(dma_queue);
+    if (ir0 >= ir1) {
+        return;
+    }
 
-        for (uint32_t i2 = 0; i2 < n_t; ++i2) {
-            float * dst_ptr = (float *) ((char *) dst->data + ir0 * (dst->nb[0]) + i2 * (dst->nb[1]) + i3 * (dst->nb[2]));
+    const uint32_t d_inner_per_thread = ir1 - ir0;
+    const uint32_t d_inner_tile       = scctx->d_inner_tile;
 
-            const uint32_t nvec = ir / VLEN_FP32;
-            const uint32_t nloe = ir % VLEN_FP32;
-            uint32_t i1 = 0;
+    const float * src0_data = (const float *) src0->data;
+    const float * src1_data = (const float *) src1->data;
+    float       * dst_data  = (float       *) dst->data;
 
-            for (uint32_t vi1 = 0; vi1 < nvec; vi1++) {
-                HVX_Vector acc_vec = Q6_V_vsplat_R(0);
+    // Per-thread VTCM regions.
+    float * src0_T = (float *)(octx->src0_spad.data + ith * octx->src0_spad.size_per_thread);
+    float * src1_T = (float *)(octx->src1_spad.data + ith * octx->src1_spad.size_per_thread);
 
-                for (uint32_t i0 = 0; i0 < d_conv; ++i0) {
-                    uint32_t src0_base = (uint32_t) spad_src0 + (i0 + i1 * ncs) * sizeof(float) + i2 * (src0->nb[0]);
-                    uint32_t src1_base = (uint32_t) spad_src1 + (i0 + i1 * nc)  * sizeof(float);
-                    Q6_vgather_ARMVw(src0_vec, src0_base, src0_gather_len, (*(const HVX_Vector *) src0_offsets));
-                    Q6_vgather_ARMVw(src1_vec, src1_base, src1_gather_len, (*(const HVX_Vector *) src1_offsets));
+    // Stage src1 weights once into VTCM in {d_inner_per_thread, d_conv} layout.
+    transpose_src1(src1_data, src1_stride_inner, ir0, d_inner_per_thread, d_conv, src1_T);
 
-                    HVX_Vector prod = Q6_Vqf32_vmpy_VsfVsf(*(const HVX_Vector *) src0_vec, *(const HVX_Vector *) src1_vec);
-                    acc_vec = Q6_Vqf32_vadd_Vqf32Vqf32(acc_vec, prod);
-                }
+    const uint32_t C_TILE = VLEN_FP32;
 
-                *(HVX_UVector *) (dst_ptr + i1) = Q6_Vsf_equals_Vqf32(acc_vec);
-                i1 += VLEN_FP32;
-            }
+    for (uint32_t i3 = 0; i3 < n_s; ++i3) {
+        for (uint32_t tile_off = 0; tile_off < d_inner_per_thread; tile_off += d_inner_tile) {
+            const uint32_t tile_n = MIN(d_inner_tile, d_inner_per_thread - tile_off);
 
-            if (nloe) {
-                HVX_Vector acc_vec = Q6_V_vsplat_R(0);
+            // Place src0 chunk into VTCM in {d_inner_tile, ncs} layout.
+            const float * src0_block = src0_data + i3 * src0_stride_seq + (ir0 + tile_off) * src0_stride_inner;
 
-                for (uint32_t i0 = 0; i0 < d_conv; ++i0) {
-                    uint32_t src0_base = (uint32_t) spad_src0 + (i0 + i1 * ncs) * sizeof(float) + i2 * (src0->nb[0]);
-                    uint32_t src1_base = (uint32_t) spad_src1 + (i0 + i1 * nc)  * sizeof(float);
-                    Q6_vgather_ARMVw(src0_vec, src0_base, src0_gather_len, (*(const HVX_Vector *) src0_offsets));
-                    Q6_vgather_ARMVw(src1_vec, src1_base, src1_gather_len, (*(const HVX_Vector *) src1_offsets));
+            for (uint32_t cb = 0; cb < tile_n; cb += C_TILE) {
+                const uint32_t cb_n = MIN(C_TILE, tile_n - cb);
+                transpose_src0_block(src0_block + cb * src0_stride_inner, ncs, cb_n, d_inner_tile, src0_T, cb);
+            }
 
-                    HVX_Vector prod = Q6_Vqf32_vmpy_VsfVsf(*(const HVX_Vector *) src0_vec, *(const HVX_Vector *) src1_vec);
-                    acc_vec = Q6_Vqf32_vadd_Vqf32Vqf32(acc_vec, prod);
+            for (uint32_t t = 0; t < n_t; ++t) {
+                for (uint32_t cb = 0; cb < tile_n; cb += C_TILE) {
+                    const uint32_t cb_n = MIN(C_TILE, tile_n - cb);
+
+                    HVX_Vector acc = hvx_vec_splat_f32(0.0f);
+                    for (uint32_t j = 0; j < d_conv; ++j) {
+                        HVX_Vector x = *(const HVX_Vector *) (src0_T + (t + j) * d_inner_tile + cb);
+                        HVX_Vector w = *(const HVX_Vector *) (src1_T + j * d_inner_per_thread + tile_off + cb);
+                        acc          = Q6_Vqf32_vadd_Vqf32Vqf32(acc, Q6_Vqf32_vmpy_VsfVsf(x, w));
+                    }
+                    HVX_Vector res = Q6_Vsf_equals_Vqf32(acc);
+
+                    float * dst_ptr = dst_data + i3 * dst_stride_seq + t * dst_stride_token + (ir0 + tile_off + cb);
+                    if (cb_n == C_TILE) {
+                        *(HVX_UVector *) dst_ptr = res;
+                    } else {
+                        hvx_vec_store_u(dst_ptr, cb_n * sizeof(float), res);
+                    }
                 }
-
-                hvx_vec_store_u(dst_ptr + i1, (ir - i1) * 4, Q6_Vsf_equals_Vqf32(acc_vec));
             }
         }
     }
 
     t2 = HAP_perf_get_qtimer_count();
 
-    FARF(HIGH, "ssm-conv-f32-hvx %d/%d: %ux%ux%ux%u (%u:%u) * %ux%ux%ux%u -> %ux%ux%ux%u usec %u\n",
-         ith, nth, src0->ne[0], src0->ne[1], src0->ne[2], src0->ne[3], ir0, ir1,
+    FARF(HIGH, "ssm-conv-f32-hvx %d/%d: %ux%ux%ux%u (%u:%u) tile=%u * %ux%ux%ux%u -> %ux%ux%ux%u usec %u\n",
+         ith, nth, src0->ne[0], src0->ne[1], src0->ne[2], src0->ne[3], ir0, ir1, d_inner_tile,
          src1->ne[0], src1->ne[1], src1->ne[2], src1->ne[3], dst->ne[0], dst->ne[1],
          dst->ne[2], dst->ne[3], (unsigned) HAP_perf_qtimer_count_to_us(t2 - t1));
 }
@@ -264,46 +358,44 @@ int op_ssm_conv_f32(struct htp_ops_context * octx) {
 
     if (!(octx->flags & HTP_OPFLAGS_SKIP_COMPUTE)) {
         uint32_t use_hvx = 0;
-        if (d_inner >= VLEN_FP32 && d_inner % VLEN_FP32 == 0) {
-            int is_aligned = hex_is_aligned((void *) src0->data, VLEN) &&
-                             hex_is_aligned((void *) src1->data, VLEN) &&
-                             hex_is_aligned((void *) dst->data, VLEN);
-
-            if (is_aligned) {
-                use_hvx = 1;
-            }
+        if (d_inner >= VLEN_FP32 && n_t >= VLEN_FP32) {
+            use_hvx = 1;
         }
 
-        if (use_hvx) {
-            scctx.nrows_per_thread  = (d_inner + n_threads - 1) / n_threads; // d_inner chunks per thread
-            scctx.nrows_per_thread += (scctx.nrows_per_thread & 1); // round up to even
+        scctx.nrows_per_thread  = (d_inner + n_threads - 1) / n_threads;
+        scctx.nrows_per_thread += (scctx.nrows_per_thread & 1);
 
-            octx->src0_spad.size_per_thread = hex_round_up(scctx.nrows_per_thread * nb01, 256);
-            octx->src1_spad.size_per_thread = hex_round_up(scctx.nrows_per_thread * nb11, 256);
-            octx->dst_spad.size_per_thread  = hex_round_up(scctx.nrows_per_thread * sizeof(float), 256);
+        const uint32_t d_inner_per_thread = scctx.nrows_per_thread;
+        const uint32_t ncs                = src0->ne[0];
 
-            octx->src0_spad.size = octx->src0_spad.size_per_thread * n_threads;
-            octx->src1_spad.size = octx->src1_spad.size_per_thread * n_threads;
-            octx->dst_spad.size  = octx->dst_spad.size_per_thread  * n_threads;
+        const uint32_t src1_T_size = hex_round_up(d_conv * d_inner_per_thread * sizeof(float), 256);
+        const uint32_t src0_T_max = HTP_SSM_CONV_VTCM_BUDGET > src1_T_size ? HTP_SSM_CONV_VTCM_BUDGET - src1_T_size : 0;
 
-            // Compute gather scratchpad size for src0 and src1
-            const size_t gather_spad_size = n_threads * VLEN * 2;
+        uint32_t d_inner_tile = (src0_T_max / sizeof(float)) / ncs;
+        d_inner_tile -= (d_inner_tile % VLEN_FP32);
+        if (d_inner_tile == 0) {
+            FARF(HIGH, "ssm_conv-f32: inner tile rounds to 0 (ncs=%u), falling back to scalar\n", ncs);
+            use_hvx = 0;
+        } else {
+            scctx.d_inner_tile = d_inner_tile;
 
-            octx->src0_spad.data = octx->ctx->vtcm_base + gather_spad_size;     octx->src0_spad.src = NULL;
-            octx->src1_spad.data = octx->src0_spad.data + octx->src0_spad.size; octx->src1_spad.src = NULL;
-            octx->dst_spad.data  = octx->src1_spad.data + octx->src1_spad.size; octx->dst_spad.src  = NULL;
+            octx->src0_spad.size_per_thread = hex_round_up(d_inner_tile * ncs * sizeof(float), 256);
+            octx->src1_spad.size_per_thread = src1_T_size;
+            octx->dst_spad.size_per_thread  = 0;
 
-            FARF(HIGH, "ssm_conv-f32: gather-spad:%zu spad-per-thread:(%u:%u:%u) spad-sizes:(%u:%u:%u) spad-data:(%p:%p:%p)\n",
-                gather_spad_size, octx->src0_spad.size_per_thread, octx->src1_spad.size_per_thread,
-                octx->dst_spad.size_per_thread, octx->src0_spad.size, octx->src1_spad.size, octx->dst_spad.size,
-                octx->src0_spad.data, octx->src1_spad.data, octx->dst_spad.data);
+            octx->src0_spad.size = octx->src0_spad.size_per_thread * n_threads;
+            octx->src1_spad.size = octx->src1_spad.size_per_thread * n_threads;
+            octx->dst_spad.size  = 0;
 
-            const size_t total_spad_size =
-                gather_spad_size + octx->src0_spad.size + octx->src1_spad.size + octx->dst_spad.size;
+            octx->src0_spad.data = octx->ctx->vtcm_base;
+            octx->src1_spad.data = octx->src0_spad.data + octx->src0_spad.size;
+            octx->src0_spad.src  = NULL;
+            octx->src1_spad.src  = NULL;
 
-            if (total_spad_size > octx->ctx->vtcm_size) {
-                FARF(HIGH, "ssm_conv-f32: HVX scratchpad size %zu exceeds VTCM size %zu", total_spad_size,
-                     octx->ctx->vtcm_size);
+            const size_t total_spad = octx->src0_spad.size + octx->src1_spad.size;
+            if (total_spad > octx->ctx->vtcm_size) {
+                FARF(HIGH, "ssm_conv-f32: scratchpad %zu exceeds VTCM %zu, falling back to scalar\n",
+                     total_spad, octx->ctx->vtcm_size);
                 use_hvx = 0;
             }
         }
diff --git a/cpp/ggml-metal/ggml-metal-ops.cpp b/cpp/ggml-metal/ggml-metal-ops.cpp
index a198242d..78b5917c 100644
--- a/cpp/ggml-metal/ggml-metal-ops.cpp
+++ b/cpp/ggml-metal/ggml-metal-ops.cpp
@@ -564,9 +564,20 @@ int lm_ggml_metal_op_concat(lm_ggml_metal_op_t ctx, int idx) {
     lm_ggml_metal_encoder_set_buffer  (enc, lm_ggml_metal_get_buffer_id(op->src[1]), 2);
     lm_ggml_metal_encoder_set_buffer  (enc, lm_ggml_metal_get_buffer_id(op),         3);
 
-    const int nth = std::min(1024, ne0);
+    int nth = std::min(256, ne0);
 
-    lm_ggml_metal_encoder_dispatch_threadgroups(enc, ne1, ne2, ne3, nth, 1, 1);
+    // when rows are small, we can batch them together in a single threadgroup
+    int nrptg = 1;
+    if (nth < 256) {
+        nrptg = std::min((256 + nth - 1) / nth, ne1);
+        if (nrptg * nth > 256) {
+            nrptg = 256 / nth;
+        }
+    }
+
+    const int nw0 = (ne1 + nrptg - 1) / nrptg;
+
+    lm_ggml_metal_encoder_dispatch_threadgroups(enc, nw0, ne2, ne3, nth, nrptg, 1);
 
     return 1;
 }
@@ -1786,7 +1797,7 @@ int lm_ggml_metal_op_set(lm_ggml_metal_op_t ctx, int idx) {
         nk0 = ne10/lm_ggml_blck_size(op->type);
     }
 
-    int nth = std::min<int>(nk0, lm_ggml_metal_pipeline_max_theads_per_threadgroup(pipeline));
+    int nth = std::min<int>(nk0*ne11, 256);
 
     // when rows are small, we can batch them together in a single threadgroup
     int nrptg = 1;
@@ -1797,7 +1808,7 @@ int lm_ggml_metal_op_set(lm_ggml_metal_op_t ctx, int idx) {
             nrptg = (nth + nk0 - 1)/nk0;
             nth   = nk0;
 
-            if (nrptg*nth > lm_ggml_metal_pipeline_max_theads_per_threadgroup(pipeline)) {
+            if (nrptg*nth > 256) {
                 nrptg--;
             }
         }
diff --git a/cpp/ggml-metal/ggml-metal.metal b/cpp/ggml-metal/ggml-metal.metal
index 576c3aef..32f2b631 100644
--- a/cpp/ggml-metal/ggml-metal.metal
+++ b/cpp/ggml-metal/ggml-metal.metal
@@ -10555,7 +10555,11 @@ kernel void kernel_concat(
 
     const int i3 = tgpig.z;
     const int i2 = tgpig.y;
-    const int i1 = tgpig.x;
+    const int i1 = ntg.y == 1 ? tgpig.x : tgpig.x*ntg.y + tpitg.y;
+
+    if (i1 >= args.ne1) {
+        return;
+    }
 
     int o[4] = {0, 0, 0, 0};
     o[args.dim] = args.dim == 0 ? args.ne00 : (args.dim == 1 ? args.ne01 : (args.dim == 2 ? args.ne02 : args.ne03));
diff --git a/cpp/ggml-opencl/ggml-opencl.cpp b/cpp/ggml-opencl/ggml-opencl.cpp
index 06fa274a..64e5f38e 100644
--- a/cpp/ggml-opencl/ggml-opencl.cpp
+++ b/cpp/ggml-opencl/ggml-opencl.cpp
@@ -375,6 +375,11 @@ struct lm_ggml_backend_opencl_device_context {
     lm_ggml_backend_buffer_type buffer_type;
 
     cl_context context = nullptr;
+
+    GPU_FAMILY     gpu_family = GPU_FAMILY::UNKNOWN;
+    ADRENO_GPU_GEN adreno_gen = ADRENO_GPU_GEN::ADRENO_UNKNOWN;
+
+    size_t global_mem_size = 0;
 };
 
 // backend context
@@ -384,6 +389,18 @@ struct lm_ggml_backend_opencl_context {
     cl_device_id device;
     std::string device_name;
 
+    lm_ggml_cl_version platform_version;
+    lm_ggml_cl_version opencl_c_version;
+
+    // argsort is loaded in supports_op because its availability depends on how
+    // many workgroups are allowed, which requires kernel compilation.
+    bool kernels_loaded_argsort = false;
+    // flash attn is loaded in supports_op because it contains multiple variants
+    // and takes time to compile, so we want to only compile it when needed.
+    bool kernels_loaded_flash_attn = false;
+    // rest of the kernels are currently always loaded in alloc_buffer.
+    bool kernels_loaded = false;
+
     std::string driver_version;
 
     GPU_FAMILY gpu_family;
@@ -781,6 +798,8 @@ struct lm_ggml_backend_opencl_context {
 #endif // LM_GGML_OPENCL_USE_ADRENO_KERNELS
 
     void free() {
+        clFinish(queue);
+
         ref_count--;
         if (ref_count == 0) {
 #ifdef LM_GGML_OPENCL_PROFILING
@@ -793,6 +812,9 @@ struct lm_ggml_backend_opencl_context {
 
 // All registered devices with a default device in the front.
 static std::vector<lm_ggml_backend_device> g_lm_ggml_backend_opencl_devices;
+// All device contexts associated with the devices above.
+// The devices live as long as the process, so do the contexts.
+static std::vector<std::unique_ptr<lm_ggml_backend_opencl_device_context>> g_lm_ggml_backend_opencl_dev_ctxs;
 
 inline std::string read_file(const std::string &path) {
   std::ifstream ifs(path);
@@ -836,12 +858,120 @@ static cl_program build_program_from_source(cl_context ctx, cl_device_id dev, co
     return p;
 }
 
-static void load_cl_kernels(lm_ggml_backend_opencl_context *backend_ctx, lm_ggml_cl_version opencl_c_version) {
+static void load_cl_kernels_argsort(lm_ggml_backend_opencl_context *backend_ctx) {
+    // compiler options for general kernels
+    auto opencl_c_std =
+        std::string("CL") + std::to_string(backend_ctx->opencl_c_version.major) + "." + std::to_string(backend_ctx->opencl_c_version.minor);
+    std::string compile_opts = std::string("-cl-std=") + opencl_c_std +
+                               " -cl-mad-enable -cl-unsafe-math-optimizations"
+                               " -cl-finite-math-only -cl-fast-relaxed-math";
+
+    // argsort
+    if (!backend_ctx->kernels_loaded_argsort) {
+        cl_int err;
+#ifdef LM_GGML_OPENCL_EMBED_KERNELS
+        const std::string kernel_src {
+            #include "argsort.cl.h"
+        };
+#else
+        const std::string kernel_src = read_file("argsort.cl");
+#endif
+        backend_ctx->program_argsort_f32_i32 =
+            build_program_from_source(backend_ctx->context, backend_ctx->device, kernel_src.c_str(), compile_opts);
+
+        CL_CHECK((backend_ctx->kernel_argsort_f32_i32 = clCreateKernel(backend_ctx->program_argsort_f32_i32, "kernel_argsort_f32_i32", &err), err));
+        backend_ctx->kernels_loaded_argsort = true;
+    }
+}
+
+static void load_cl_kernels_flash_attn(lm_ggml_backend_opencl_context *backend_ctx) {
+    // compiler options for general kernels
+    auto opencl_c_std =
+        std::string("CL") + std::to_string(backend_ctx->opencl_c_version.major) + "." + std::to_string(backend_ctx->opencl_c_version.minor);
+    std::string compile_opts = std::string("-cl-std=") + opencl_c_std +
+                               " -cl-mad-enable -cl-unsafe-math-optimizations"
+                               " -cl-finite-math-only -cl-fast-relaxed-math";
+
+    // flash_attn
+    if (!backend_ctx->kernels_loaded_flash_attn) {
+        cl_int err;
+
+        #ifdef LM_GGML_OPENCL_EMBED_KERNELS
+                const std::string kernel_src_f16 {
+                    #include "flash_attn_f16.cl.h"
+                };
+                const std::string kernel_src_f32 {
+                    #include "flash_attn_f32.cl.h"
+                };
+                const std::string kernel_src_f32_f16 {
+                    #include "flash_attn_f32_f16.cl.h"
+                };
+        #else
+                const std::string kernel_src_f16 = read_file("flash_attn_f16.cl");
+                const std::string kernel_src_f32 = read_file("flash_attn_f32.cl");
+                const std::string kernel_src_f32_f16 = read_file("flash_attn_f32_f16.cl");
+        #endif
+
+        if (!kernel_src_f16.empty() && !kernel_src_f32.empty() && !kernel_src_f32_f16.empty()) {
+            const struct { int dk; int dv; int bm; int bn; } fa_dims[] = {
+                { 40,  40, 32, 32}, { 64,  64, 64, 64}, { 80,  80, 64, 32}, { 96,  96, 64, 32},
+                {112, 112, 32, 32}, {128, 128, 32, 32}, {192, 128, 16, 16},
+                {192, 192, 16, 16}, {256, 256, 16, 16},
+            };
+
+            for (size_t i = 0; i < sizeof(fa_dims)/sizeof(fa_dims[0]); ++i) {
+                const int dk = fa_dims[i].dk;
+                const int dv = fa_dims[i].dv;
+                const int bm = fa_dims[i].bm;
+                const int bn = fa_dims[i].bn;
+                std::string OPTS = compile_opts +
+                    " -D DK=" + std::to_string(dk) +
+                    " -D DV=" + std::to_string(dv) +
+                    " -D BLOCK_M=" + std::to_string(bm) +
+                    " -D BLOCK_N=" + std::to_string(bn);
+
+                cl_program prog_f16 = build_program_from_source(backend_ctx->context, backend_ctx->device, kernel_src_f16.c_str(), OPTS);
+                cl_kernel k_f16, k_f16_q1;
+                CL_CHECK((k_f16 = clCreateKernel(prog_f16, "flash_attn_f16", &err), err));
+                CL_CHECK((k_f16_q1 = clCreateKernel(prog_f16, "flash_attn_f16_q1", &err), err));
+                backend_ctx->kernels_flash_attn_f16[{dk, dv}] = k_f16;
+                backend_ctx->kernels_flash_attn_f16_q1[{dk, dv}] = k_f16_q1;
+                CL_CHECK(clReleaseProgram(prog_f16));
+
+                cl_program prog_f32 = build_program_from_source(backend_ctx->context, backend_ctx->device, kernel_src_f32.c_str(), OPTS);
+                cl_kernel k_f32, k_f32_q1;
+                CL_CHECK((k_f32 = clCreateKernel(prog_f32, "flash_attn_f32", &err), err));
+                CL_CHECK((k_f32_q1 = clCreateKernel(prog_f32, "flash_attn_f32_q1", &err), err));
+                backend_ctx->kernels_flash_attn_f32[{dk, dv}] = k_f32;
+                backend_ctx->kernels_flash_attn_f32_q1[{dk, dv}] = k_f32_q1;
+                CL_CHECK(clReleaseProgram(prog_f32));
+
+                cl_program prog_f32_f16 = build_program_from_source(backend_ctx->context, backend_ctx->device, kernel_src_f32_f16.c_str(), OPTS);
+                cl_kernel k_f32_f16, k_f32_f16_q1;
+                CL_CHECK((k_f32_f16 = clCreateKernel(prog_f32_f16, "flash_attn_f32_f16", &err), err));
+                CL_CHECK((k_f32_f16_q1 = clCreateKernel(prog_f32_f16, "flash_attn_f32_f16_q1", &err), err));
+                backend_ctx->kernels_flash_attn_f32_f16[{dk, dv}] = k_f32_f16;
+                backend_ctx->kernels_flash_attn_f32_f16_q1[{dk, dv}] = k_f32_f16_q1;
+                CL_CHECK(clReleaseProgram(prog_f32_f16));
+
+                backend_ctx->kernels_flash_attn_bm[{dk, dv}] = bm;
+                backend_ctx->kernels_flash_attn_bn[{dk, dv}] = bn;
+            }
+            backend_ctx->kernels_loaded_flash_attn = true;
+        }
+    }
+}
+
+static void load_cl_kernels(lm_ggml_backend_opencl_context *backend_ctx) {
+    if (backend_ctx->kernels_loaded) {
+        return;
+    }
+
     cl_int err;
 
     // compiler options for general kernels
     auto opencl_c_std =
-        std::string("CL") + std::to_string(opencl_c_version.major) + "." + std::to_string(opencl_c_version.minor);
+        std::string("CL") + std::to_string(backend_ctx->opencl_c_version.major) + "." + std::to_string(backend_ctx->opencl_c_version.minor);
     std::string compile_opts = std::string("-cl-std=") + opencl_c_std +
                                " -cl-mad-enable -cl-unsafe-math-optimizations"
                                " -cl-finite-math-only -cl-fast-relaxed-math";
@@ -1986,89 +2116,6 @@ static void load_cl_kernels(lm_ggml_backend_opencl_context *backend_ctx, lm_ggml
         LM_GGML_LOG_CONT(".");
     }
 
-    // flash_attn
-    {
-        #ifdef LM_GGML_OPENCL_EMBED_KERNELS
-                const std::string kernel_src_f16 {
-                    #include "flash_attn_f16.cl.h"
-                };
-                const std::string kernel_src_f32 {
-                    #include "flash_attn_f32.cl.h"
-                };
-                const std::string kernel_src_f32_f16 {
-                    #include "flash_attn_f32_f16.cl.h"
-                };
-        #else
-                const std::string kernel_src_f16 = read_file("flash_attn_f16.cl");
-                const std::string kernel_src_f32 = read_file("flash_attn_f32.cl");
-                const std::string kernel_src_f32_f16 = read_file("flash_attn_f32_f16.cl");
-        #endif
-
-        if (!kernel_src_f16.empty() && !kernel_src_f32.empty() && !kernel_src_f32_f16.empty()) {
-            const struct { int dk; int dv; int bm; int bn; } fa_dims[] = {
-                { 40,  40, 32, 32}, { 64,  64, 64, 64}, { 80,  80, 64, 32}, { 96,  96, 64, 32},
-                {112, 112, 32, 32}, {128, 128, 32, 32}, {192, 128, 16, 16},
-                {192, 192, 16, 16}, {256, 256, 16, 16},
-            };
-
-            for (size_t i = 0; i < sizeof(fa_dims)/sizeof(fa_dims[0]); ++i) {
-                const int dk = fa_dims[i].dk;
-                const int dv = fa_dims[i].dv;
-                const int bm = fa_dims[i].bm;
-                const int bn = fa_dims[i].bn;
-                std::string OPTS = compile_opts +
-                    " -D DK=" + std::to_string(dk) +
-                    " -D DV=" + std::to_string(dv) +
-                    " -D BLOCK_M=" + std::to_string(bm) +
-                    " -D BLOCK_N=" + std::to_string(bn);
-
-                cl_program prog_f16 = build_program_from_source(backend_ctx->context, backend_ctx->device, kernel_src_f16.c_str(), OPTS);
-                cl_kernel k_f16, k_f16_q1;
-                CL_CHECK((k_f16 = clCreateKernel(prog_f16, "flash_attn_f16", &err), err));
-                CL_CHECK((k_f16_q1 = clCreateKernel(prog_f16, "flash_attn_f16_q1", &err), err));
-                backend_ctx->kernels_flash_attn_f16[{dk, dv}] = k_f16;
-                backend_ctx->kernels_flash_attn_f16_q1[{dk, dv}] = k_f16_q1;
-                CL_CHECK(clReleaseProgram(prog_f16));
-
-                cl_program prog_f32 = build_program_from_source(backend_ctx->context, backend_ctx->device, kernel_src_f32.c_str(), OPTS);
-                cl_kernel k_f32, k_f32_q1;
-                CL_CHECK((k_f32 = clCreateKernel(prog_f32, "flash_attn_f32", &err), err));
-                CL_CHECK((k_f32_q1 = clCreateKernel(prog_f32, "flash_attn_f32_q1", &err), err));
-                backend_ctx->kernels_flash_attn_f32[{dk, dv}] = k_f32;
-                backend_ctx->kernels_flash_attn_f32_q1[{dk, dv}] = k_f32_q1;
-                CL_CHECK(clReleaseProgram(prog_f32));
-
-                cl_program prog_f32_f16 = build_program_from_source(backend_ctx->context, backend_ctx->device, kernel_src_f32_f16.c_str(), OPTS);
-                cl_kernel k_f32_f16, k_f32_f16_q1;
-                CL_CHECK((k_f32_f16 = clCreateKernel(prog_f32_f16, "flash_attn_f32_f16", &err), err));
-                CL_CHECK((k_f32_f16_q1 = clCreateKernel(prog_f32_f16, "flash_attn_f32_f16_q1", &err), err));
-                backend_ctx->kernels_flash_attn_f32_f16[{dk, dv}] = k_f32_f16;
-                backend_ctx->kernels_flash_attn_f32_f16_q1[{dk, dv}] = k_f32_f16_q1;
-                CL_CHECK(clReleaseProgram(prog_f32_f16));
-
-                backend_ctx->kernels_flash_attn_bm[{dk, dv}] = bm;
-                backend_ctx->kernels_flash_attn_bn[{dk, dv}] = bn;
-            }
-            LM_GGML_LOG_CONT(".");
-        }
-    }
-
-    // argsort
-    {
-#ifdef LM_GGML_OPENCL_EMBED_KERNELS
-        const std::string kernel_src {
-            #include "argsort.cl.h"
-        };
-#else
-        const std::string kernel_src = read_file("argsort.cl");
-#endif
-        backend_ctx->program_argsort_f32_i32 =
-            build_program_from_source(backend_ctx->context, backend_ctx->device, kernel_src.c_str(), compile_opts);
-
-        CL_CHECK((backend_ctx->kernel_argsort_f32_i32 = clCreateKernel(backend_ctx->program_argsort_f32_i32, "kernel_argsort_f32_i32", &err), err));
-        LM_GGML_LOG_CONT(".");
-    }
-
     // div
     {
 #ifdef LM_GGML_OPENCL_EMBED_KERNELS
@@ -3335,13 +3382,15 @@ static void load_cl_kernels(lm_ggml_backend_opencl_context *backend_ctx, lm_ggml
     }
 #endif // LM_GGML_OPENCL_USE_ADRENO_KERNELS
     LM_GGML_LOG_CONT("\n");
+    backend_ctx->kernels_loaded = true;
 }
 
 // XXX static lm_ggml_backend_opencl_context * lm_ggml_cl2_init(lm_ggml_backend_dev_t dev) {
 // XXX    static bool initialized = false;
 // XXX    static lm_ggml_backend_opencl_context *backend_ctx = nullptr;
 
-static lm_ggml_backend_opencl_context * lm_ggml_cl2_init(lm_ggml_backend_dev_t dev);
+static lm_ggml_backend_opencl_context * lm_ggml_cl_init(lm_ggml_backend_dev_t dev);
+static bool lm_ggml_opencl_is_device_supported(lm_ggml_backend_dev_t dev);
 
 namespace /* anonymous */ {
 extern struct lm_ggml_backend_device_i lm_ggml_backend_opencl_device_i;
@@ -3554,13 +3603,13 @@ static std::vector<lm_ggml_backend_device> lm_ggml_opencl_probe_devices(lm_ggml_
             /* .context = */ dev_ctx.get(),
         });
 
-        if (!lm_ggml_cl2_init(&found_devices.back())) {
+        if (!lm_ggml_opencl_is_device_supported(&found_devices.back())) {
             found_devices.pop_back();
-            LM_GGML_LOG_INFO("lm_ggml_opencl: drop unsupported device.\n");
+            LM_GGML_LOG_WARN("lm_ggml_opencl: drop unsupported device '%s'.\n", dev->name);
             continue;
         }
 
-        dev_ctx.release();
+        g_lm_ggml_backend_opencl_dev_ctxs.push_back(std::move(dev_ctx));
     }
 
     if (found_devices.size()) {
@@ -3577,8 +3626,79 @@ static std::vector<lm_ggml_backend_device> lm_ggml_opencl_probe_devices(lm_ggml_
     return found_devices;
 }
 
+// check if device should be accepted
+static bool lm_ggml_opencl_is_device_supported(lm_ggml_backend_dev_t dev) {
+    LM_GGML_ASSERT(dev);
+    LM_GGML_ASSERT(dev->context);
+
+    lm_ggml_backend_opencl_device_context * dev_ctx = (lm_ggml_backend_opencl_device_context *) dev->context;
+    LM_GGML_ASSERT(dev_ctx->platform);
+    LM_GGML_ASSERT(dev_ctx->device);
+
+    if (strstr(dev_ctx->device_name.c_str(), "Adreno") ||
+        strstr(dev_ctx->device_name.c_str(), "Qualcomm") ||
+        strstr(dev_ctx->device_version.c_str(), "Adreno")) {
+        dev_ctx->gpu_family = GPU_FAMILY::ADRENO;
+
+        // Usually device version contains the detailed device name
+        dev_ctx->adreno_gen = get_adreno_gpu_gen(dev_ctx->device_version.c_str());
+        if (dev_ctx->adreno_gen == ADRENO_GPU_GEN::ADRENO_UNKNOWN) {
+            dev_ctx->adreno_gen = get_adreno_gpu_gen(dev_ctx->device_name.c_str());
+        }
+    } else if (strstr(dev_ctx->device_name.c_str(), "Intel")) {
+        dev_ctx->gpu_family = GPU_FAMILY::INTEL;
+    } else {
+        LM_GGML_LOG_WARN("lm_ggml_opencl: unsupported GPU '%s'.\n", dev_ctx->device_name.c_str());
+        dev_ctx->gpu_family = GPU_FAMILY::UNKNOWN;
+        return false;
+    }
+
+    lm_ggml_cl_version platform_version = get_opencl_platform_version(dev_ctx->platform);
+
+    // Check device OpenCL version, OpenCL 2.0 or above is required
+    lm_ggml_cl_version opencl_c_version = get_opencl_c_version(platform_version, dev_ctx->device);
+    if (opencl_c_version.major < 2) {
+        LM_GGML_LOG_WARN("lm_ggml_opencl: OpenCL 2.0 or above is required\n");
+        return false;
+    }
+
+#ifdef LM_GGML_OPENCL_USE_ADRENO_KERNELS
+    if (dev_ctx->gpu_family != GPU_FAMILY::ADRENO) {
+        LM_GGML_LOG_WARN("lm_ggml_opencl: Adreno-specific kernels should not be enabled for non-Adreno GPUs; "
+            "run on an Adreno GPU or recompile with CMake option `-DLM_GGML_OPENCL_USE_ADRENO_KERNELS=OFF`\n");
+        return false;
+    }
+#endif
+
+    size_t ext_str_size;
+    clGetDeviceInfo(dev_ctx->device, CL_DEVICE_EXTENSIONS, 0, NULL, &ext_str_size);
+
+    char *ext_buffer = (char *)alloca(ext_str_size + 1);
+    clGetDeviceInfo(dev_ctx->device, CL_DEVICE_EXTENSIONS, ext_str_size, ext_buffer, NULL);
+    ext_buffer[ext_str_size] = '\0';
+
+    // Check if ext_buffer contains cl_khr_fp16
+    bool fp16_support = strstr(ext_buffer, "cl_khr_fp16") != NULL;
+    if (!fp16_support) {
+        LM_GGML_LOG_WARN("lm_ggml_opencl: device does not support FP16\n");
+        return false;
+    }
+
+    // If OpenCL 3.0 is supported, then check for cl_khr_subgroups, which becomes
+    // optional in OpenCL 3.0 (cl_khr_subgroup is mandatory in OpenCL 2.x)
+    if (opencl_c_version.major == 3 && strstr(ext_buffer, "cl_khr_subgroups") == NULL &&
+        strstr(ext_buffer, "cl_intel_subgroups") == NULL) {
+        LM_GGML_LOG_WARN("lm_ggml_opencl: device does not support subgroups (cl_khr_subgroups or cl_intel_subgroups) "
+            "(note that subgroups is an optional feature in OpenCL 3.0)\n");
+        return false;
+    }
+
+    clGetDeviceInfo(dev_ctx->device, CL_DEVICE_GLOBAL_MEM_SIZE, sizeof(size_t), &dev_ctx->global_mem_size, NULL);
+    return true;
+}
+
 // Initialize device if it is supported (returns nullptr if it is not).
-static lm_ggml_backend_opencl_context * lm_ggml_cl2_init(lm_ggml_backend_dev_t dev) {
+static lm_ggml_backend_opencl_context * lm_ggml_cl_init(lm_ggml_backend_dev_t dev) {
     LM_GGML_ASSERT(dev);
     LM_GGML_ASSERT(dev->context);
 
@@ -3600,33 +3720,12 @@ static lm_ggml_backend_opencl_context * lm_ggml_cl2_init(lm_ggml_backend_dev_t d
     // when the associated device is initialized
     backend_ctx->ref_count  = 0;
 
-    if (strstr(dev_ctx->device_name.c_str(), "Adreno") ||
-        strstr(dev_ctx->device_name.c_str(), "Qualcomm") ||
-        strstr(dev_ctx->device_version.c_str(), "Adreno")) {
-        backend_ctx->gpu_family = GPU_FAMILY::ADRENO;
-        // Usually device version contains the detailed device name
-        backend_ctx->adreno_gen = get_adreno_gpu_gen(dev_ctx->device_version.c_str());
-        if (backend_ctx->adreno_gen == ADRENO_GPU_GEN::ADRENO_UNKNOWN) {
-            backend_ctx->adreno_gen = get_adreno_gpu_gen(dev_ctx->device_name.c_str());
-        }
-
+    backend_ctx->gpu_family = dev_ctx->gpu_family;
+    backend_ctx->adreno_gen = dev_ctx->adreno_gen;
+    if (backend_ctx->gpu_family == GPU_FAMILY::ADRENO) {
         // Use wave size of 64 for all Adreno GPUs.
         backend_ctx->adreno_wave_size = 64;
-    } else if (strstr(dev_ctx->device_name.c_str(), "Intel")) {
-        backend_ctx->gpu_family = GPU_FAMILY::INTEL;
-    } else {
-        LM_GGML_LOG_ERROR("Unsupported GPU: %s\n", dev_ctx->device_name.c_str());
-        backend_ctx->gpu_family = GPU_FAMILY::UNKNOWN;
-        return nullptr;
-    }
-
-#ifdef LM_GGML_OPENCL_USE_ADRENO_KERNELS
-    if (backend_ctx->gpu_family != GPU_FAMILY::ADRENO) {
-        LM_GGML_LOG_ERROR("lm_ggml_opencl: Adreno-specific kernels should not be enabled for non-Adreno GPUs; "
-            "run on an Adreno GPU or recompile with CMake option `-DLM_GGML_OPENCL_USE_ADRENO_KERNELS=OFF`\n");
-        return nullptr;
     }
-#endif
 
     // Populate backend device name
     backend_ctx->device_name = dev_ctx->device_name;
@@ -3635,13 +3734,10 @@ static lm_ggml_backend_opencl_context * lm_ggml_cl2_init(lm_ggml_backend_dev_t d
     cl_device_id device = backend_ctx->device;
 
     lm_ggml_cl_version platform_version = get_opencl_platform_version(dev_ctx->platform);
-
-    // Check device OpenCL version, OpenCL 2.0 or above is required
     lm_ggml_cl_version opencl_c_version = get_opencl_c_version(platform_version, device);
-    if (opencl_c_version.major < 2) {
-        LM_GGML_LOG_ERROR("lm_ggml_opencl: OpenCL 2.0 or above is required\n");
-        return nullptr;
-    }
+
+    backend_ctx->platform_version = platform_version;
+    backend_ctx->opencl_c_version = opencl_c_version;
 
     // Check driver version
     size_t driver_version_str_size;
@@ -3664,34 +3760,21 @@ static lm_ggml_backend_opencl_context * lm_ggml_cl2_init(lm_ggml_backend_dev_t d
     char *ext_buffer = (char *)alloca(ext_str_size + 1);
     clGetDeviceInfo(device, CL_DEVICE_EXTENSIONS, ext_str_size, ext_buffer, NULL);
     ext_buffer[ext_str_size] = '\0'; // ensure it is null terminated
+
     // Check if ext_buffer contains cl_khr_fp16
     backend_ctx->fp16_support = strstr(ext_buffer, "cl_khr_fp16") != NULL;
     LM_GGML_LOG_INFO("lm_ggml_opencl: device FP16 support: %s\n", backend_ctx->fp16_support ? "true" : "false");
+
     // check Adreno large buffer support
     backend_ctx->adreno_has_large_buffer = strstr(ext_buffer, "cl_qcom_large_buffer") != NULL;
 
-    // fp16 is required
-    if (!backend_ctx->fp16_support) {
-        LM_GGML_LOG_ERROR("lm_ggml_opencl: device does not support FP16\n");
-        return nullptr;
-    }
-
-    // If OpenCL 3.0 is supported, then check for cl_khr_subgroups, which becomes
-    // optional in OpenCL 3.0 (cl_khr_subgroup is mandatory in OpenCL 2.x)
-    if (opencl_c_version.major == 3 && strstr(ext_buffer, "cl_khr_subgroups") == NULL &&
-        strstr(ext_buffer, "cl_intel_subgroups") == NULL) {
-        LM_GGML_LOG_ERROR("lm_ggml_opencl: device does not support subgroups (cl_khr_subgroups or cl_intel_subgroups) "
-            "(note that subgroups is an optional feature in OpenCL 3.0)\n");
-        return nullptr;
-    }
-
     cl_uint base_align_in_bits;
     CL_CHECK(clGetDeviceInfo(device, CL_DEVICE_MEM_BASE_ADDR_ALIGN, sizeof(cl_uint), &base_align_in_bits, NULL));
     LM_GGML_ASSERT(base_align_in_bits % 8u == 0);
     backend_ctx->alignment = base_align_in_bits / 8u;
     LM_GGML_LOG_INFO("lm_ggml_opencl: mem base addr align: %u\n", backend_ctx->alignment);
 
-    clGetDeviceInfo(device, CL_DEVICE_GLOBAL_MEM_SIZE, sizeof(size_t), &backend_ctx->global_mem_size, NULL);
+    backend_ctx->global_mem_size = dev_ctx->global_mem_size;
     LM_GGML_LOG_INFO("lm_ggml_opencl: global mem size: %zu MB\n", backend_ctx->global_mem_size/1024/1024);
 
     clGetDeviceInfo(device, CL_DEVICE_MAX_MEM_ALLOC_SIZE, sizeof(size_t), &backend_ctx->max_alloc_size, NULL);
@@ -3779,8 +3862,8 @@ static lm_ggml_backend_opencl_context * lm_ggml_cl2_init(lm_ggml_backend_dev_t d
 #endif
     CL_CHECK((backend_ctx->queue = clCreateCommandQueue(context, device, command_queue_props, &err), err));
 
-    // Load kernels
-    load_cl_kernels(backend_ctx.get(), opencl_c_version);
+    // delay kernel loading until the first buffer is created
+    // load_cl_kernels(backend_ctx.get());
 
 #ifdef LM_GGML_OPENCL_USE_ADRENO_KERNELS
     // Allocate intermediate buffers and images
@@ -3822,22 +3905,9 @@ static lm_ggml_backend_opencl_context * lm_ggml_cl2_init(lm_ggml_backend_dev_t d
     return dev_ctx->backend_ctx;
 }
 
-static void lm_ggml_cl2_free(lm_ggml_backend_t backend) {
+static void lm_ggml_cl_free(lm_ggml_backend_t backend) {
     lm_ggml_backend_opencl_context * ctx = (lm_ggml_backend_opencl_context *) backend->context;
     ctx->free();
-
-    // The CL context is shared by all backends, release it if all backends have been released
-    bool should_release_opencl = true;
-    for (auto device : g_lm_ggml_backend_opencl_devices) {
-        lm_ggml_backend_opencl_device_context * ctx_dev = (lm_ggml_backend_opencl_device_context *) device.context;
-        if (ctx_dev->backend_ctx->ref_count > 0) {
-            should_release_opencl = false;
-        }
-    }
-
-    if (should_release_opencl) {
-        CL_CHECK(clReleaseContext(ctx->context));
-    }
 }
 
 #ifdef LM_GGML_OPENCL_USE_ADRENO_KERNELS
@@ -4421,7 +4491,7 @@ static const char * lm_ggml_backend_opencl_name(lm_ggml_backend_t backend) {
 }
 
 static void lm_ggml_backend_opencl_free(lm_ggml_backend_t backend) {
-    lm_ggml_cl2_free(backend);
+    lm_ggml_cl_free(backend);
 }
 
 static void lm_ggml_backend_opencl_set_tensor_async(lm_ggml_backend_t backend, lm_ggml_tensor * tensor, const void * data, size_t offset, size_t size) {
@@ -4460,14 +4530,17 @@ static void lm_ggml_backend_opencl_synchronize(lm_ggml_backend_t backend) {
 // enqueued to it won't start until commands in the other devices have
 // completed.
 static void sync_with_other_backends(lm_ggml_backend_opencl_context * backend_ctx) {
-    if (g_lm_ggml_backend_opencl_devices.size() < 2)
-      return; // No other devices to synchronize with.
+    if (g_lm_ggml_backend_opencl_devices.size() < 2) {
+        return; // No other devices to synchronize with.
+    }
 
     std::vector<cl_event> events;
     events.reserve(g_lm_ggml_backend_opencl_devices.size());
 
     for (lm_ggml_backend_device & backend_dev : g_lm_ggml_backend_opencl_devices) {
-        auto * other_backend_ctx = lm_ggml_cl2_init(&backend_dev);
+        lm_ggml_backend_opencl_device_context * dev_ctx = (lm_ggml_backend_opencl_device_context *) backend_dev.context;
+        auto * other_backend_ctx = dev_ctx->backend_ctx;
+
         if (backend_ctx != other_backend_ctx) {
             cl_event ev;
             CL_CHECK(clEnqueueMarkerWithWaitList(other_backend_ctx->queue, 0, nullptr, &ev));
@@ -4620,7 +4693,7 @@ inline bool use_adreno_kernels(const lm_ggml_backend_opencl_context *backend_ctx
 inline bool use_adreno_moe_kernels(const lm_ggml_backend_opencl_context *backend_ctx, const lm_ggml_tensor *tensor) {
     LM_GGML_UNUSED(backend_ctx);
     int ne01 = tensor->ne[1];
-    return (((strstr(tensor->name, "ffn") != NULL) && (strstr(tensor->name, "exps") != NULL)) || (strstr(tensor->name, "as") != NULL)) && (ne01 % 64 == 0);
+    return (((strstr(tensor->name, "ffn") != NULL) && (strstr(tensor->name, "exps") != NULL)) || (strstr(tensor->name, "as") != NULL)) && (ne01 % 32 == 0);
 }
 
 inline bool enable_adreno_trans_weight(const lm_ggml_backend_opencl_context *backend_ctx, const lm_ggml_tensor *tensor) {
@@ -4880,6 +4953,8 @@ static bool lm_ggml_opencl_supports_op(lm_ggml_backend_dev_t dev, const struct l
         case LM_GGML_OP_IM2COL:
             return true;
         case LM_GGML_OP_ARGSORT: {
+            load_cl_kernels_argsort(backend_ctx);
+
             cl_kernel kernel = backend_ctx->kernel_argsort_f32_i32;
             int max_workgroup_size = backend_ctx->get_kernel_workgroup_size(kernel);
 
@@ -4897,6 +4972,8 @@ static bool lm_ggml_opencl_supports_op(lm_ggml_backend_dev_t dev, const struct l
             return op->src[0]->type == LM_GGML_TYPE_F32;
         case LM_GGML_OP_FLASH_ATTN_EXT:
             {
+                load_cl_kernels_flash_attn(backend_ctx);
+
                 const lm_ggml_tensor * q = op->src[0];
                 const lm_ggml_tensor * k = op->src[1];
                 const lm_ggml_tensor * v = op->src[2];
@@ -4964,7 +5041,7 @@ static lm_ggml_backend_i lm_ggml_backend_opencl_i = {
 
 lm_ggml_backend_t lm_ggml_backend_opencl_init(void) {
     lm_ggml_backend_dev_t dev = lm_ggml_backend_reg_dev_get(lm_ggml_backend_opencl_reg(), 0);
-    lm_ggml_backend_opencl_context *backend_ctx = lm_ggml_cl2_init(dev);
+    lm_ggml_backend_opencl_context *backend_ctx = lm_ggml_cl_init(dev);
 
     lm_ggml_backend_t backend = new lm_ggml_backend {
         /* .guid    = */ lm_ggml_backend_opencl_guid(),
@@ -5343,15 +5420,13 @@ static void lm_ggml_backend_opencl_buffer_free_buffer(lm_ggml_backend_buffer_t b
 }
 
 static void * lm_ggml_backend_opencl_buffer_get_base(lm_ggml_backend_buffer_t buffer) {
-    lm_ggml_backend_opencl_context * backend_ctx = lm_ggml_cl2_init(buffer->buft->device);
-    return (void *) (uintptr_t) backend_ctx->alignment;
+    lm_ggml_backend_opencl_device_context * dev_ctx = (lm_ggml_backend_opencl_device_context *) buffer->buft->device->context;
+    return (void *) (uintptr_t) dev_ctx->backend_ctx->alignment;
 }
 
 static enum lm_ggml_status lm_ggml_backend_opencl_buffer_init_tensor(lm_ggml_backend_buffer_t buffer, lm_ggml_tensor * tensor) {
     lm_ggml_backend_opencl_buffer_context * ctx = (lm_ggml_backend_opencl_buffer_context *) buffer->context;
 
-    lm_ggml_cl2_init(buffer->buft->device);
-
     if (tensor->view_src != nullptr) {
         LM_GGML_ASSERT(tensor->view_src->buffer->buft == buffer->buft);
 
@@ -5391,7 +5466,8 @@ static enum lm_ggml_status lm_ggml_backend_opencl_buffer_init_tensor(lm_ggml_bac
 }
 
 static void lm_ggml_backend_opencl_buffer_set_tensor(lm_ggml_backend_buffer_t buffer, lm_ggml_tensor * tensor, const void * data, size_t offset, size_t size) {
-    lm_ggml_backend_opencl_context *backend_ctx = lm_ggml_cl2_init(buffer->buft->device);
+    lm_ggml_backend_opencl_device_context * dev_ctx = (lm_ggml_backend_opencl_device_context *) buffer->buft->device->context;
+    lm_ggml_backend_opencl_context * backend_ctx = dev_ctx->backend_ctx;
 
     cl_context context = backend_ctx->context;
     cl_command_queue queue = backend_ctx->queue;
@@ -6626,7 +6702,8 @@ static void lm_ggml_backend_opencl_buffer_set_tensor(lm_ggml_backend_buffer_t bu
 static void lm_ggml_backend_opencl_buffer_get_tensor(lm_ggml_backend_buffer_t buffer, const lm_ggml_tensor * tensor, void * data, size_t offset, size_t size) {
     LM_GGML_ASSERT(tensor->extra);
 
-    lm_ggml_backend_opencl_context *backend_ctx = lm_ggml_cl2_init(buffer->buft->device);
+    lm_ggml_backend_opencl_device_context * dev_ctx = (lm_ggml_backend_opencl_device_context *) buffer->buft->device->context;
+    lm_ggml_backend_opencl_context *backend_ctx = dev_ctx->backend_ctx;
 
     cl_context context = backend_ctx->context;
     cl_command_queue queue = backend_ctx->queue;
@@ -7470,8 +7547,9 @@ static void lm_ggml_backend_opencl_buffer_get_tensor(lm_ggml_backend_buffer_t bu
 }
 
 static void lm_ggml_backend_opencl_buffer_clear(lm_ggml_backend_buffer_t buffer, uint8_t value) {
-    lm_ggml_backend_dev_t dev = buffer->buft->device;
-    lm_ggml_backend_opencl_context *backend_ctx = lm_ggml_cl2_init(dev);
+    lm_ggml_backend_opencl_device_context * dev_ctx = (lm_ggml_backend_opencl_device_context *) buffer->buft->device->context;
+    lm_ggml_backend_opencl_context * backend_ctx = dev_ctx->backend_ctx;
+
     cl_command_queue queue = backend_ctx->queue;
 
     lm_ggml_backend_opencl_buffer_context * ctx = (lm_ggml_backend_opencl_buffer_context *) buffer->context;
@@ -7511,7 +7589,8 @@ static const char * lm_ggml_backend_opencl_buffer_type_get_name(lm_ggml_backend_
 }
 
 static lm_ggml_backend_buffer_t lm_ggml_backend_opencl_buffer_type_alloc_buffer(lm_ggml_backend_buffer_type_t buffer_type, size_t size) {
-    lm_ggml_backend_opencl_context *backend_ctx = lm_ggml_cl2_init(buffer_type->device);
+    lm_ggml_backend_opencl_context *backend_ctx = lm_ggml_cl_init(buffer_type->device);
+    load_cl_kernels(backend_ctx);
 
     // clCreateBuffer returns -61 for size 0
     size = std::max(size, (size_t)1);
@@ -7534,15 +7613,15 @@ static lm_ggml_backend_buffer_t lm_ggml_backend_opencl_buffer_type_alloc_buffer(
 }
 
 static size_t lm_ggml_backend_opencl_buffer_type_get_alignment(lm_ggml_backend_buffer_type_t buffer_type) {
-    lm_ggml_backend_opencl_context * backend_ctx = lm_ggml_cl2_init(buffer_type->device);
-    return backend_ctx->alignment;
+    lm_ggml_backend_opencl_device_context * dev_ctx = (lm_ggml_backend_opencl_device_context *) buffer_type->device->context;
+    return dev_ctx->backend_ctx->alignment;
 }
 
 static size_t lm_ggml_backend_opencl_buffer_type_get_max_size(lm_ggml_backend_buffer_type_t buffer_type) {
     static size_t max_size = -1;
     if (max_size == (size_t)-1) {
-        lm_ggml_backend_opencl_context * backend_ctx = lm_ggml_cl2_init(buffer_type->device);
-        max_size = backend_ctx->max_alloc_size;
+        lm_ggml_backend_opencl_device_context * dev_ctx = (lm_ggml_backend_opencl_device_context *) buffer_type->device->context;
+        max_size = dev_ctx->backend_ctx->max_alloc_size;
     }
     return max_size;
 }
@@ -7579,14 +7658,13 @@ static const char * lm_ggml_backend_opencl_device_get_description(lm_ggml_backen
 
 static void lm_ggml_backend_opencl_device_get_memory(lm_ggml_backend_dev_t dev, size_t * free, size_t * total) {
     lm_ggml_backend_opencl_device_context * dev_ctx = (lm_ggml_backend_opencl_device_context *) dev->context;
-    lm_ggml_backend_opencl_context * backend_ctx = (lm_ggml_backend_opencl_context *) dev_ctx->backend_ctx;
 
     static const size_t opencl_extra_margin = 1024ull*1024ull*1024ull;
 
     // OpenCL does not provide reliable currently-free device memory.
     // Use total/global memory as a best-effort upper bound.
     // Improved safety: Reduce by a 1GiB extra margin for common --fit
-    *total = backend_ctx->global_mem_size;
+    *total = dev_ctx->global_mem_size;
     *free  = *total > opencl_extra_margin ? *total - opencl_extra_margin : 0;
 }
 
@@ -7610,7 +7688,7 @@ static void lm_ggml_backend_opencl_device_get_props(lm_ggml_backend_dev_t dev, s
 }
 
 static lm_ggml_backend_t lm_ggml_backend_opencl_device_init(lm_ggml_backend_dev_t dev, const char * params) {
-    lm_ggml_backend_opencl_context * backend_ctx = lm_ggml_cl2_init(dev);
+    lm_ggml_backend_opencl_context * backend_ctx = lm_ggml_cl_init(dev);
     // Getting a new reference to the backend, increase ref_count
     backend_ctx->ref_count++;
 
@@ -7647,6 +7725,7 @@ static lm_ggml_backend_buffer_t lm_ggml_backend_opencl_device_buffer_from_ptr(lm
 }
 
 static bool lm_ggml_backend_opencl_device_supports_op(lm_ggml_backend_dev_t dev, const struct lm_ggml_tensor * op) {
+    lm_ggml_cl_init(dev);
     return lm_ggml_opencl_supports_op(dev, op);
 }
 
@@ -7659,8 +7738,8 @@ static bool lm_ggml_backend_opencl_device_supports_buft(lm_ggml_backend_dev_t de
 
     // Check cl_context is the same. clEnqueue* commands may not use
     // buffers from another cl_context.
-    lm_ggml_backend_opencl_context * backend_ctx0 = lm_ggml_cl2_init(dev);
-    lm_ggml_backend_opencl_context * backend_ctx1 = lm_ggml_cl2_init(buft->device);
+    lm_ggml_backend_opencl_context * backend_ctx0 = lm_ggml_cl_init(dev);
+    lm_ggml_backend_opencl_context * backend_ctx1 = lm_ggml_cl_init(buft->device);
     return backend_ctx0->context == backend_ctx1->context;
 }
 
@@ -14218,7 +14297,7 @@ static void lm_ggml_cl_mul_mat_id(lm_ggml_backend_t backend, const lm_ggml_tenso
                     CL_CHECK(status);
 
                     // set thread grid
-                    global_size[0] = static_cast<size_t>(ne01);
+                    global_size[0] = static_cast<size_t>(((ne01 + 63) / 64) * 64);
                     global_size[1] = 4;
                     global_size[2] = static_cast<size_t>(ne20);
                     local_size[1] = 4;
@@ -14434,7 +14513,7 @@ static void lm_ggml_cl_mul_mat_id(lm_ggml_backend_t backend, const lm_ggml_tenso
                     CL_CHECK(status);
 
                     // set thread grid
-                    global_size[0] = static_cast<size_t>(ne01);
+                    global_size[0] = static_cast<size_t>(((ne01 + 63) / 64) * 64);
                     global_size[1] = 4;
                     global_size[2] = static_cast<size_t>(ne20);
                     local_size[1] = 4;
@@ -14610,7 +14689,7 @@ static void lm_ggml_cl_mul_mat_id(lm_ggml_backend_t backend, const lm_ggml_tenso
                     CL_CHECK(status);
 
                     // set thread grid
-                    global_size[0] = static_cast<size_t>(ne01);
+                    global_size[0] = static_cast<size_t>(((ne01 + 63) / 64) * 64);
                     global_size[1] = 4;
                     global_size[2] = static_cast<size_t>(ne20);
                     local_size[1] = 4;
@@ -14786,7 +14865,7 @@ static void lm_ggml_cl_mul_mat_id(lm_ggml_backend_t backend, const lm_ggml_tenso
                     CL_CHECK(status);
 
                     // set thread grid
-                    global_size[0] = static_cast<size_t>(ne01);
+                    global_size[0] = static_cast<size_t>(((ne01 + 63) / 64) * 64);
                     global_size[1] = 4;
                     global_size[2] = static_cast<size_t>(ne20);
                     local_size[1] = 4;
@@ -15039,7 +15118,7 @@ static void lm_ggml_cl_mul_mat_id(lm_ggml_backend_t backend, const lm_ggml_tenso
                     CL_CHECK(status);
 
                     // set thread grid
-                    global_size[0] = static_cast<size_t>(ne01);
+                    global_size[0] = static_cast<size_t>(((ne01 + 63) / 64) * 64);
                     global_size[1] = 4;
                     global_size[2] = static_cast<size_t>(ne20);
                     local_size[1] = 4;
@@ -15212,7 +15291,7 @@ static void lm_ggml_cl_mul_mat_id(lm_ggml_backend_t backend, const lm_ggml_tenso
                     CL_CHECK(status);
 
                     // set thread grid
-                    global_size[0] = static_cast<size_t>(ne01);
+                    global_size[0] = static_cast<size_t>(((ne01 + 63) / 64) * 64);
                     global_size[1] = 4;
                     global_size[2] = static_cast<size_t>(ne20);
                     local_size[1] = 4;
@@ -15390,7 +15469,7 @@ static void lm_ggml_cl_mul_mat_id(lm_ggml_backend_t backend, const lm_ggml_tenso
                     CL_CHECK(status);
 
                     // set thread grid
-                    global_size[0] = static_cast<size_t>(ne01);
+                    global_size[0] = static_cast<size_t>(((ne01 + 63) / 64) * 64);
                     global_size[1] = 4;
                     global_size[2] = static_cast<size_t>(ne20);
                     local_size[1] = 4;
@@ -15565,7 +15644,7 @@ static void lm_ggml_cl_mul_mat_id(lm_ggml_backend_t backend, const lm_ggml_tenso
                     CL_CHECK(status);
 
                     // set thread grid
-                    global_size[0] = static_cast<size_t>(ne01);
+                    global_size[0] = static_cast<size_t>(((ne01 + 63) / 64) * 64);
                     global_size[1] = 4;
                     global_size[2] = static_cast<size_t>(ne20);
                     local_size[1] = 4;
diff --git a/cpp/ggml-opencl/kernels/cvt.cl b/cpp/ggml-opencl/kernels/cvt.cl
index 31236698..c25eabdd 100644
--- a/cpp/ggml-opencl/kernels/cvt.cl
+++ b/cpp/ggml-opencl/kernels/cvt.cl
@@ -220,6 +220,10 @@ kernel void kernel_convert_block_q4_0_trans4_ns(
     uint i01 = get_global_id(0);
     uint i02 = get_global_id(2);
 
+    if (i01 >= ne01) {
+        return;
+    }
+
     uint ne00_blk = ne00 / QK4_0;
     uint src_blk_offset = i00 + i01 * ne00_blk + i02 * ne00_blk * ne01;
     uint dst_blk_offset = i01 + i00 * ne01 + i02 * ne00_blk * ne01;
@@ -263,6 +267,10 @@ kernel void kernel_restore_block_q4_0_trans4_ns(
     uint i01 = get_global_id(0);
     uint i02 = get_global_id(2);
 
+    if (i01 >= ne01) {
+        return;
+    }
+
     uint ne00_blk = ne00 / QK4_0;
     uint dst_blk_offset = i00 + i01 * ne00_blk + i02 * ne00_blk * ne01;
     uint src_d_offset = i01 + i00 * ne01 + i02 * ne00_blk * ne01;
@@ -401,6 +409,10 @@ kernel void kernel_convert_block_q4_1_trans4_ns(
     uint i01 = get_global_id(0);
     uint i02 = get_global_id(2);
 
+    if (i01 >= ne01) {
+        return;
+    }
+
     uint ne00_blk = ne00 / QK4_1;
     uint src_blk_offset = i00 + i01 * ne00_blk + i02 * ne00_blk * ne01;
     uint dst_blk_offset = i01 + i00 * ne01 + i02 * ne00_blk * ne01;
@@ -446,6 +458,10 @@ kernel void kernel_restore_block_q4_1_trans4_ns(
     uint i01 = get_global_id(0);
     uint i02 = get_global_id(2);
 
+    if (i01 >= ne01) {
+        return;
+    }
+
     uint ne00_blk = ne00 / QK4_1;
     uint dst_blk_offset = i00 + i01 * ne00_blk + i02 * ne00_blk * ne01;
     uint src_dm_offset = i01 + i00 * ne01 + i02 * ne00_blk * ne01;
@@ -491,6 +507,10 @@ kernel void kernel_convert_block_q5_0_trans4_ns(
     uint i01 = get_global_id(0);
     uint i02 = get_global_id(2);
 
+    if (i01 >= ne01) {
+        return;
+    }
+
     uint ne00_blk = ne00 / QK5_0;
     uint src_blk_offset = i00 + i01 * ne00_blk + i02 * ne00_blk * ne01;
     uint dst_blk_offset = i01 + i00 * ne01 + i02 * ne00_blk * ne01;
@@ -536,6 +556,10 @@ kernel void kernel_restore_block_q5_0_trans4_ns(
     uint i01 = get_global_id(0);
     uint i02 = get_global_id(2);
 
+    if (i01 >= ne01) {
+        return;
+    }
+
     uint ne00_blk = ne00 / QK5_0;
     uint dst_blk_offset = i00 + i01 * ne00_blk + i02 * ne00_blk * ne01;
     uint src_blk_offset = i01 + i00 * ne01 + i02 * ne00_blk * ne01;
@@ -583,6 +607,10 @@ kernel void kernel_convert_block_q5_1_trans4_ns(
     uint i01 = get_global_id(0);
     uint i02 = get_global_id(2);
 
+    if (i01 >= ne01) {
+        return;
+    }
+
     uint ne00_blk = ne00 / QK5_1;
     uint src_blk_offset = i00 + i01 * ne00_blk + i02 * ne00_blk * ne01;
     uint dst_blk_offset = i01 + i00 * ne01 + i02 * ne00_blk * ne01;
@@ -630,6 +658,10 @@ kernel void kernel_restore_block_q5_1_trans4_ns(
     uint i01 = get_global_id(0);
     uint i02 = get_global_id(2);
 
+    if (i01 >= ne01) {
+        return;
+    }
+
     uint ne00_blk = ne00 / QK5_1;
     uint dst_blk_offset = i00 + i01 * ne00_blk + i02 * ne00_blk * ne01;
     uint src_blk_offset = i01 + i00 * ne01 + i02 * ne00_blk * ne01;
@@ -679,6 +711,10 @@ kernel void kernel_convert_block_q4_k_trans4_ns(
     uint i01 = get_global_id(0);
     uint i02 = get_global_id(2);
 
+    if (i01 >= ne01) {
+        return;
+    }
+
     uint ne00_blk = ne00 / QK_K;
     uint src_blk_offset = i00 + i01 * ne00_blk + i02 * ne00_blk * ne01;
     uint dst_blk_offset = i01 + i00 * ne01     + i02 * ne00_blk * ne01;
@@ -732,6 +768,10 @@ kernel void kernel_restore_block_q4_k_trans4_ns(
     uint i01 = get_global_id(0);  // row index
     uint i02 = get_global_id(2);  // batch index
 
+    if (i01 >= ne01) {
+        return;
+    }
+
     uint ne00_blk = ne00 / QK_K;
 
     uint src_blk_offset = i01 + i00 * ne01 + i02 * ne00_blk * ne01;
@@ -784,6 +824,10 @@ kernel void kernel_convert_block_q5_k_trans4_ns(
     uint i01 = get_global_id(0);
     uint i02 = get_global_id(2);
 
+    if (i01 >= ne01) {
+        return;
+    }
+
     uint ne00_blk = ne00 / QK_K;
     uint src_blk_offset = i00 + i01 * ne00_blk + i02 * ne00_blk * ne01;
     uint dst_blk_offset = i01 + i00 * ne01     + i02 * ne00_blk * ne01;
@@ -850,6 +894,10 @@ kernel void kernel_restore_block_q5_k_trans4_ns(
     uint i01 = get_global_id(0);  // row index
     uint i02 = get_global_id(2);  // batch index
 
+    if (i01 >= ne01) {
+        return;
+    }
+
     uint ne00_blk = ne00 / QK_K;
 
     uint src_blk_offset = i01 + i00 * ne01 + i02 * ne00_blk * ne01;
@@ -916,6 +964,10 @@ kernel void kernel_convert_block_q6_k_trans4_ns(
     uint i01 = get_global_id(0);
     uint i02 = get_global_id(2);
 
+    if (i01 >= ne01) {
+        return;
+    }
+
     uint ne00_blk = ne00 / QK_K;
 
     uint src_blk_offset = i00 + i01 * ne00_blk + i02 * ne00_blk * ne01;
@@ -993,6 +1045,10 @@ kernel void kernel_restore_block_q6_k_trans4_ns(
     uint i01 = get_global_id(0);  // row index
     uint i02 = get_global_id(2);  // batch index
 
+    if (i01 >= ne01) {
+        return;
+    }
+
     uint ne00_blk = ne00 / QK_K;
 
     uint src_blk_offset = i01 + i00 * ne01 + i02 * ne00_blk * ne01;
@@ -1147,6 +1203,10 @@ kernel void kernel_convert_block_mxfp4_trans4_ns(
     uint i01 = get_global_id(0);
     uint i02 = get_global_id(2);
 
+    if (i01 >= ne01) {
+        return;
+    }
+
     uint ne00_blk = ne00 / QK_MXFP4;
     uint src_blk_offset = i00 + i01 * ne00_blk + i02 * ne00_blk * ne01;
     uint dst_blk_offset = i01 + i00 * ne01 + i02 * ne00_blk * ne01;
@@ -1190,6 +1250,10 @@ kernel void kernel_restore_block_mxfp4_trans4_ns(
     uint i01 = get_global_id(0);
     uint i02 = get_global_id(2);
 
+    if (i01 >= ne01) {
+        return;
+    }
+
     uint ne00_blk = ne00 / QK_MXFP4;
     uint dst_blk_offset = i00 + i01 * ne00_blk + i02 * ne00_blk * ne01;
     uint src_d_offset = i01 + i00 * ne01 + i02 * ne00_blk * ne01;
diff --git a/cpp/ggml-opencl/kernels/gemm_moe_mxfp4_f32_ns.cl b/cpp/ggml-opencl/kernels/gemm_moe_mxfp4_f32_ns.cl
index e404f392..02cdbdd9 100644
--- a/cpp/ggml-opencl/kernels/gemm_moe_mxfp4_f32_ns.cl
+++ b/cpp/ggml-opencl/kernels/gemm_moe_mxfp4_f32_ns.cl
@@ -163,7 +163,7 @@ kernel void kernel_gemm_moe_mxfp4_f32_ns(
     uint block_id_n = get_global_id(2); // n_tile
 
     // Boundary check
-    if (((get_global_id(0) + block_id_m * TILESIZE_M) >= ne01) || (block_id_n >= total_tiles[0])) {
+    if (block_id_n >= total_tiles[0]) {
         return;
     }
 
@@ -248,6 +248,10 @@ kernel void kernel_gemm_moe_mxfp4_f32_ns(
         dotx16_reduce8(reg_a, shared_b, reg_c.hi, 16);
     }
 
+    if ((get_global_id(0) + block_id_m * TILESIZE_M) >= ne01) {
+        return;
+    }
+
     // Load poster router and share in LM
     __local uint out_idx[TILESIZE_N];
 
diff --git a/cpp/ggml-opencl/kernels/gemm_moe_q4_0_f32_ns.cl b/cpp/ggml-opencl/kernels/gemm_moe_q4_0_f32_ns.cl
index 02290c17..d403ed0c 100644
--- a/cpp/ggml-opencl/kernels/gemm_moe_q4_0_f32_ns.cl
+++ b/cpp/ggml-opencl/kernels/gemm_moe_q4_0_f32_ns.cl
@@ -115,7 +115,7 @@ kernel void kernel_gemm_moe_q4_0_f32_ns(
     uint block_id_n = get_global_id(2); // n_tile
 
     // Boundary check
-    if (((get_global_id(0) + block_id_m * TILESIZE_M) >= ne01) || (block_id_n >= total_tiles[0])) {
+    if (block_id_n >= total_tiles[0]) {
         return;
     }
 
@@ -198,6 +198,10 @@ kernel void kernel_gemm_moe_q4_0_f32_ns(
         dotx16_reduce8(reg_a, shared_b, reg_c.hi, 16);
     }
 
+    if ((get_global_id(0) + block_id_m * TILESIZE_M) >= ne01) {
+        return;
+    }
+
     // Load poster router and share in LM
     __local uint out_idx[TILESIZE_N];
 
diff --git a/cpp/ggml-opencl/kernels/gemm_moe_q4_1_f32_ns.cl b/cpp/ggml-opencl/kernels/gemm_moe_q4_1_f32_ns.cl
index e2574ae0..b2bddf3f 100644
--- a/cpp/ggml-opencl/kernels/gemm_moe_q4_1_f32_ns.cl
+++ b/cpp/ggml-opencl/kernels/gemm_moe_q4_1_f32_ns.cl
@@ -116,7 +116,7 @@ kernel void kernel_gemm_moe_q4_1_f32_ns(
     uint block_id_n = get_global_id(2); // n_tile
 
     // Boundary check
-    if (((get_global_id(0) + block_id_m * TILESIZE_M) >= ne01) || (block_id_n >= total_tiles[0])) {
+    if (block_id_n >= total_tiles[0]) {
         return;
     }
 
@@ -200,6 +200,10 @@ kernel void kernel_gemm_moe_q4_1_f32_ns(
         dotx16_reduce8(reg_a, shared_b, reg_c.hi, 16);
     }
 
+    if ((get_global_id(0) + block_id_m * TILESIZE_M) >= ne01) {
+        return;
+    }
+
     // Load poster router and share in LM
     __local uint out_idx[TILESIZE_N];
 
diff --git a/cpp/ggml-opencl/kernels/gemm_moe_q4_k_f32_ns.cl b/cpp/ggml-opencl/kernels/gemm_moe_q4_k_f32_ns.cl
index 9d24aff6..ab8228d1 100644
--- a/cpp/ggml-opencl/kernels/gemm_moe_q4_k_f32_ns.cl
+++ b/cpp/ggml-opencl/kernels/gemm_moe_q4_k_f32_ns.cl
@@ -133,7 +133,7 @@ kernel void kernel_gemm_moe_q4_k_f32_ns(
     uint block_id_n = get_global_id(2); // n_tile
 
     // Boundary check
-    if (((get_global_id(0) + block_id_m * TILESIZE_M) >= ne01) || (block_id_n >= total_tiles[0])) {
+    if (block_id_n >= total_tiles[0]) {
         return;
     }
 
@@ -225,6 +225,10 @@ kernel void kernel_gemm_moe_q4_k_f32_ns(
         dotx16_reduce8(reg_a, shared_b, reg_c.hi, 16);
     }
 
+    if ((get_global_id(0) + block_id_m * TILESIZE_M) >= ne01) {
+        return;
+    }
+
     // Load post router and share in LM
     __local uint out_idx[TILESIZE_N];
 
diff --git a/cpp/ggml-opencl/kernels/gemm_moe_q5_0_f32_ns.cl b/cpp/ggml-opencl/kernels/gemm_moe_q5_0_f32_ns.cl
index 3524cb1b..d1a35d58 100644
--- a/cpp/ggml-opencl/kernels/gemm_moe_q5_0_f32_ns.cl
+++ b/cpp/ggml-opencl/kernels/gemm_moe_q5_0_f32_ns.cl
@@ -116,7 +116,7 @@ kernel void kernel_gemm_moe_q5_0_f32_ns(
     uint block_id_n = get_global_id(2); // n_tile
 
     // Boundary check
-    if (((get_global_id(0) + block_id_m * TILESIZE_M) >= ne01) || (block_id_n >= total_tiles[0])) {
+    if (block_id_n >= total_tiles[0]) {
         return;
     }
 
@@ -202,6 +202,10 @@ kernel void kernel_gemm_moe_q5_0_f32_ns(
         dotx16_reduce8(reg_a, shared_b, reg_c.hi, 16);
     }
 
+    if ((get_global_id(0) + block_id_m * TILESIZE_M) >= ne01) {
+        return;
+    }
+
     // Load poster router and share in LM
     __local uint out_idx[TILESIZE_N];
 
diff --git a/cpp/ggml-opencl/kernels/gemm_moe_q5_1_f32_ns.cl b/cpp/ggml-opencl/kernels/gemm_moe_q5_1_f32_ns.cl
index 5fc2a523..90d345ec 100644
--- a/cpp/ggml-opencl/kernels/gemm_moe_q5_1_f32_ns.cl
+++ b/cpp/ggml-opencl/kernels/gemm_moe_q5_1_f32_ns.cl
@@ -117,7 +117,7 @@ kernel void kernel_gemm_moe_q5_1_f32_ns(
     uint block_id_n = get_global_id(2); // n_tile
 
     // Boundary check
-    if (((get_global_id(0) + block_id_m * TILESIZE_M) >= ne01) || (block_id_n >= total_tiles[0])) {
+    if (block_id_n >= total_tiles[0]) {
         return;
     }
 
@@ -204,6 +204,10 @@ kernel void kernel_gemm_moe_q5_1_f32_ns(
         dotx16_reduce8(reg_a, shared_b, reg_c.hi, 16);
     }
 
+    if ((get_global_id(0) + block_id_m * TILESIZE_M) >= ne01) {
+        return;
+    }
+
     // Load poster router and share in LM
     __local uint out_idx[TILESIZE_N];
 
diff --git a/cpp/ggml-opencl/kernels/gemm_moe_q5_k_f32_ns.cl b/cpp/ggml-opencl/kernels/gemm_moe_q5_k_f32_ns.cl
index 808a0c7d..13c26f6f 100644
--- a/cpp/ggml-opencl/kernels/gemm_moe_q5_k_f32_ns.cl
+++ b/cpp/ggml-opencl/kernels/gemm_moe_q5_k_f32_ns.cl
@@ -134,7 +134,7 @@ kernel void kernel_gemm_moe_q5_k_f32_ns(
     uint block_id_n = get_global_id(2); // n_tile
 
     // Boundary check
-    if (((get_global_id(0) + block_id_m * TILESIZE_M) >= ne01) || (block_id_n >= total_tiles[0])) {
+    if (block_id_n >= total_tiles[0]) {
         return;
     }
 
@@ -230,6 +230,10 @@ kernel void kernel_gemm_moe_q5_k_f32_ns(
         dotx16_reduce8(reg_a, shared_b, reg_c.hi, 16);
     }
 
+    if ((get_global_id(0) + block_id_m * TILESIZE_M) >= ne01) {
+        return;
+    }
+
     // Load post router and share in LM
     __local uint out_idx[TILESIZE_N];
 
diff --git a/cpp/ggml-opencl/kernels/gemm_moe_q6_k_f32_ns.cl b/cpp/ggml-opencl/kernels/gemm_moe_q6_k_f32_ns.cl
index a040335a..85ccebec 100644
--- a/cpp/ggml-opencl/kernels/gemm_moe_q6_k_f32_ns.cl
+++ b/cpp/ggml-opencl/kernels/gemm_moe_q6_k_f32_ns.cl
@@ -117,7 +117,7 @@ kernel void kernel_gemm_moe_q6_k_f32_ns(
     uint block_id_n = get_global_id(2); // n_tile
 
     // Boundary check
-    if (((get_global_id(0) + block_id_m * TILESIZE_M) >= ne01) || (block_id_n >= total_tiles[0])) {
+    if (block_id_n >= total_tiles[0]) {
         return;
     }
 
@@ -209,6 +209,10 @@ kernel void kernel_gemm_moe_q6_k_f32_ns(
         dotx16_reduce8(reg_a, shared_b, reg_c.hi, 16);
     }
 
+    if ((get_global_id(0) + block_id_m * TILESIZE_M) >= ne01) {
+        return;
+    }
+
     // Load post router and share in LM
     __local uint out_idx[TILESIZE_N];
 
diff --git a/cpp/ggml-opencl/kernels/gemv_moe_mxfp4_f32_ns.cl b/cpp/ggml-opencl/kernels/gemv_moe_mxfp4_f32_ns.cl
index e4b44c1a..75129e20 100644
--- a/cpp/ggml-opencl/kernels/gemv_moe_mxfp4_f32_ns.cl
+++ b/cpp/ggml-opencl/kernels/gemv_moe_mxfp4_f32_ns.cl
@@ -82,6 +82,10 @@ __kernel void kernel_gemv_moe_mxfp4_f32_ns(
     uint sgid = get_local_id(1);
     uint slid = get_sub_group_local_id();
 
+    if (i01 >= ne01) {
+        return;
+    }
+
     uint i11 = i20 % ne11;
 
     uint expert_id = src2[i20];
diff --git a/cpp/ggml-opencl/kernels/gemv_moe_q4_0_f32_ns.cl b/cpp/ggml-opencl/kernels/gemv_moe_q4_0_f32_ns.cl
index 6f4d3f53..2d28db63 100644
--- a/cpp/ggml-opencl/kernels/gemv_moe_q4_0_f32_ns.cl
+++ b/cpp/ggml-opencl/kernels/gemv_moe_q4_0_f32_ns.cl
@@ -37,6 +37,10 @@ __kernel void kernel_gemv_moe_q4_0_f32_ns(
     uint sgid = get_local_id(1);
     uint slid = get_sub_group_local_id();
 
+    if (i01 >= ne01) {
+        return;
+    }
+
     uint i11 = i20 % ne11;
 
     uint expert_id = src2[i20];
diff --git a/cpp/ggml-opencl/kernels/gemv_moe_q4_1_f32_ns.cl b/cpp/ggml-opencl/kernels/gemv_moe_q4_1_f32_ns.cl
index 3739a215..b98bdc0f 100644
--- a/cpp/ggml-opencl/kernels/gemv_moe_q4_1_f32_ns.cl
+++ b/cpp/ggml-opencl/kernels/gemv_moe_q4_1_f32_ns.cl
@@ -38,6 +38,10 @@ __kernel void kernel_gemv_moe_q4_1_f32_ns(
     uint sgid = get_local_id(1);
     uint slid = get_sub_group_local_id();
 
+    if (i01 >= ne01) {
+        return;
+    }
+
     uint i11 = i20 % ne11;
 
     uint expert_id = src2[i20];
diff --git a/cpp/ggml-opencl/kernels/gemv_moe_q4_k_f32_ns.cl b/cpp/ggml-opencl/kernels/gemv_moe_q4_k_f32_ns.cl
index 13d79f25..12464e98 100644
--- a/cpp/ggml-opencl/kernels/gemv_moe_q4_k_f32_ns.cl
+++ b/cpp/ggml-opencl/kernels/gemv_moe_q4_k_f32_ns.cl
@@ -54,6 +54,10 @@ __kernel void kernel_gemv_moe_q4_k_f32_ns(
     uint sgid = get_local_id(1);
     uint slid = get_sub_group_local_id();
 
+    if (i01 >= ne01) {
+        return;
+    }
+
     uint i11 = i20 % ne11;
 
     uint expert_id = src2[i20];
diff --git a/cpp/ggml-opencl/kernels/gemv_moe_q5_0_f32_ns.cl b/cpp/ggml-opencl/kernels/gemv_moe_q5_0_f32_ns.cl
index 938054cf..b4361363 100644
--- a/cpp/ggml-opencl/kernels/gemv_moe_q5_0_f32_ns.cl
+++ b/cpp/ggml-opencl/kernels/gemv_moe_q5_0_f32_ns.cl
@@ -38,6 +38,10 @@ __kernel void kernel_gemv_moe_q5_0_f32_ns(
     uint sgid = get_local_id(1);
     uint slid = get_sub_group_local_id();
 
+    if (i01 >= ne01) {
+        return;
+    }
+
     uint i11 = i20 % ne11;
 
     uint expert_id = src2[i20];
diff --git a/cpp/ggml-opencl/kernels/gemv_moe_q5_1_f32_ns.cl b/cpp/ggml-opencl/kernels/gemv_moe_q5_1_f32_ns.cl
index f33a4ef2..7a666006 100644
--- a/cpp/ggml-opencl/kernels/gemv_moe_q5_1_f32_ns.cl
+++ b/cpp/ggml-opencl/kernels/gemv_moe_q5_1_f32_ns.cl
@@ -39,6 +39,10 @@ __kernel void kernel_gemv_moe_q5_1_f32_ns(
     uint sgid = get_local_id(1);
     uint slid = get_sub_group_local_id();
 
+    if (i01 >= ne01) {
+        return;
+    }
+
     uint i11 = i20 % ne11;
 
     uint expert_id = src2[i20];
diff --git a/cpp/ggml-opencl/kernels/gemv_moe_q5_k_f32_ns.cl b/cpp/ggml-opencl/kernels/gemv_moe_q5_k_f32_ns.cl
index f128d443..7d868d7a 100644
--- a/cpp/ggml-opencl/kernels/gemv_moe_q5_k_f32_ns.cl
+++ b/cpp/ggml-opencl/kernels/gemv_moe_q5_k_f32_ns.cl
@@ -55,6 +55,10 @@ __kernel void kernel_gemv_moe_q5_k_f32_ns(
     uint sgid = get_local_id(1);
     uint slid = get_sub_group_local_id();
 
+    if (i01 >= ne01) {
+        return;
+    }
+
     uint i11 = i20 % ne11;
 
     uint expert_id = src2[i20];
diff --git a/cpp/ggml-opencl/kernels/gemv_moe_q6_k_f32_ns.cl b/cpp/ggml-opencl/kernels/gemv_moe_q6_k_f32_ns.cl
index 526e609d..c166bad5 100644
--- a/cpp/ggml-opencl/kernels/gemv_moe_q6_k_f32_ns.cl
+++ b/cpp/ggml-opencl/kernels/gemv_moe_q6_k_f32_ns.cl
@@ -38,6 +38,10 @@ __kernel void kernel_gemv_moe_q6_k_f32_ns(
     uint sgid = get_local_id(1);
     uint slid = get_sub_group_local_id();
 
+    if (i01 >= ne01) {
+        return;
+    }
+
     uint i11 = i20 % ne11;
 
     uint expert_id = src2[i20];
diff --git a/cpp/llama-chat.cpp b/cpp/llama-chat.cpp
index 6554a89b..f1039774 100644
--- a/cpp/llama-chat.cpp
+++ b/cpp/llama-chat.cpp
@@ -73,7 +73,7 @@ static const std::map<std::string, llm_chat_template> LLM_CHAT_TEMPLATES = {
     { "hunyuan-moe",       LLM_CHAT_TEMPLATE_HUNYUAN_MOE       },
     { "gpt-oss",           LLM_CHAT_TEMPLATE_OPENAI_MOE        },
     { "hunyuan-dense",     LLM_CHAT_TEMPLATE_HUNYUAN_DENSE     },
-    { "hunyuan-ocr",       LLM_CHAT_TEMPLATE_HUNYUAN_OCR       },
+    { "hunyuan-vl",        LLM_CHAT_TEMPLATE_HUNYUAN_VL        },
     { "kimi-k2",           LLM_CHAT_TEMPLATE_KIMI_K2           },
     { "seed_oss",          LLM_CHAT_TEMPLATE_SEED_OSS          },
     { "grok-2",            LLM_CHAT_TEMPLATE_GROK_2            },
@@ -218,7 +218,7 @@ llm_chat_template llm_chat_detect_template(const std::string & tmpl) {
     } else if (tmpl_contains("<|start|>") && tmpl_contains("<|channel|>")) {
         return LLM_CHAT_TEMPLATE_OPENAI_MOE;
     } else if (tmpl_contains("<｜hy_Assistant｜>") && tmpl_contains("<｜hy_begin▁of▁sentence｜>")) {
-        return LLM_CHAT_TEMPLATE_HUNYUAN_OCR;
+        return LLM_CHAT_TEMPLATE_HUNYUAN_VL;
     } else if (tmpl_contains("<｜hy_Assistant｜>") && tmpl_contains("<｜hy_place▁holder▁no▁3｜>")) {
         return LLM_CHAT_TEMPLATE_HUNYUAN_DENSE;
     } else if (tmpl_contains("<|im_assistant|>assistant<|im_middle|>")) {
@@ -825,8 +825,8 @@ int32_t llm_chat_apply_template(
                 ss << "<｜hy_User｜>" << chat[i]->content << "<｜hy_Assistant｜>";
             }
         }
-    } else if (tmpl == LLM_CHAT_TEMPLATE_HUNYUAN_OCR) {
-        // tencent/HunyuanOCR
+    } else if (tmpl == LLM_CHAT_TEMPLATE_HUNYUAN_VL) {
+        // tencent/HunyuanOCR & tencent/HunyuanVL
         ss << "<｜hy_begin▁of▁sentence｜>";
         for (size_t i = 0; i < chat.size(); i++) {
             std::string role(chat[i]->role);
diff --git a/cpp/llama-chat.h b/cpp/llama-chat.h
index 13f936a9..ea6540c0 100644
--- a/cpp/llama-chat.h
+++ b/cpp/llama-chat.h
@@ -53,7 +53,7 @@ enum llm_chat_template {
     LLM_CHAT_TEMPLATE_HUNYUAN_MOE,
     LLM_CHAT_TEMPLATE_OPENAI_MOE,
     LLM_CHAT_TEMPLATE_HUNYUAN_DENSE,
-    LLM_CHAT_TEMPLATE_HUNYUAN_OCR,
+    LLM_CHAT_TEMPLATE_HUNYUAN_VL,
     LLM_CHAT_TEMPLATE_KIMI_K2,
     LLM_CHAT_TEMPLATE_SEED_OSS,
     LLM_CHAT_TEMPLATE_GROK_2,
diff --git a/cpp/llama-context.cpp b/cpp/llama-context.cpp
index 09ecb6e4..c839fdc2 100644
--- a/cpp/llama-context.cpp
+++ b/cpp/llama-context.cpp
@@ -1137,6 +1137,19 @@ bool llama_context::set_sampler(llama_seq_id seq_id, llama_sampler * sampler) {
 
     LLAMA_LOG_DEBUG("%s: seq_id = %d, sampler = %p\n", __func__, (int) seq_id, (void *) sampler);
 
+    if (sampler && model.split_mode() == LLAMA_SPLIT_MODE_TENSOR) {
+        static bool warned = false;
+        if (!warned) {
+            LLAMA_LOG_WARN("%s: backend sampling not supported with SPLIT_MODE_TENSOR; using CPU\n", __func__);
+            warned = true;
+        }
+        if (sampling.samplers.count(seq_id) > 0) {
+            sched_need_reserve = true;
+        }
+        sampling.samplers.erase(seq_id);
+        return false;
+    }
+
     const bool can_offload =
         sampler &&
         sampler->iface->backend_init &&
diff --git a/cpp/llama-graph.cpp b/cpp/llama-graph.cpp
index f8b64a2c..ddce7b3b 100644
--- a/cpp/llama-graph.cpp
+++ b/cpp/llama-graph.cpp
@@ -500,15 +500,21 @@ bool llm_graph_input_attn_k::can_reuse(const llm_graph_params & params) {
 }
 
 void llm_graph_input_attn_kv_iswa::set_input(const llama_ubatch * ubatch) {
-    mctx->get_base()->set_input_k_idxs(self_k_idxs, ubatch);
-    mctx->get_base()->set_input_v_idxs(self_v_idxs, ubatch);
+    // base tensors may not be allocated if there are no non-SWA attention layers
+    if (self_k_idxs && self_k_idxs->buffer) {
+        mctx->get_base()->set_input_k_idxs(self_k_idxs, ubatch);
+        mctx->get_base()->set_input_v_idxs(self_v_idxs, ubatch);
 
-    mctx->get_base()->set_input_kq_mask(self_kq_mask, ubatch, cparams.causal_attn);
+        mctx->get_base()->set_input_kq_mask(self_kq_mask, ubatch, cparams.causal_attn);
+    }
 
-    mctx->get_swa()->set_input_k_idxs(self_k_idxs_swa, ubatch);
-    mctx->get_swa()->set_input_v_idxs(self_v_idxs_swa, ubatch);
+    // swa tensors may not be allocated if there are no SWA attention layers
+    if (self_k_idxs_swa && self_k_idxs_swa->buffer) {
+        mctx->get_swa()->set_input_k_idxs(self_k_idxs_swa, ubatch);
+        mctx->get_swa()->set_input_v_idxs(self_v_idxs_swa, ubatch);
 
-    mctx->get_swa()->set_input_kq_mask(self_kq_mask_swa, ubatch, cparams.causal_attn);
+        mctx->get_swa()->set_input_kq_mask(self_kq_mask_swa, ubatch, cparams.causal_attn);
+    }
 
     if (self_k_rot) {
         mctx->get_base()->set_input_k_rot(self_k_rot);
@@ -534,14 +540,21 @@ bool llm_graph_input_attn_kv_iswa::can_reuse(const llm_graph_params & params) {
 
     bool res = true;
 
-    res &= self_k_idxs->ne[0] == params.ubatch.n_tokens;
-  //res &= self_v_idxs->ne[0] == params.ubatch.n_tokens; // TODO: need to move this to the unified cache and check there
+    // base tensors may not be allocated if there are no non-SWA attention layers
+    if (self_k_idxs && self_k_idxs->buffer) {
+        res &= self_k_idxs->ne[0] == params.ubatch.n_tokens;
+      //res &= self_v_idxs->ne[0] == params.ubatch.n_tokens; // TODO: need to move this to the unified cache and check there
 
-    res &= self_k_idxs_swa->ne[0] == params.ubatch.n_tokens;
-  //res &= self_v_idxs_swa->ne[0] == params.ubatch.n_tokens; // TODO: need to move this to the unified cache and check there
+        res &= can_reuse_kq_mask(self_kq_mask, mctx->get_base(), params.ubatch, params.cparams);
+    }
 
-    res &= can_reuse_kq_mask(self_kq_mask,     mctx->get_base(), params.ubatch, params.cparams);
-    res &= can_reuse_kq_mask(self_kq_mask_swa, mctx->get_swa(),  params.ubatch, params.cparams);
+    // swa tensors may not be allocated if there are no SWA attention layers
+    if (self_k_idxs_swa && self_k_idxs_swa->buffer) {
+        res &= self_k_idxs_swa->ne[0] == params.ubatch.n_tokens;
+      //res &= self_v_idxs_swa->ne[0] == params.ubatch.n_tokens; // TODO: need to move this to the unified cache and check there
+
+        res &= can_reuse_kq_mask(self_kq_mask_swa, mctx->get_swa(), params.ubatch, params.cparams);
+    }
 
     return res;
 }
diff --git a/cpp/llama-model.cpp b/cpp/llama-model.cpp
index 9cc2c7f2..be752717 100644
--- a/cpp/llama-model.cpp
+++ b/cpp/llama-model.cpp
@@ -1334,6 +1334,12 @@ bool llama_model_base::load_tensors(llama_model_loader & ml) {
             if (!layer.ssm_beta_s && layer.ssm_beta) {
                 layer.ssm_beta_s = create_tensor(tn(LLM_TENSOR_SSM_BETA, "scale", i), {1}, TENSOR_NOT_REQUIRED);
             }
+            if (!layer.nextn.eh_proj_s && layer.nextn.eh_proj) {
+                layer.nextn.eh_proj_s = create_tensor(tn(LLM_TENSOR_NEXTN_EH_PROJ, "scale", i), {1}, TENSOR_NOT_REQUIRED);
+            }
+            if (!layer.nextn.shared_head_head_s && layer.nextn.shared_head_head) {
+                layer.nextn.shared_head_head_s = create_tensor(tn(LLM_TENSOR_NEXTN_SHARED_HEAD_HEAD, "scale", i), {1}, TENSOR_NOT_REQUIRED);
+            }
 
             // input scales
             if (!layer.wq_in_s && layer.wq) {
@@ -1393,6 +1399,12 @@ bool llama_model_base::load_tensors(llama_model_loader & ml) {
             if (!layer.ssm_beta_in_s && layer.ssm_beta) {
                 layer.ssm_beta_in_s = create_tensor(tn(LLM_TENSOR_SSM_BETA, "input_scale", i), {1}, TENSOR_NOT_REQUIRED);
             }
+            if (!layer.nextn.eh_proj_in_s && layer.nextn.eh_proj) {
+                layer.nextn.eh_proj_in_s = create_tensor(tn(LLM_TENSOR_NEXTN_EH_PROJ, "input_scale", i), {1}, TENSOR_NOT_REQUIRED);
+            }
+            if (!layer.nextn.shared_head_head_in_s && layer.nextn.shared_head_head) {
+                layer.nextn.shared_head_head_in_s = create_tensor(tn(LLM_TENSOR_NEXTN_SHARED_HEAD_HEAD, "input_scale", i), {1}, TENSOR_NOT_REQUIRED);
+            }
         }
         // output scales
         if (output && output->type == LM_GGML_TYPE_NVFP4) {
diff --git a/cpp/llama-model.h b/cpp/llama-model.h
index 65fdd053..19a62c25 100644
--- a/cpp/llama-model.h
+++ b/cpp/llama-model.h
@@ -202,12 +202,16 @@ struct llama_layer_shortconv {
 };
 
 struct llama_layer_nextn {
-    struct lm_ggml_tensor * eh_proj          = nullptr;
-    struct lm_ggml_tensor * embed_tokens     = nullptr;
-    struct lm_ggml_tensor * enorm            = nullptr;
-    struct lm_ggml_tensor * hnorm            = nullptr;
-    struct lm_ggml_tensor * shared_head_head = nullptr;
-    struct lm_ggml_tensor * shared_head_norm = nullptr;
+    struct lm_ggml_tensor * eh_proj               = nullptr;
+    struct lm_ggml_tensor * eh_proj_s             = nullptr;
+    struct lm_ggml_tensor * eh_proj_in_s          = nullptr;
+    struct lm_ggml_tensor * embed_tokens          = nullptr;
+    struct lm_ggml_tensor * enorm                 = nullptr;
+    struct lm_ggml_tensor * hnorm                 = nullptr;
+    struct lm_ggml_tensor * shared_head_head      = nullptr;
+    struct lm_ggml_tensor * shared_head_head_s    = nullptr;
+    struct lm_ggml_tensor * shared_head_head_in_s = nullptr;
+    struct lm_ggml_tensor * shared_head_norm      = nullptr;
 };
 
 struct llama_layer {
diff --git a/cpp/llama-vocab.cpp b/cpp/llama-vocab.cpp
index 55d6de38..c84b3076 100644
--- a/cpp/llama-vocab.cpp
+++ b/cpp/llama-vocab.cpp
@@ -530,6 +530,8 @@ struct llm_tokenizer_bpe : llm_tokenizer {
 struct llm_tokenizer_bpe_session {
     llm_tokenizer_bpe_session(const llama_vocab & vocab, const llm_tokenizer_bpe & tokenizer) : vocab(vocab), tokenizer(tokenizer) {}
 
+    virtual ~llm_tokenizer_bpe_session() = default;
+
     static void append(const llama_token token_id, std::vector<llama_token> & output)  {
         output.push_back(token_id);
     }
@@ -567,7 +569,7 @@ struct llm_tokenizer_bpe_session {
         }
     }
 
-    void tokenize(const std::string & text, std::vector<llama_token> & output) {
+    virtual void tokenize(const std::string & text, std::vector<llama_token> & output) {
         int final_prev_index = -1;
         const auto word_collection = unicode_regex_split(text, tokenizer.regex_exprs, tokenizer.byte_encode);
 
@@ -1579,6 +1581,88 @@ struct llm_tokenizer_plamo2_session {
     const llm_tokenizer_plamo2 & tokenizer;
 };
 
+// reserved suffix (U+E000) that keeps DNA k-mers distinct from identical
+// base-vocab BPE tokens (e.g. CCCCCC) in token_to_id; erased from id_to_token
+// text at load
+static const std::string dna_kmer_marker = "\xee\x80\x80";
+
+struct llm_tokenizer_hybriddna_session : llm_tokenizer_bpe_session {
+    llm_tokenizer_hybriddna_session(const llama_vocab & vocab, const llm_tokenizer_bpe & tokenizer) : llm_tokenizer_bpe_session{vocab, tokenizer}, vocab{vocab} {}
+
+    void tokenize(const std::string & text, std::vector<llama_token> & output) override {
+        static const std::string open_tag  = "<dna>";
+        static const std::string close_tag = "</dna>";
+
+        const auto dna_begin_id = vocab.text_to_token(open_tag);
+        const auto dna_end_id   = vocab.text_to_token(close_tag);
+        const auto dna_oov_id   = vocab.text_to_token("<oov>");
+
+        // Fall back to plain BPE if the DNA pieces aren't in the vocab.
+        if (dna_begin_id == LLAMA_TOKEN_NULL || dna_end_id == LLAMA_TOKEN_NULL || dna_oov_id == LLAMA_TOKEN_NULL) {
+            llm_tokenizer_bpe_session::tokenize(text, output);
+            return;
+        }
+
+        const size_t k = 6;
+        size_t pos = 0;
+
+        while (pos < text.size()) {
+            const size_t start = text.find(open_tag, pos);
+            if (start == std::string::npos) {
+                if (pos < text.size()) {
+                    llm_tokenizer_bpe_session::tokenize(text.substr(pos), output);
+                }
+                break;
+            }
+            if (start > pos) {
+                llm_tokenizer_bpe_session::tokenize(text.substr(pos, start - pos), output);
+            }
+            output.push_back(dna_begin_id);
+
+            const size_t content_start = start + open_tag.size();
+            const size_t end           = text.find(close_tag, content_start);
+            const size_t content_end   = (end == std::string::npos) ? text.size() : end;
+
+            emit_dna_kmers(text.substr(content_start, content_end - content_start), k, dna_oov_id, output);
+
+            if (end == std::string::npos) {
+                break;
+            }
+            output.push_back(dna_end_id);
+            pos = end + close_tag.size();
+        }
+    }
+
+private:
+    void emit_dna_kmers(const std::string & raw, size_t k, llama_token oov_id, std::vector<llama_token> & output) {
+        std::string seq = raw;
+        for (char & c : seq) {
+            if (c >= 'a' && c <= 'z') {
+                c = char(c - 32);
+            }
+        }
+
+        // k-mers carry the reserved marker suffix; a non-ACGT k-mer simply
+        // isn't in the vocab and falls back to <oov>
+        auto kmer_token = [&](const std::string & kmer) {
+            const auto tok = vocab.text_to_token(kmer + dna_kmer_marker);
+            return tok != LLAMA_TOKEN_NULL ? tok : oov_id;
+        };
+
+        size_t i = 0;
+        for (; i + k <= seq.size(); i += k) {
+            output.push_back(kmer_token(seq.substr(i, k)));
+        }
+        if (i < seq.size()) {
+            std::string kmer = seq.substr(i);
+            kmer.append(k - kmer.size(), 'A');
+            output.push_back(kmer_token(kmer));
+        }
+    }
+
+    const llama_vocab & vocab;
+};
+
 //
 // impl
 //
@@ -1808,7 +1892,7 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
             special_mask_id = 103;
 
             add_sep = true;
-        } else if (tokenizer_model == "gpt2") {
+        } else if (tokenizer_model == "gpt2" || tokenizer_model == "hybriddna") {
             type = LLAMA_VOCAB_TYPE_BPE;
 
             // read bpe merges and populate bpe ranks
@@ -2266,6 +2350,23 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
     }
     LM_GGML_ASSERT(id_to_token.size() == token_to_id.size());
 
+    // hybriddna: the marker suffix kept k-mer ids distinct in token_to_id; erase
+    // it from id_to_token so the k-mers detokenize to the bare DNA sequence. The
+    // k-mers are the block right after <oov>, so only scan from there.
+    if (tokenizer_model == "hybriddna") {
+        const auto idx = token_to_id.find("<oov>");
+        if (idx != token_to_id.end()) {
+            auto it = id_to_token.begin() + idx->second + 1;
+            for (; it != id_to_token.end(); ++it) {
+                std::string & text = it->text;
+                if (text.size() > dna_kmer_marker.size()
+                        && text.compare(text.size() - dna_kmer_marker.size(), dna_kmer_marker.size(), dna_kmer_marker) == 0) {
+                    text.erase(text.size() - dna_kmer_marker.size());
+                }
+            }
+        }
+    }
+
     init_tokenizer(type);
 
     // determine the newline token: LLaMA "<0x0A>" == 10 == '\n', Falcon 193 == '\n'
@@ -3144,11 +3245,19 @@ std::vector<llama_token> llama_vocab::impl::tokenize(
             } break;
         case LLAMA_VOCAB_TYPE_BPE:
             {
-                llm_tokenizer_bpe_session session(vocab, *static_cast<const llm_tokenizer_bpe *>(tokenizer.get()));
                 // it calls some other methods that are not exist in llm_tokenizer,
                 // here just cast it to bpe tokenizer object
+                const llm_tokenizer_bpe * tok_bpe = static_cast<const llm_tokenizer_bpe *>(tokenizer.get());
+
+                std::unique_ptr<llm_tokenizer_bpe_session> session;
+                if (vocab.get_tokenizer_model() == "hybriddna") {
+                    session = std::make_unique<llm_tokenizer_hybriddna_session>(vocab, *tok_bpe);
+                } else {
+                    session = std::make_unique<llm_tokenizer_bpe_session>(vocab, *tok_bpe);
+                }
+
                 if (add_special) {
-                    session.append_bos(output);
+                    session->append_bos(output);
                 }
                 for (const auto & fragment : fragment_buffer) {
                     if (fragment.type == FRAGMENT_BUFFER_VARIANT_TYPE_RAW_TEXT) {
@@ -3161,15 +3270,15 @@ std::vector<llama_token> llama_vocab::impl::tokenize(
 #ifdef PRETOKENIZERDEBUG
                         LLAMA_LOG_WARN("TT: (%ld %ld %ld) '%s'\n", text.length(), fragment.offset, fragment.length, text.c_str());
 #endif
-                        session.tokenize(text, output);
+                        session->tokenize(text, output);
                     } else { // if (fragment.type == FRAGMENT_BUFFER_VARIANT_TYPE_TOKEN)
-                        session.append(fragment.token, output);
+                        session->append(fragment.token, output);
                     }
                 }
 
                 if (add_special) {
-                    session.append_eos(output);
-                    session.check_double_bos_eos(output);
+                    session->append_eos(output);
+                    session->check_double_bos_eos(output);
                 }
             } break;
         case LLAMA_VOCAB_TYPE_WPM:
diff --git a/cpp/models/qwen35.cpp b/cpp/models/qwen35.cpp
index 8c78cd17..8bab76a5 100644
--- a/cpp/models/qwen35.cpp
+++ b/cpp/models/qwen35.cpp
@@ -525,8 +525,9 @@ llama_model_qwen35::graph_mtp::graph_mtp(const llama_model & model, const llm_gr
 
     res->add_input(std::move(inp));
 
-    lm_ggml_tensor * inp_pos = build_inp_pos();
-    auto * inp_attn       = build_attn_inp_kv();
+    lm_ggml_tensor * inp_pos     = build_inp_pos();
+    lm_ggml_tensor * inp_out_ids = build_inp_out_ids();
+    auto * inp_attn           = build_attn_inp_kv();
 
     lm_ggml_tensor * h_norm = build_norm(h_input, layer.nextn.hnorm, nullptr, LLM_NORM_RMS, il);
     cb(h_norm, "mtp_hnorm", il);
@@ -537,7 +538,7 @@ llama_model_qwen35::graph_mtp::graph_mtp(const llama_model & model, const llm_gr
     lm_ggml_tensor * concat = lm_ggml_concat(ctx0, e_norm, h_norm, /*dim=*/ 0);
     cb(concat, "mtp_concat", il);
 
-    lm_ggml_tensor * cur = build_lora_mm(layer.nextn.eh_proj, concat);
+    lm_ggml_tensor * cur = build_lora_mm(layer.nextn.eh_proj, concat, layer.nextn.eh_proj_s);
     cb(cur, "mtp_eh_proj", il);
 
     lm_ggml_tensor * inpSA = cur;
@@ -615,6 +616,8 @@ llama_model_qwen35::graph_mtp::graph_mtp(const llama_model & model, const llm_gr
     cb(cur, "h_pre_norm", -1);
     res->t_h_pre_norm = cur;
 
+    cur   = lm_ggml_get_rows(ctx0, cur, inp_out_ids);
+
     lm_ggml_tensor * head_norm_w = layer.nextn.shared_head_norm
             ? layer.nextn.shared_head_norm
             : model.output_norm;
@@ -623,8 +626,9 @@ llama_model_qwen35::graph_mtp::graph_mtp(const llama_model & model, const llm_gr
     cb(cur, "mtp_shared_head_norm", -1);
 
     lm_ggml_tensor * head_w = layer.nextn.shared_head_head ? layer.nextn.shared_head_head : model.output;
+    lm_ggml_tensor * head_s = layer.nextn.shared_head_head ? layer.nextn.shared_head_head_s : model.output_s;
     LM_GGML_ASSERT(head_w && "QWEN35 MTP: missing LM head (nextn.shared_head_head or model.output)");
-    cur = build_lora_mm(head_w, cur);
+    cur = build_lora_mm(head_w, cur, head_s);
     cb(cur, "result_output", -1);
 
     res->t_logits = cur;
diff --git a/cpp/models/qwen35moe.cpp b/cpp/models/qwen35moe.cpp
index ac52e279..e4bdf7e4 100644
--- a/cpp/models/qwen35moe.cpp
+++ b/cpp/models/qwen35moe.cpp
@@ -588,8 +588,10 @@ llama_model_qwen35moe::graph_mtp::graph_mtp(const llama_model & model, const llm
 
     res->add_input(std::move(inp));
 
-    lm_ggml_tensor * inp_pos = build_inp_pos();
-    auto * inp_attn       = build_attn_inp_kv();
+    lm_ggml_tensor * inp_pos     = build_inp_pos();
+    lm_ggml_tensor * inp_out_ids = build_inp_out_ids();
+    auto * inp_attn           = build_attn_inp_kv();
+
 
     lm_ggml_tensor * h_norm = build_norm(h_input, layer.nextn.hnorm, nullptr, LLM_NORM_RMS, il);
     cb(h_norm, "mtp_hnorm", il);
@@ -600,7 +602,7 @@ llama_model_qwen35moe::graph_mtp::graph_mtp(const llama_model & model, const llm
     lm_ggml_tensor * concat = lm_ggml_concat(ctx0, e_norm, h_norm, /*dim=*/ 0);
     cb(concat, "mtp_concat", il);
 
-    lm_ggml_tensor * cur = build_lora_mm(layer.nextn.eh_proj, concat);
+    lm_ggml_tensor * cur = build_lora_mm(layer.nextn.eh_proj, concat, layer.nextn.eh_proj_s);
     cb(cur, "mtp_eh_proj", il);
 
     lm_ggml_tensor * inpSA = cur;
@@ -710,6 +712,8 @@ llama_model_qwen35moe::graph_mtp::graph_mtp(const llama_model & model, const llm
     cb(cur, "h_pre_norm", -1);
     res->t_h_pre_norm = cur;
 
+    cur   = lm_ggml_get_rows(ctx0, cur, inp_out_ids);
+
     lm_ggml_tensor * head_norm_w = layer.nextn.shared_head_norm
             ? layer.nextn.shared_head_norm
             : model.output_norm;
@@ -718,8 +722,9 @@ llama_model_qwen35moe::graph_mtp::graph_mtp(const llama_model & model, const llm
     cb(cur, "mtp_shared_head_norm", -1);
 
     lm_ggml_tensor * head_w = layer.nextn.shared_head_head ? layer.nextn.shared_head_head : model.output;
+    lm_ggml_tensor * head_s = layer.nextn.shared_head_head ? layer.nextn.shared_head_head_s : model.output_s;
     LM_GGML_ASSERT(head_w && "QWEN35MOE MTP: missing LM head (nextn.shared_head_head or model.output)");
-    cur = build_lora_mm(head_w, cur);
+    cur = build_lora_mm(head_w, cur, head_s);
     cb(cur, "result_output", -1);
 
     res->t_logits = cur;
diff --git a/cpp/tools/mtmd/clip-impl.h b/cpp/tools/mtmd/clip-impl.h
index 3c536529..649b4c10 100644
--- a/cpp/tools/mtmd/clip-impl.h
+++ b/cpp/tools/mtmd/clip-impl.h
@@ -170,7 +170,7 @@
 #define TN_TOK_BOI         "v.boi"
 #define TN_TOK_EOI         "v.eoi"
 
-// hunyuanocr / hunyuanvl (shared GGUF tensor names)
+// hunyuanvl (shared GGUF tensor names)
 #define TN_MM_PRE_NORM     "mm.pre_norm.%s"
 #define TN_TOK_IMG_BEGIN   "mm.image_begin"
 #define TN_TOK_IMG_END     "mm.image_end"
@@ -343,7 +343,6 @@ enum projector_type {
     PROJECTOR_TYPE_YASA2,
     PROJECTOR_TYPE_KIMIK25,
     PROJECTOR_TYPE_NEMOTRON_V2_VL,
-    PROJECTOR_TYPE_HUNYUANOCR,
     PROJECTOR_TYPE_HUNYUANVL,
     PROJECTOR_TYPE_MINICPMV4_6,
     PROJECTOR_TYPE_GRANITE_SPEECH,
@@ -393,7 +392,6 @@ static std::map<projector_type, std::string> PROJECTOR_TYPE_NAMES = {
     { PROJECTOR_TYPE_YASA2,     "yasa2"},
     { PROJECTOR_TYPE_KIMIK25,   "kimik25"},
     { PROJECTOR_TYPE_NEMOTRON_V2_VL, "nemotron_v2_vl"},
-    { PROJECTOR_TYPE_HUNYUANOCR, "hunyuanocr"},
     { PROJECTOR_TYPE_HUNYUANVL,  "hunyuanvl"},
     { PROJECTOR_TYPE_MINICPMV4_6, "minicpmv4_6"},
     { PROJECTOR_TYPE_GRANITE_SPEECH, "granite_speech"},
diff --git a/cpp/tools/mtmd/clip-model.h b/cpp/tools/mtmd/clip-model.h
index 52283959..3c7235d4 100644
--- a/cpp/tools/mtmd/clip-model.h
+++ b/cpp/tools/mtmd/clip-model.h
@@ -35,6 +35,16 @@ enum resize_algo {
     // RESIZE_ALGO_LANCZOS, // TODO
 };
 
+// Padding style for img_tool::resize
+//   PAD_NONE    - no padding; direct resize to target dimensions
+//   PAD_CEIL    - aspect-preserving pad (default)
+//   PAD_NEAREST - aspect-preserving pad with nearest-integer rounding (Pillow byte-parity)
+enum pad_style {
+    PAD_NONE,
+    PAD_CEIL,
+    PAD_NEAREST,
+};
+
 struct clip_hparams {
     int32_t image_size = 0;
     int32_t patch_size = 0;
@@ -52,7 +62,7 @@ struct clip_hparams {
     int32_t image_min_pixels = -1;
     int32_t image_max_pixels = -1;
     resize_algo image_resize_algo = RESIZE_ALGO_BICUBIC;
-    bool image_resize_pad = true; // if false, center-crop will be applied when resizing
+    pad_style image_resize_pad = PAD_CEIL; // padding style when resizing
     std::array<uint8_t, 3> image_pad_color = {0, 0, 0};
 
     // (preprocessor) for llava-uhd style models
@@ -61,8 +71,8 @@ struct clip_hparams {
     int32_t preproc_max_tiles = 0;
     resize_algo image_resize_algo_rf = RESIZE_ALGO_BICUBIC;
     resize_algo image_resize_algo_ov = RESIZE_ALGO_BILINEAR;
-    bool image_pad_rf = true;  // if true, refined image will be padded (e.g. llava-1.6)
-    bool image_pad_ov = false; // if true, overview image will be padded (e.g. llava-1.6)
+    pad_style image_pad_rf = PAD_CEIL;  // padding style for the refined image (e.g. llava-1.6)
+    pad_style image_pad_ov = PAD_NONE;  // padding style for the overview image (e.g. llava-1.6)
     std::array<uint8_t, 3> image_pad_color_rf = {0, 0, 0}; // padding color for refined image
     std::array<uint8_t, 3> image_pad_color_ov = {0, 0, 0}; // padding color for overview image
 
@@ -510,7 +520,7 @@ struct clip_model {
     lm_ggml_tensor * mm_boi = nullptr;
     lm_ggml_tensor * mm_eoi = nullptr;
 
-    // hunyuanocr perceiver
+    // hunyuanvl perceiver
     lm_ggml_tensor * mm_pre_norm_w  = nullptr;
     lm_ggml_tensor * mm_img_begin   = nullptr;
     lm_ggml_tensor * mm_img_end     = nullptr;
diff --git a/cpp/tools/mtmd/clip.cpp b/cpp/tools/mtmd/clip.cpp
index bbb8fa64..84021c4a 100644
--- a/cpp/tools/mtmd/clip.cpp
+++ b/cpp/tools/mtmd/clip.cpp
@@ -936,10 +936,9 @@ static lm_ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_
             {
                 builder = std::make_unique<clip_graph_cogvlm>(ctx, img);
             } break;
-        case PROJECTOR_TYPE_HUNYUANOCR:
         case PROJECTOR_TYPE_HUNYUANVL:
             {
-                builder = std::make_unique<clip_graph_hunyuanocr>(ctx, img);
+                builder = std::make_unique<clip_graph_hunyuanvl>(ctx, img);
             } break;
         case PROJECTOR_TYPE_MLP:
         case PROJECTOR_TYPE_MLP_NORM:
@@ -1233,12 +1232,12 @@ struct clip_model_loader {
                         hparams.has_llava_projector = model.proj_type != PROJECTOR_TYPE_COGVLM;
                         hparams.image_pad_color     = {122, 116, 104};
                         if (!hparams.image_res_candidates.empty()) {
-                            hparams.image_resize_pad  = true;
+                            hparams.image_resize_pad  = PAD_CEIL;
                             hparams.image_resize_algo = RESIZE_ALGO_BILINEAR;
                         } else {
                             // llava-1.6 default params
-                            hparams.image_pad_ov         = false;
-                            hparams.image_pad_rf         = true;
+                            hparams.image_pad_ov         = PAD_NONE;
+                            hparams.image_pad_rf         = PAD_CEIL;
                             hparams.image_pad_color_rf   = {122, 116, 104};
                             hparams.image_resize_algo_rf = RESIZE_ALGO_BICUBIC;
                             hparams.image_resize_algo_ov = RESIZE_ALGO_BILINEAR;
@@ -1246,7 +1245,7 @@ struct clip_model_loader {
                     } break;
                 case PROJECTOR_TYPE_GLM_EDGE:
                     {
-                        hparams.image_resize_pad  = true;
+                        hparams.image_resize_pad  = PAD_CEIL;
                         hparams.image_resize_algo = RESIZE_ALGO_BILINEAR;
                     } break;
                 case PROJECTOR_TYPE_MINICPMV:
@@ -1441,7 +1440,7 @@ struct clip_model_loader {
                     {
                         hparams.n_merge = 2;
                         hparams.image_resize_algo = RESIZE_ALGO_BILINEAR;
-                        hparams.image_resize_pad  = false;
+                        hparams.image_resize_pad  = PAD_NONE;
                         get_u32(KEY_SPATIAL_MERGE_SIZE, hparams.n_merge, false);
                         get_u32(KEY_ATTN_WINDOW_SIZE, hparams.attn_window_size, true);
                         std::vector<int> wa_layer_indexes_vec;
@@ -1461,7 +1460,7 @@ struct clip_model_loader {
 
                         // reka model performs better when using resize_bicubic, which stretches
                         // the image to fit fixed square size
-                        hparams.image_resize_pad = false;
+                        hparams.image_resize_pad = PAD_NONE;
                     } break;
                 case PROJECTOR_TYPE_GLM4V:
                     {
@@ -1516,31 +1515,23 @@ struct clip_model_loader {
                         hparams.image_size = 1024;
                         hparams.warmup_image_size = 1024;
                         hparams.image_resize_algo = RESIZE_ALGO_BICUBIC_PILLOW;
-                        hparams.image_pad_color[0] = hparams.image_mean[0];
-                        hparams.image_pad_color[1] = hparams.image_mean[1];
-                        hparams.image_pad_color[2] = hparams.image_mean[2];
+                        hparams.image_pad_color = {127, 127, 127};
 
                         get_u32(KEY_SAM_N_BLOCK, hparams.sam_n_layer, true);
                         get_u32(KEY_SAM_N_HEAD, hparams.sam_n_head, true);
                         get_u32(KEY_SAM_N_EMBD, hparams.sam_n_embd, true);
                         get_u32(KEY_ATTN_WINDOW_SIZE, hparams.attn_window_size, true);
                      } break;
-                case PROJECTOR_TYPE_HUNYUANOCR:
-                    {
-                        hparams.n_merge = 2;
-                        get_u32(KEY_SPATIAL_MERGE_SIZE, hparams.n_merge, false);
-                        get_u32(KEY_IMAGE_MIN_PIXELS, hparams.image_min_pixels);
-                        get_u32(KEY_IMAGE_MAX_PIXELS, hparams.image_max_pixels);
-                        hparams.set_warmup_n_tokens(28*28);
-                    } break;
                 case PROJECTOR_TYPE_HUNYUANVL:
                     {
                         hparams.n_merge = 2;
                         hparams.image_resize_algo = RESIZE_ALGO_BICUBIC_PILLOW;
-                        hparams.image_resize_pad = false;
+                        hparams.image_resize_pad = PAD_NONE;
                         hparams.ffn_op = FFN_GELU;
-                        get_u32(KEY_SPATIAL_MERGE_SIZE, hparams.n_merge, false);
                         hparams.set_limit_image_tokens(256, 16384);
+                        get_u32(KEY_SPATIAL_MERGE_SIZE, hparams.n_merge, false);
+                        get_u32(KEY_IMAGE_MIN_PIXELS, hparams.image_min_pixels, false);
+                        get_u32(KEY_IMAGE_MAX_PIXELS, hparams.image_max_pixels, false);
                         hparams.set_warmup_n_tokens(32*32);
                     } break;
                 case PROJECTOR_TYPE_LFM2A:
@@ -2345,7 +2336,6 @@ struct clip_model_loader {
                     model.mm_boi            = get_tensor(TN_TOK_BOI);
                     model.mm_eoi            = get_tensor(TN_TOK_EOI);
                 } break;
-            case PROJECTOR_TYPE_HUNYUANOCR:
             case PROJECTOR_TYPE_HUNYUANVL:
                 {
                     // proj.0 -> mm.0 (conv1), proj.2 -> mm.2 (conv2), mlp -> mm.model.fc (linear)
@@ -3073,7 +3063,6 @@ int clip_n_output_tokens_x(const struct clip_ctx * ctx, struct clip_image_f32 *
         case PROJECTOR_TYPE_MIMOVL:
         case PROJECTOR_TYPE_GLM4V:
         case PROJECTOR_TYPE_PADDLEOCR:
-        case PROJECTOR_TYPE_HUNYUANOCR:
         case PROJECTOR_TYPE_HUNYUANVL:
         case PROJECTOR_TYPE_YOUTUVL:
             return (img->nx / params.patch_size) / 2;
@@ -3290,7 +3279,6 @@ int clip_n_output_tokens(const struct clip_ctx * ctx, struct clip_image_f32 * im
             int h = static_cast<int>(std::sqrt(static_cast<float>(n_patches)));
             n_patches = h * (h + 1) + 1;
         } break;
-        case PROJECTOR_TYPE_HUNYUANOCR:
         case PROJECTOR_TYPE_HUNYUANVL:
             {
                 int merge = ctx->model.hparams.n_merge;
@@ -3926,7 +3914,6 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima
         case PROJECTOR_TYPE_JANUS_PRO:
         case PROJECTOR_TYPE_PHI4:
         case PROJECTOR_TYPE_COGVLM:
-        case PROJECTOR_TYPE_HUNYUANOCR:
         case PROJECTOR_TYPE_YASA2:
             {
                 // do nothing
@@ -3936,7 +3923,7 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima
                 // Compute the HunyuanVL 2D position embedding on CPU (with the
                 // custom sf=(target+0.1)/n_grid bilinear sampling that the
                 // reference implementation uses) and upload it to the graph
-                // input declared in clip_graph_hunyuanocr::build().
+                // input declared in clip_graph_hunyuanvl::build().
                 LM_GGML_ASSERT(model.position_embeddings != nullptr);
                 lm_ggml_tensor * src_t   = model.position_embeddings;
                 const int64_t n_embd  = src_t->ne[0];
@@ -4257,7 +4244,6 @@ int clip_n_mmproj_embd(const struct clip_ctx * ctx) {
         case PROJECTOR_TYPE_KIMIK25:
         case PROJECTOR_TYPE_YASA2:
             return ctx->model.mm_2_w->ne[1];
-        case PROJECTOR_TYPE_HUNYUANOCR:
         case PROJECTOR_TYPE_HUNYUANVL:
             return ctx->model.mm_model_proj->ne[1];
         case PROJECTOR_TYPE_COGVLM:
diff --git a/cpp/tools/mtmd/models/deepseekocr.cpp b/cpp/tools/mtmd/models/deepseekocr.cpp
index d38be24a..bef593be 100644
--- a/cpp/tools/mtmd/models/deepseekocr.cpp
+++ b/cpp/tools/mtmd/models/deepseekocr.cpp
@@ -88,165 +88,169 @@ static lm_ggml_tensor * get_rel_pos(lm_ggml_context * ctx0,
     return cur;  // [C, k_size, q_size]
 }
 
-lm_ggml_cgraph * clip_graph_deepseekocr::build() {
-    // patch embedding
-    lm_ggml_tensor * inp_raw = build_inp_raw();
 
-    lm_ggml_tensor * sam_out;
+lm_ggml_tensor * clip_graph_deepseekocr::build_sam(lm_ggml_tensor * inp_raw) {
     // Building SAM
-    {
-        const int n_embd  = hparams.sam_n_embd;
-        const int n_layer = hparams.sam_n_layer;
-        const int n_heads = hparams.sam_n_head;
-        const int d_heads = n_embd / n_heads;
-        const int window  = hparams.attn_window_size;
-
-        lm_ggml_tensor * inpL;
-
-        inpL = lm_ggml_conv_2d_sk_p0(ctx0, model.patch_embed_proj_w, inp_raw);
-        inpL = lm_ggml_add(ctx0, inpL, lm_ggml_reshape_3d(ctx0, model.patch_embed_proj_b, 1, 1, n_embd));
-        inpL = lm_ggml_cont(ctx0, lm_ggml_permute(ctx0, inpL, 1, 2, 0, 3));
-
-        lm_ggml_tensor * rel_pos_indices_local;
-        lm_ggml_tensor * rel_pos_indices_global;
-
-        rel_pos_indices_local  = lm_ggml_new_tensor_2d(ctx0, LM_GGML_TYPE_I32, window, window);
-        rel_pos_indices_global = lm_ggml_new_tensor_2d(ctx0, LM_GGML_TYPE_I32, inpL->ne[1], inpL->ne[2]);
-        lm_ggml_set_name(rel_pos_indices_local, "rel_pos_indices_local");
-        lm_ggml_set_name(rel_pos_indices_global, "rel_pos_indices_global");
-        lm_ggml_set_input(rel_pos_indices_local);
-        lm_ggml_set_input(rel_pos_indices_global);
-
-        lm_ggml_tensor * cur;
-        const auto    tgt_size = inpL->ne[1];
-        const auto    str_size = model.pos_embed->ne[1];
-
-        if (str_size != tgt_size) {
-            lm_ggml_tensor * old_pos_embed = nullptr;
-            old_pos_embed               = lm_ggml_cont(ctx0, lm_ggml_permute(ctx0, model.pos_embed, 2, 0, 1, 3));
-            lm_ggml_tensor * new_pos_embed =
-                lm_ggml_interpolate(ctx0, old_pos_embed, tgt_size, tgt_size, n_embd, 1, LM_GGML_SCALE_MODE_BICUBIC);
-            new_pos_embed = lm_ggml_cont(ctx0, lm_ggml_permute(ctx0, new_pos_embed, 1, 2, 0, 3));
-            cur           = lm_ggml_add(ctx0, inpL, new_pos_embed);
-        } else {
-            cur = lm_ggml_add(ctx0, inpL, model.pos_embed);
-        }
+    const int n_embd  = hparams.sam_n_embd;
+    const int n_layer = hparams.sam_n_layer;
+    const int n_heads = hparams.sam_n_head;
+    const int d_heads = n_embd / n_heads;
+    const int window  = hparams.attn_window_size;
 
-        // loop over layers
-        for (int il = 0; il < n_layer; il++) {
-            auto &        layer    = model.sam_layers[il];
-            lm_ggml_tensor * shortcut = cur;
+    lm_ggml_tensor * inpL;
 
-            // layernorm1
-            cur = build_norm(cur, layer.ln_1_w, layer.ln_1_b, NORM_TYPE_NORMAL, eps, il);
+    inpL = lm_ggml_conv_2d_sk_p0(ctx0, model.patch_embed_proj_w, inp_raw);
+    inpL = lm_ggml_add(ctx0, inpL, lm_ggml_reshape_3d(ctx0, model.patch_embed_proj_b, 1, 1, n_embd));
+    inpL = lm_ggml_cont(ctx0, lm_ggml_permute(ctx0, inpL, 1, 2, 0, 3));
 
-            const int64_t w0 = cur->ne[1];
-            const int64_t h0 = cur->ne[2];
+    lm_ggml_tensor * rel_pos_indices_local;
+    lm_ggml_tensor * rel_pos_indices_global;
 
-            lm_ggml_tensor * indices;
+    rel_pos_indices_local  = lm_ggml_new_tensor_2d(ctx0, LM_GGML_TYPE_I32, window, window);
+    rel_pos_indices_global = lm_ggml_new_tensor_2d(ctx0, LM_GGML_TYPE_I32, inpL->ne[1], inpL->ne[2]);
+    lm_ggml_set_name(rel_pos_indices_local, "rel_pos_indices_local");
+    lm_ggml_set_name(rel_pos_indices_global, "rel_pos_indices_global");
+    lm_ggml_set_input(rel_pos_indices_local);
+    lm_ggml_set_input(rel_pos_indices_global);
 
-            if (hparams.is_global_attn(il)) {
-                indices = rel_pos_indices_global;
-            } else {
-                // local attention layer - apply window partition
-                cur     = window_partition(ctx0, cur, window);
-                indices = rel_pos_indices_local;
-            }
+    lm_ggml_tensor * cur;
+    const auto    tgt_size = inpL->ne[1];
+    const auto    str_size = model.pos_embed->ne[1];
+
+    if (str_size != tgt_size) {
+        lm_ggml_tensor * old_pos_embed = nullptr;
+        old_pos_embed               = lm_ggml_cont(ctx0, lm_ggml_permute(ctx0, model.pos_embed, 2, 0, 1, 3));
+        lm_ggml_tensor * new_pos_embed =
+            lm_ggml_interpolate(ctx0, old_pos_embed, tgt_size, tgt_size, n_embd, 1, LM_GGML_SCALE_MODE_BICUBIC);
+        new_pos_embed = lm_ggml_cont(ctx0, lm_ggml_permute(ctx0, new_pos_embed, 1, 2, 0, 3));
+        cur           = lm_ggml_add(ctx0, inpL, new_pos_embed);
+    } else {
+        cur = lm_ggml_add(ctx0, inpL, model.pos_embed);
+    }
 
-            const int64_t W = cur->ne[1];
-            const int64_t H = cur->ne[2];
-            // self-attention
-            {
-                const int B = cur->ne[3];
-
-                cur = lm_ggml_mul_mat(ctx0, layer.qkv_w, cur);
-                cur = lm_ggml_add(ctx0, cur, layer.qkv_b);
-                cur = lm_ggml_cont(ctx0, cur);  // Ensure tensor is contiguous before reshape
-                cur = lm_ggml_reshape_4d(ctx0, cur, n_embd, 3, W * H, B);
-
-                lm_ggml_tensor * Q;
-                lm_ggml_tensor * K;
-                lm_ggml_tensor * V;
-
-                Q = lm_ggml_view_3d(ctx0, cur, n_embd, W * H, B, cur->nb[2], cur->nb[3], 0 * cur->nb[1]);
-                Q = lm_ggml_reshape_4d(ctx0, lm_ggml_cont(ctx0, Q), d_heads, n_heads, W * H, B);
-
-                K = lm_ggml_view_3d(ctx0, cur, n_embd, W * H, B, cur->nb[2], cur->nb[3], 1 * cur->nb[1]);
-                K = lm_ggml_reshape_4d(ctx0, lm_ggml_cont(ctx0, K), d_heads, n_heads, W * H, B);
-
-                V = lm_ggml_view_3d(ctx0, cur, n_embd, W * H, B, cur->nb[2], cur->nb[3], 2 * cur->nb[1]);
-                V = lm_ggml_reshape_4d(ctx0, lm_ggml_cont(ctx0, V), d_heads, n_heads, W * H, B);
-
-                lm_ggml_tensor * mask;
-                lm_ggml_tensor * rw;
-                lm_ggml_tensor * rh;
-                lm_ggml_tensor * qr;
-
-                rw = get_rel_pos(ctx0, layer.rel_pos_w, indices, W, W);  // [W, W, C]
-                rh = get_rel_pos(ctx0, layer.rel_pos_h, indices, H, H);  // [H, H, C]
-                qr = lm_ggml_permute(ctx0, Q, 0, 2, 1, 3);
-                qr = lm_ggml_reshape_4d(ctx0, lm_ggml_cont(ctx0, qr), d_heads, W, H, B * n_heads);
-
-                rw   = lm_ggml_mul_mat(ctx0, rw,
-                                    lm_ggml_cont(ctx0, lm_ggml_permute(ctx0, qr, 0, 2, 1, 3)));  // [B*n_heads, W, H, W]
-                rw   = lm_ggml_cont(ctx0, lm_ggml_permute(ctx0, rw, 0, 2, 1, 3));                // [B*n_heads, H, W, W]
-                rw   = lm_ggml_reshape_4d(ctx0, rw, W, 1, W * H, n_heads * B);
-                rw   = lm_ggml_repeat_4d(ctx0, rw, W, H, W * H, n_heads * B);
-                rh   = lm_ggml_mul_mat(ctx0, rh, qr);  // [B*n_heads, H, W, H]
-                rh   = lm_ggml_reshape_4d(ctx0, rh, 1, H, W * H, n_heads * B);
-                mask = lm_ggml_add(ctx0, rw, rh);      // [B*n_heads, H*W, H, W]
-                mask = lm_ggml_reshape_4d(ctx0, mask, W * H, W * H, n_heads, B);
-                mask = lm_ggml_cast(ctx0, mask, LM_GGML_TYPE_F16);
+    // loop over layers
+    for (int il = 0; il < n_layer; il++) {
+        auto &        layer    = model.sam_layers[il];
+        lm_ggml_tensor * shortcut = cur;
 
-                const float scale = 1.0f / sqrtf(static_cast<float>(d_heads));
+        // layernorm1
+        cur = build_norm(cur, layer.ln_1_w, layer.ln_1_b, NORM_TYPE_NORMAL, eps, il);
 
-                cur = build_attn(layer.o_w, layer.o_b, Q, K, V, mask, scale,
-                                 il);  // [B, H*W, n_embd]
-                cur = lm_ggml_reshape_4d(ctx0, lm_ggml_cont(ctx0, cur), n_embd, W, H, B);
-            }
+        const int64_t w0 = cur->ne[1];
+        const int64_t h0 = cur->ne[2];
 
-            if (hparams.is_global_attn(il) == false) {
-                // local attention layer - reverse window partition
-                cur = window_unpartition(ctx0, cur, w0, h0, window);
-            }
+        lm_ggml_tensor * indices;
 
-            // re-add the layer input, e.g., residual
-            cur = lm_ggml_add(ctx0, cur, shortcut);
+        if (hparams.is_global_attn(il)) {
+            indices = rel_pos_indices_global;
+        } else {
+            // local attention layer - apply window partition
+            cur     = window_partition(ctx0, cur, window);
+            indices = rel_pos_indices_local;
+        }
 
-            lm_ggml_tensor * inpFF = cur;
+        const int64_t W = cur->ne[1];
+        const int64_t H = cur->ne[2];
+        // self-attention
+        {
+            const int B = cur->ne[3];
+
+            cur = lm_ggml_mul_mat(ctx0, layer.qkv_w, cur);
+            cur = lm_ggml_add(ctx0, cur, layer.qkv_b);
+            cur = lm_ggml_cont(ctx0, cur); // Ensure tensor is contiguous before reshape
+            cur = lm_ggml_reshape_4d(ctx0, cur, n_embd, 3, W * H, B);
+
+            lm_ggml_tensor * Q;
+            lm_ggml_tensor * K;
+            lm_ggml_tensor * V;
+
+            Q = lm_ggml_view_3d(ctx0, cur, n_embd, W * H, B, cur->nb[2], cur->nb[3], 0 * cur->nb[1]);
+            Q = lm_ggml_reshape_4d(ctx0, lm_ggml_cont(ctx0, Q), d_heads, n_heads, W * H, B);
+
+            K = lm_ggml_view_3d(ctx0, cur, n_embd, W * H, B, cur->nb[2], cur->nb[3], 1 * cur->nb[1]);
+            K = lm_ggml_reshape_4d(ctx0, lm_ggml_cont(ctx0, K), d_heads, n_heads, W * H, B);
+
+            V = lm_ggml_view_3d(ctx0, cur, n_embd, W * H, B, cur->nb[2], cur->nb[3], 2 * cur->nb[1]);
+            V = lm_ggml_reshape_4d(ctx0, lm_ggml_cont(ctx0, V), d_heads, n_heads, W * H, B);
+
+            lm_ggml_tensor * mask;
+            lm_ggml_tensor * rw;
+            lm_ggml_tensor * rh;
+            lm_ggml_tensor * qr;
+
+            rw = get_rel_pos(ctx0, layer.rel_pos_w, indices, W, W); // [W, W, C]
+            rh = get_rel_pos(ctx0, layer.rel_pos_h, indices, H, H); // [H, H, C]
+            qr = lm_ggml_permute(ctx0, Q, 0, 2, 1, 3);
+            qr = lm_ggml_reshape_4d(ctx0, lm_ggml_cont(ctx0, qr), d_heads, W, H, B * n_heads);
+
+            rw = lm_ggml_mul_mat(ctx0, rw,
+                              lm_ggml_cont(ctx0, lm_ggml_permute(ctx0, qr, 0, 2, 1, 3))); // [B*n_heads, W, H, W]
+            rw   = lm_ggml_cont(ctx0, lm_ggml_permute(ctx0, rw, 0, 2, 1, 3)); // [B*n_heads, H, W, W]
+            rw   = lm_ggml_reshape_4d(ctx0, rw, W, 1, W * H, n_heads * B);
+            rw   = lm_ggml_repeat_4d(ctx0, rw, W, H, W * H, n_heads * B);
+            rh   = lm_ggml_mul_mat(ctx0, rh, qr); // [B*n_heads, H, W, H]
+            rh   = lm_ggml_reshape_4d(ctx0, rh, 1, H, W * H, n_heads * B);
+            mask = lm_ggml_add(ctx0, rw, rh); // [B*n_heads, H*W, H, W]
+            mask = lm_ggml_reshape_4d(ctx0, mask, W * H, W * H, n_heads, B);
+            // casting mask to F16 only required when flash-attn is enabled
+            if (flash_attn_type == CLIP_FLASH_ATTN_TYPE_ENABLED) {
+                mask = lm_ggml_cast(ctx0, mask, LM_GGML_TYPE_F16);
+            }
 
-            // layernorm2
-            cur = build_norm(inpFF, layer.ln_2_w, layer.ln_2_b, NORM_TYPE_NORMAL, eps, il);
+            const float scale = 1.0f / sqrtf(static_cast<float>(d_heads));
 
-            // ffn
-            cur = build_ffn(cur, layer.ff_up_w, layer.ff_up_b, nullptr, nullptr, layer.ff_down_w, layer.ff_down_b,
-                            hparams.ffn_op, il);
+            cur = build_attn(layer.o_w, layer.o_b, Q, K, V, mask, scale,
+                             il); // [B, H*W, n_embd]
+            cur = lm_ggml_reshape_4d(ctx0, lm_ggml_cont(ctx0, cur), n_embd, W, H, B);
+        }
 
-            // residual 2
-            cur = lm_ggml_add(ctx0, cur, inpFF);
-            cb(cur, "sam_layer_out", il);
+        if (hparams.is_global_attn(il) == false) {
+            // local attention layer - reverse window partition
+            cur = window_unpartition(ctx0, cur, w0, h0, window);
         }
 
-        cur = lm_ggml_cont(ctx0, lm_ggml_permute(ctx0, cur, 2, 0, 1, 3));
+        // re-add the layer input, e.g., residual
+        cur = lm_ggml_add(ctx0, cur, shortcut);
 
-        cur = lm_ggml_conv_2d(ctx0, model.neck_0_w, cur, 1, 1, 0, 0, 1, 1);
-        cur = lm_ggml_cont(ctx0, lm_ggml_permute(ctx0, cur, 1, 2, 0, 3));
-        cur = build_norm(cur, model.neck_1_w, model.neck_1_b, NORM_TYPE_NORMAL, hparams.eps, -1);
-        cur = lm_ggml_cont(ctx0, lm_ggml_permute(ctx0, cur, 2, 0, 1, 3));
+        lm_ggml_tensor * inpFF = cur;
 
-        cur = lm_ggml_conv_2d(ctx0, model.neck_2_w, cur, 1, 1, 1, 1, 1, 1);
-        cur = lm_ggml_cont(ctx0, lm_ggml_permute(ctx0, cur, 1, 2, 0, 3));
-        cur = build_norm(cur, model.neck_3_w, model.neck_3_b, NORM_TYPE_NORMAL, hparams.eps, -1);
-        cur = lm_ggml_cont(ctx0, lm_ggml_permute(ctx0, cur, 2, 0, 1, 3));
+        // layernorm2
+        cur = build_norm(inpFF, layer.ln_2_w, layer.ln_2_b, NORM_TYPE_NORMAL, eps, il);
 
-        cur = lm_ggml_conv_2d(ctx0, model.net_2, cur, 2, 2, 1, 1, 1, 1);
-        cur = lm_ggml_conv_2d(ctx0, model.net_3, cur, 2, 2, 1, 1, 1, 1);
-        cb(cur, "sam_output", -1);
+        // ffn
+        cur = build_ffn(cur, layer.ff_up_w, layer.ff_up_b, nullptr, nullptr, layer.ff_down_w, layer.ff_down_b,
+                        hparams.ffn_op, il);
 
-        lm_ggml_build_forward_expand(gf, cur);
-        sam_out = cur;
+        // residual 2
+        cur = lm_ggml_add(ctx0, cur, inpFF);
+        cb(cur, "sam_layer_out", il);
     }
 
+    cur = lm_ggml_cont(ctx0, lm_ggml_permute(ctx0, cur, 2, 0, 1, 3));
+
+    cur = lm_ggml_conv_2d(ctx0, model.neck_0_w, cur, 1, 1, 0, 0, 1, 1);
+    cur = lm_ggml_cont(ctx0, lm_ggml_permute(ctx0, cur, 1, 2, 0, 3));
+    cur = build_norm(cur, model.neck_1_w, model.neck_1_b, NORM_TYPE_NORMAL, hparams.eps, -1);
+    cur = lm_ggml_cont(ctx0, lm_ggml_permute(ctx0, cur, 2, 0, 1, 3));
+
+    cur = lm_ggml_conv_2d(ctx0, model.neck_2_w, cur, 1, 1, 1, 1, 1, 1);
+    cur = lm_ggml_cont(ctx0, lm_ggml_permute(ctx0, cur, 1, 2, 0, 3));
+    cur = build_norm(cur, model.neck_3_w, model.neck_3_b, NORM_TYPE_NORMAL, hparams.eps, -1);
+    cur = lm_ggml_cont(ctx0, lm_ggml_permute(ctx0, cur, 2, 0, 1, 3));
+
+    cur = lm_ggml_conv_2d(ctx0, model.net_2, cur, 2, 2, 1, 1, 1, 1);
+    cur = lm_ggml_conv_2d(ctx0, model.net_3, cur, 2, 2, 1, 1, 1, 1);
+    cb(cur, "sam_output", -1);
+
+    lm_ggml_build_forward_expand(gf, cur);
+    return cur;
+}
+
+lm_ggml_cgraph * clip_graph_deepseekocr::build() {
+    // patch embedding
+    lm_ggml_tensor * inp_raw = build_inp_raw();
+    lm_ggml_tensor * sam_out = build_sam(inp_raw);
+
     lm_ggml_tensor * clip_out;
     // Building DS-OCR CLIP
     {
diff --git a/cpp/tools/mtmd/models/hunyuanocr.cpp b/cpp/tools/mtmd/models/hunyuanvl.cpp
similarity index 70%
rename from cpp/tools/mtmd/models/hunyuanocr.cpp
rename to cpp/tools/mtmd/models/hunyuanvl.cpp
index cede63b6..3b4c2b60 100644
--- a/cpp/tools/mtmd/models/hunyuanocr.cpp
+++ b/cpp/tools/mtmd/models/hunyuanvl.cpp
@@ -1,25 +1,15 @@
 #include "models.h"
 
-lm_ggml_cgraph * clip_graph_hunyuanocr::build() {
+lm_ggml_cgraph * clip_graph_hunyuanvl::build() {
     const int merge = hparams.n_merge;
     const int pw    = n_patches_x;
     const int ph    = n_patches_y;
 
-    // Position embedding interpolation.
-    // HunyuanVL needs scale factors sf=(target+0.1)/n_grid, which the standard
-    // lm_ggml_interpolate cannot express. To avoid adding a new ggml op, the
-    // resize is computed on CPU in clip_image_batch_encode and uploaded here
-    // as a graph input (named "hunyuanvl_pos_embd").
-    // HunyuanOCR uses the same square layout and the standard ratio-based
-    // interpolation provided by resize_position_embeddings().
-    lm_ggml_tensor * pos_embd = nullptr;
-    if (proj_type == PROJECTOR_TYPE_HUNYUANVL && model.position_embeddings) {
-        pos_embd = lm_ggml_new_tensor_2d(ctx0, LM_GGML_TYPE_F32, n_embd, ph * pw);
-        lm_ggml_set_name(pos_embd, "hunyuanvl_pos_embd");
-        lm_ggml_set_input(pos_embd);
-    } else {
-        pos_embd = resize_position_embeddings(LM_GGML_SCALE_MODE_BILINEAR);
-    }
+    // position embedding: declared as a graph input, filled on CPU
+    // by clip_image_batch_encode (see PROJECTOR_TYPE_HUNYUANVL branch there).
+    lm_ggml_tensor * pos_embd = lm_ggml_new_tensor_2d(ctx0, LM_GGML_TYPE_F32, n_embd, ph * pw);
+    lm_ggml_set_name(pos_embd, "hunyuanvl_pos_embd");
+    lm_ggml_set_input(pos_embd);
 
     lm_ggml_tensor * inp = build_inp();
     lm_ggml_tensor * cur = build_vit(inp, n_patches, NORM_TYPE_NORMAL, hparams.ffn_op, pos_embd, nullptr);
diff --git a/cpp/tools/mtmd/models/models.h b/cpp/tools/mtmd/models/models.h
index 29a490ac..a2cc7b95 100644
--- a/cpp/tools/mtmd/models/models.h
+++ b/cpp/tools/mtmd/models/models.h
@@ -118,6 +118,7 @@ struct clip_graph_whisper_enc : clip_graph {
 struct clip_graph_deepseekocr : clip_graph {
     clip_graph_deepseekocr(clip_ctx * ctx, const clip_image_f32 & img) : clip_graph(ctx, img) {}
     lm_ggml_cgraph * build() override;
+    lm_ggml_tensor * build_sam(lm_ggml_tensor * inp); // build the SAM model
 };
 
 struct clip_graph_conformer : clip_graph {
@@ -141,8 +142,8 @@ struct clip_graph_glm4v : clip_graph {
     lm_ggml_cgraph * build() override;
 };
 
-struct clip_graph_hunyuanocr : clip_graph {
-    clip_graph_hunyuanocr(clip_ctx * ctx, const clip_image_f32 & img) : clip_graph(ctx, img) {}
+struct clip_graph_hunyuanvl : clip_graph {
+    clip_graph_hunyuanvl(clip_ctx * ctx, const clip_image_f32 & img) : clip_graph(ctx, img) {}
     lm_ggml_cgraph * build() override;
 };
 
diff --git a/cpp/tools/mtmd/mtmd-image.cpp b/cpp/tools/mtmd/mtmd-image.cpp
index 79c37a1b..1be566ea 100644
--- a/cpp/tools/mtmd/mtmd-image.cpp
+++ b/cpp/tools/mtmd/mtmd-image.cpp
@@ -38,7 +38,7 @@ struct img_tool {
             clip_image_u8 & dst,
             const clip_image_size & target_resolution,
             resize_algo algo,
-            bool add_padding = true, // TODO: define the behavior for add_padding = false
+            pad_style padding = PAD_CEIL,
             std::array<uint8_t, 3> pad_color = {0, 0, 0}) {
         dst.nx = target_resolution.width;
         dst.ny = target_resolution.height;
@@ -50,7 +50,7 @@ struct img_tool {
             return;
         }
 
-        if (!add_padding) {
+        if (padding == PAD_NONE) {
             // direct resize
             switch (algo) {
                 case RESIZE_ALGO_BILINEAR:
@@ -71,8 +71,15 @@ struct img_tool {
             float scale_w = static_cast<float>(target_resolution.width) / src.nx;
             float scale_h = static_cast<float>(target_resolution.height) / src.ny;
             float scale = std::min(scale_w, scale_h);
-            int new_width  = std::min(static_cast<int>(std::ceil(src.nx * scale)), target_resolution.width);
-            int new_height = std::min(static_cast<int>(std::ceil(src.ny * scale)), target_resolution.height);
+
+            int new_width, new_height;
+            if (padding == PAD_NEAREST) {
+                new_width  = std::min(static_cast<int>(std::round(src.nx * scale)), target_resolution.width);
+                new_height = std::min(static_cast<int>(std::round(src.ny * scale)), target_resolution.height);
+            } else {
+                new_width  = std::min(static_cast<int>(std::ceil(src.nx * scale)), target_resolution.width);
+                new_height = std::min(static_cast<int>(std::ceil(src.ny * scale)), target_resolution.height);
+            }
 
             switch (algo) {
                 case RESIZE_ALGO_BILINEAR:
@@ -91,9 +98,14 @@ struct img_tool {
             // fill dst with pad_color
             fill(dst, pad_color);
 
-            int offset_x = (target_resolution.width  - new_width)  / 2;
-            int offset_y = (target_resolution.height - new_height) / 2;
-
+            int offset_x, offset_y;
+            if (padding == PAD_NEAREST) {
+                offset_x = static_cast<int>(std::round((target_resolution.width  - new_width)  / 2.0f));
+                offset_y = static_cast<int>(std::round((target_resolution.height - new_height) / 2.0f));
+            } else {
+                offset_x = (target_resolution.width  - new_width)  / 2;
+                offset_y = (target_resolution.height - new_height) / 2;
+            }
             composite(dst, resized_image, offset_x, offset_y);
         }
     }
@@ -356,10 +368,10 @@ struct img_tool {
             LM_GGML_ASSERT(inSize > 0 && outSize > 0);
             double support, scale, filterscale;
             double center, ww, ss;
-            int xx, x, ksize, xmin, xmax, xcnt;
+            int xx, x, ksize, xmin, xmax;
 
             // Calculate scaling factor: ratio of input range to output size
-            filterscale = scale = (double)inSize / outSize;
+            filterscale = scale = static_cast<double>(inSize) / outSize;
             // For upsampling (scale < 1), keep filterscale = 1 to maintain filter sharpness
             // For downsampling (scale > 1), widen filter to prevent aliasing
             if (filterscale < 1.0) {
@@ -373,6 +385,7 @@ struct img_tool {
             std::vector<double> pre_weights(outSize * ksize);  // Temporary weights
             bounds.resize(outSize * 2);
 
+
             // For each output pixel, compute its filter coefficients
             for (xx = 0; xx < outSize; xx++) {
                 // Calculate the center position in input space (pixel-center convention: +0.5)
@@ -391,10 +404,10 @@ struct img_tool {
                     xmax = inSize;
                 }
 
-                xcnt = xmax - xmin;
+                xmax -= xmin;
 
                 // Compute filter weights for each contributing input pixel
-                for (x = 0; x < xcnt; x++) {
+                for (x = 0; x < xmax; x++) {
                     // Distance from input pixel center to output pixel center in input space
                     double w = bicubic_filter((x + xmin - center + 0.5) * ss);
                     pre_weights[xx * ksize + x] = w;
@@ -402,7 +415,7 @@ struct img_tool {
                 }
 
                 // Normalize weights to sum to 1.0 (preserves brightness)
-                for (x = 0; x < xcnt; x++) {
+                for (x = 0; x < xmax; x++) {
                     if (ww != 0.0) {
                         pre_weights[xx * ksize + x] /= ww;
                     }
@@ -415,18 +428,27 @@ struct img_tool {
 
                 // Store input pixel range for this output pixel
                 bounds[xx * 2 + 0] = xmin;
-                bounds[xx * 2 + 1] = xcnt;
+                bounds[xx * 2 + 1] = xmax;
             }
 
             // Convert floating-point coefficients to fixed-point integers
             // Formula: int32 = round(float * 2^PRECISION_BITS)
             weights.resize(outSize * ksize);
+
+            const double fxp_scale = std::ldexp(1.0, PRECISION_BITS); // 1.0 * 2^PRECISION_BITS
+
             for (int i = 0; i < outSize * ksize; i++) {
+                double tmp_val = pre_weights[i] * fxp_scale;
                 if (pre_weights[i] < 0) {
-                    weights[i] = static_cast<int32_t>(-0.5 + pre_weights[i] * (1 << PRECISION_BITS));
+                    tmp_val -= 0.5;
                 } else {
-                    weights[i] = static_cast<int32_t>(0.5 + pre_weights[i] * (1 << PRECISION_BITS));
+                    tmp_val += 0.5;
                 }
+                tmp_val = std::round(tmp_val);
+                tmp_val = std::clamp(tmp_val,
+                                     static_cast<double>(std::numeric_limits<int32_t>::min()),
+                                     static_cast<double>(std::numeric_limits<int32_t>::max()));
+                weights[i] = static_cast<int32_t>(tmp_val);
             }
 
             return ksize;
@@ -1083,35 +1105,31 @@ bool mtmd_image_preprocessor_internvl::preprocess(const clip_image_u8 & img, cli
 //
 
 bool mtmd_image_preprocessor_deepseekocr::preprocess(const clip_image_u8 & img, clip_image_f32_batch & output) {
-    const std::vector native_resolutions = {
-        /*512 tiny , 640 small, */ 1024 /* base */, 1280 /* large */
-    };
-    // original image size
-    const clip_image_size original_size{img.nx, img.ny};
-    const int orig_w = original_size.width;
-    const int orig_h = original_size.height;
-    const int orig_area = orig_h * orig_w;
-
-    size_t mode_i = 0;
-    int min_diff = orig_area;
-
-    for (size_t i = 0; i < native_resolutions.size(); i++) {
-        int r = native_resolutions[i];
-        if (std::abs(orig_area - r * r) < min_diff) {
-            mode_i = i;
-            min_diff = std::abs(orig_area - r * r);
+    static constexpr int native_resolutions[] = { 1024 /* base */, 1280 /* large */ };
+    // TODO: support 512 (tiny) and 640 (small) once we have eval data for them
+
+    const int64_t orig_area = static_cast<int64_t>(img.nx) * img.ny;
+
+    size_t  mode_i   = 0;
+    int64_t min_diff = std::numeric_limits<int64_t>::max();
+    for (size_t i = 0; i < std::size(native_resolutions); i++) {
+        const int64_t r    = native_resolutions[i];
+        const int64_t diff = std::abs(orig_area - r * r);
+        if (diff < min_diff) {
+            mode_i   = i;
+            min_diff = diff;
         }
     }
-
-    /* Native Resolution (Base/Large) */
     const int image_size = native_resolutions[mode_i];
 
-    // scaled and padded image
-    clip_image_u8_ptr scaled_img(clip_image_u8_init());
-    img_tool::resize(img, *scaled_img, clip_image_size{image_size, image_size}, hparams.image_resize_algo);
+    // Aspect-preserving fit-and-pad. Pillow bicubic + PAD_NEAREST for
+    // byte-parity with the upstream deepseek-ai/DeepSeek-OCR HF preprocessor.
+    clip_image_u8 padded;
+    img_tool::resize(img, padded, {image_size, image_size}, RESIZE_ALGO_BICUBIC_PILLOW,
+                     PAD_NEAREST, hparams.image_pad_color);
 
     clip_image_f32_ptr res(clip_image_f32_init());
-    img_u8_to_f32(*scaled_img, *res, hparams.image_mean, hparams.image_std);
+    img_u8_to_f32(padded, *res, hparams.image_mean, hparams.image_std);
     output.entries.push_back(std::move(res));
 
     output.grid_x = 1;
@@ -1246,7 +1264,7 @@ clip_image_u8 mtmd_image_preprocessor_step3vl::prepare_image(const clip_image_u8
             std::max(1, static_cast<int>(std::floor(resized.ny * scale))),
         };
         clip_image_u8 scaled;
-        img_tool::resize(resized, scaled, new_size, RESIZE_ALGO_BILINEAR, false);
+        img_tool::resize(resized, scaled, new_size, RESIZE_ALGO_BILINEAR, PAD_NONE);
         resized = std::move(scaled);
     }
 
@@ -1347,7 +1365,7 @@ bool mtmd_image_preprocessor_step3vl::preprocess(const clip_image_u8 & img, clip
     clip_image_u8 img_for_crop = prepared;
     if (instructions.refined_size.width != prepared.nx || instructions.refined_size.height != prepared.ny) {
         clip_image_u8 refined;
-        img_tool::resize(prepared, refined, instructions.refined_size, RESIZE_ALGO_BILINEAR, false);
+        img_tool::resize(prepared, refined, instructions.refined_size, RESIZE_ALGO_BILINEAR, PAD_NONE);
         img_for_crop = std::move(refined);
     }
 
diff --git a/cpp/tools/mtmd/mtmd.cpp b/cpp/tools/mtmd/mtmd.cpp
index db19fcce..79e9bda8 100644
--- a/cpp/tools/mtmd/mtmd.cpp
+++ b/cpp/tools/mtmd/mtmd.cpp
@@ -493,7 +493,6 @@ struct mtmd_context {
                     img_end = "\n"; // prevent empty batch on llama-server
                     image_preproc = std::make_unique<mtmd_image_preprocessor_deepseekocr>(ctx_v);
                 } break;
-            case PROJECTOR_TYPE_HUNYUANOCR:
             case PROJECTOR_TYPE_HUNYUANVL:
                 {
                     // note: these use fullwidth ｜ (U+FF5C) and ▁ (U+2581) to match the tokenizer vocabulary
diff --git a/example/ios/Podfile.lock b/example/ios/Podfile.lock
index b4c6cb9b..d52d75ee 100644
--- a/example/ios/Podfile.lock
+++ b/example/ios/Podfile.lock
@@ -8,7 +8,7 @@ PODS:
   - hermes-engine (0.82.0):
     - hermes-engine/Pre-built (= 0.82.0)
   - hermes-engine/Pre-built (0.82.0)
-  - llama-rn (0.12.2):
+  - llama-rn (0.12.3):
     - boost
     - DoubleConversion
     - fast_float
@@ -3026,7 +3026,7 @@ SPEC CHECKSUMS:
   fmt: bf3b0f2427f5c78a3d39ac34a7dbe72faabf986d
   glog: 5683914934d5b6e4240e497e0f4a3b42d1854183
   hermes-engine: 8642d8f14a548ab718ec112e9bebdfdd154138b5
-  llama-rn: 7c747bc3524474d621a96e5c2e13996958d175d7
+  llama-rn: 3572e98802d66df7bac065705dac4c8b4928d200
   RCT-Folly: 846fda9475e61ec7bcbf8a3fe81edfcaeb090669
   RCTDeprecation: 22bf66112da540a7d40e536366ddd8557934fca1
   RCTRequired: a0ed4dc41b35f79fbb6d8ba320e06882a8c792cf
@@ -3097,7 +3097,7 @@ SPEC CHECKSUMS:
   ReactAppDependencyProvider: c5c4f5280e4ae0f9f4a739c64c4260fe0b3edaf1
   ReactCodegen: 374f1c9242fbdd673b460d358b33860c0cc9d926
   ReactCommon: 25c7f94aee74ddd93a8287756a8ac0830a309544
-  RNAudioAPI: 8f309254a527a858541a692c2ef2db606ad44c14
+  RNAudioAPI: 8a9d346fac228321993ac9016b6a6c70fa9cf9fb
   RNCAsyncStorage: 29f0230e1a25f36c20b05f65e2eb8958d6526e82
   RNCClipboard: f538e2ba34c187a6597c2f17c4faa4e1cafae97c
   RNGestureHandler: f1dd7f92a0faa2868a919ab53bb9d66eb4ebfcf5
@@ -3107,4 +3107,4 @@ SPEC CHECKSUMS:
 
 PODFILE CHECKSUM: 9ae6aa21acbbf582851316ed853d11bbc9718098
 
-COCOAPODS: 1.15.2
+COCOAPODS: 1.16.2
diff --git a/src/version.ts b/src/version.ts
index 8296e69f..6ea85b6d 100644
--- a/src/version.ts
+++ b/src/version.ts
@@ -1,2 +1,2 @@
-export const BUILD_NUMBER = '9254'
-export const BUILD_COMMIT = 'e947228'
+export const BUILD_NUMBER = '9297'
+export const BUILD_COMMIT = 'b0df4c0'
diff --git a/third_party/llama.cpp b/third_party/llama.cpp
index e9472282..b0df4c0c 160000
--- a/third_party/llama.cpp
+++ b/third_party/llama.cpp
@@ -1 +1 @@
-Subproject commit e947228222147356bc7e64154d3439e142481632
+Subproject commit b0df4c0cfd2cda10738056771714a5290dc95454