diff --git a/convert_hf_to_gguf.py b/convert_hf_to_gguf.py
index 42d559dfecf..1ef3262afb0 100755
--- a/convert_hf_to_gguf.py
+++ b/convert_hf_to_gguf.py
@@ -11847,7 +11847,7 @@ def prepare_tensors(self):
                 raise ValueError(f"Unprocessed experts: {experts}")
 
 
-@ModelBase.register("HunYuanDenseV1ForCausalLM", "HunYuanVLForConditionalGeneration")
+@ModelBase.register("HunYuanDenseV1ForCausalLM")
 class HunYuanModel(TextModel):
     model_arch = gguf.MODEL_ARCH.HUNYUAN_DENSE
 
@@ -12020,6 +12020,84 @@ def tensor_force_quant(self, name, new_name, bid, n_dims):
         return super().tensor_force_quant(name, new_name, bid, n_dims)
 
 
+@ModelBase.register("HunYuanVLForConditionalGeneration")
+class HunyuanVLVisionModel(MmprojModel):
+    """Converter for the vision-side (mmproj) tensors of HunYuan-VL checkpoints."""
+
+    def __init__(self, *args, **kwargs):
+        super().__init__(*args, **kwargs)
+        assert self.hparams_vision is not None
+        # A config without "image_size" inherits "max_image_size", or 2048 if that is absent too
+        self.hparams_vision.setdefault("image_size", self.hparams_vision.get("max_image_size", 2048))
+
+    def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
+        """Forward vision tensors to the base mapping; tensors named "model.*" yield nothing."""
+        if name.startswith("model."):
+            return  # text-model tensors go into the LLM gguf file
+
+        if name.startswith("vit.") and "position_embedding" in name:
+            # drop the first row: [16385, n_embd] -> [16384, n_embd]
+            data_torch = data_torch[1:]
+        yield from super().modify_tensors(data_torch, name, bid)
+
+    def tensor_force_quant(self, name: str, new_name: str, bid: int | None, n_dims: int):
+        """Pin mm.mlp.weight to F16 and mm.proj.*.weight to F16 (F16 file type) or F32 (any other)."""
+        if new_name == "mm.mlp.weight":
+            return gguf.GGMLQuantizationType.F16
+        if new_name.endswith(".weight") and "mm.proj." in new_name:
+            keep_f16 = self.ftype == gguf.LlamaFileType.MOSTLY_F16
+            return gguf.GGMLQuantizationType.F16 if keep_f16 else gguf.GGMLQuantizationType.F32
+        return super().tensor_force_quant(name, new_name, bid, n_dims)
+
+    def set_gguf_parameters(self):
+        """Write the projector type, the GELU flag and the optional eps / merge-size keys."""
+        super().set_gguf_parameters()
+        assert self.hparams_vision is not None
+        vision_cfg = self.hparams_vision
+        self.gguf_writer.add_clip_projector_type(gguf.VisionProjectorType.HUNYUANVL)
+        self.gguf_writer.add_vision_use_gelu(True)
+
+        eps = vision_cfg.get("rms_norm_eps")
+        if eps is not None:
+            self.gguf_writer.add_vision_attention_layernorm_eps(eps)
+        merge_size = vision_cfg.get("spatial_merge_size")
+        if merge_size is not None:
+            self.gguf_writer.add_vision_spatial_merge_size(int(merge_size))
+
+
+@ModelBase.register("HunYuanVLForConditionalGeneration")
+class HunyuanVLTextModel(HunYuanModel):
+    """Text-side converter for HunYuan-VL: HunYuanModel plus the xdrope rope metadata."""
+    model_arch = gguf.MODEL_ARCH.HUNYUAN_VL
+
+    def set_gguf_parameters(self):
+        """Write the base parameters, then the rope keys when rope_type is "xdrope"."""
+        super().set_gguf_parameters()
+        rope_cfg = self.rope_parameters
+        if rope_cfg.get("rope_type") != "xdrope":
+            return
+
+        alpha = float(rope_cfg.get("alpha", 50))
+        theta = float(rope_cfg.get("rope_theta", 10000.0))
+        # Raw values are stored; C++ computes: freq_base = base * alpha^(dim/(dim-2))
+        self.gguf_writer.add_rope_freq_base(theta)
+        self.gguf_writer.add_rope_scaling_alpha(alpha)
+        self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.NONE)
+        self.gguf_writer.add_rope_scaling_factor(1)
+        self.gguf_writer.add_rope_scaling_orig_ctx_len(256 * 1024)
+        self.gguf_writer.add_context_length(256 * 1024)
+
+        # xdrope_section says which head-dim slices follow each positional axis; it is
+        # written through the M-RoPE rope_dimension_sections key, zero-padded to 4 entries
+        sections = list(rope_cfg.get("xdrope_section", [])) + [0, 0, 0, 0]
+        self.gguf_writer.add_rope_dimension_sections(sections[:4])
+
+    def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
+        # "vit." tensors are left to HunyuanVLVisionModel and never emitted here
+        if not name.startswith("vit."):
+            yield from super().modify_tensors(data_torch, name, bid)
+
+
 @ModelBase.register("SmolLM3ForCausalLM")
 class SmolLM3Model(LlamaModel):
     model_arch = gguf.MODEL_ARCH.SMOLLM3
diff --git a/ggml/include/ggml.h b/ggml/include/ggml.h
index 703e3783136..e6cac1cd5c9 100644
--- a/ggml/include/ggml.h
+++ b/ggml/include/ggml.h
@@ -2222,6 +2222,7 @@ extern "C" {
     enum ggml_scale_flag {
         GGML_SCALE_FLAG_ALIGN_CORNERS = (1 << 8),
         GGML_SCALE_FLAG_ANTIALIAS     = (1 << 9),
+        GGML_SCALE_FLAG_CUSTOM_SF     = (1 << 10), // use explicit scale factors stored in op_params[1], op_params[2]
     };
 
     // interpolate
@@ -2255,6 +2256,22 @@ extern "C" {
             int64_t               ne3,
             uint32_t              mode); // ggml_scale_mode [ | ggml_scale_flag...]
 
+    // Like ggml_interpolate, but takes explicit scale factors for the first two dimensions
+    // (sf0 for dim 0, sf1 for dim 1) instead of deriving them from the size ratios
+    // ne0/a->ne[0] and ne1/a->ne[1]. Useful when the desired coordinate mapping differs
+    // from the plain ratio (e.g. PyTorch scale_factor=(H+0.1)/n_grid instead of H/n_grid).
+    // sf0 and sf1 must be > 0 (asserted). GGML_SCALE_FLAG_CUSTOM_SF is set internally.
+    GGML_API struct ggml_tensor * ggml_interpolate_sf(
+            struct ggml_context * ctx,
+            struct ggml_tensor  * a,
+            int64_t               ne0,
+            int64_t               ne1,
+            int64_t               ne2,
+            int64_t               ne3,
+            uint32_t              mode, // ggml_scale_mode [ | ggml_scale_flag...]
+            float                 sf0,
+            float                 sf1);
+
     // pad each dimension with zeros: [x, ..., x] -> [x, ..., x, 0, ..., 0]
     GGML_API struct ggml_tensor * ggml_pad(
             struct ggml_context * ctx,
diff --git a/ggml/src/ggml-cpu/ops.cpp b/ggml/src/ggml-cpu/ops.cpp
index a9bc21da6f0..09be07b8922 100644
--- a/ggml/src/ggml-cpu/ops.cpp
+++ b/ggml/src/ggml-cpu/ops.cpp
@@ -7551,6 +7551,11 @@ static void ggml_compute_forward_upscale_f32(
     const int32_t mode_flags = ggml_get_op_params_i32(dst, 0);
     const ggml_scale_mode mode = (ggml_scale_mode) (mode_flags & 0xFF);
 
+    if (mode_flags & GGML_SCALE_FLAG_CUSTOM_SF) {
+        sf0 = ggml_get_op_params_f32(dst, 1);
+        sf1 = ggml_get_op_params_f32(dst, 2);
+    }
+
     if (mode_flags & GGML_SCALE_FLAG_ALIGN_CORNERS) {
         pixel_offset = 0.0f;
         sf0 = ne0 > 1 && ne00 > 1 ? (float)(ne0 - 1) / (ne00 - 1) : sf0;
@@ -7559,13 +7564,13 @@ static void ggml_compute_forward_upscale_f32(
 
     if (mode == GGML_SCALE_MODE_NEAREST) {
         for (int64_t i3 = 0; i3 < ne3; i3++) {
-            const int64_t i03 = i3 / sf3;
+            const int64_t i03 = MIN((int64_t)(i3 / sf3), ne03 - 1);
             for (int64_t i2 = ith; i2 < ne2; i2 += nth) {
-                const int64_t i02 = i2 / sf2;
+                const int64_t i02 = MIN((int64_t)(i2 / sf2), ne02 - 1);
                 for (int64_t i1 = 0; i1 < ne1; i1++) {
-                    const int64_t i01 = i1 / sf1;
+                    const int64_t i01 = MIN((int64_t)(i1 / sf1), ne01 - 1);
                     for (int64_t i0 = 0; i0 < ne0; i0++) {
-                        const int64_t i00 = i0 / sf0;
+                        const int64_t i00 = MIN((int64_t)(i0 / sf0), ne00 - 1);
 
                         const float * x = (float *)((char *) src0->data + i00*nb00 + i01*nb01 + i02*nb02 + i03*nb03);
                               float * y = (float *)((char *)  dst->data +  i0*nb0  +  i1*nb1  +  i2*nb2  +  i3*nb3);
diff --git a/ggml/src/ggml.c b/ggml/src/ggml.c
index eda041f4518..a13fda54832 100644
--- a/ggml/src/ggml.c
+++ b/ggml/src/ggml.c
@@ -5024,6 +5024,25 @@ struct ggml_tensor * ggml_interpolate(
     return ggml_interpolate_impl(ctx, a, ne0, ne1, ne2, ne3, mode);
 }
 
+struct ggml_tensor * ggml_interpolate_sf(
+        struct ggml_context * ctx,
+        struct ggml_tensor  * a,
+        int64_t               ne0,
+        int64_t               ne1,
+        int64_t               ne2,
+        int64_t               ne3,
+        uint32_t              mode,
+        float                 sf0,
+        float                 sf1) {
+    GGML_ASSERT(sf0 > 0.0f && "ggml_interpolate_sf: sf0 must be positive");
+    GGML_ASSERT(sf1 > 0.0f && "ggml_interpolate_sf: sf1 must be positive");
+    const uint32_t mode_sf = mode | GGML_SCALE_FLAG_CUSTOM_SF; // marks op_params[1], op_params[2] as holding sf0, sf1
+    struct ggml_tensor * dst = ggml_interpolate_impl(ctx, a, ne0, ne1, ne2, ne3, mode_sf);
+    ggml_set_op_params_f32(dst, 1, sf0);
+    ggml_set_op_params_f32(dst, 2, sf1);
+    return dst;
+}
+
 // ggml_pad
 
 struct ggml_tensor * ggml_pad(
diff --git a/gguf-py/gguf/constants.py b/gguf-py/gguf/constants.py
index c5297a2f440..4bee58c7e57 100644
--- a/gguf-py/gguf/constants.py
+++ b/gguf-py/gguf/constants.py
@@ -197,6 +197,7 @@ class Rope:
         FREQ_BASE_SWA             = "{arch}.rope.freq_base_swa"
         SCALING_TYPE              = "{arch}.rope.scaling.type"
         SCALING_FACTOR            = "{arch}.rope.scaling.factor"
+        SCALING_ALPHA             = "{arch}.rope.scaling.alpha"
         SCALING_ATTN_FACTOR       = "{arch}.rope.scaling.attn_factor"
         SCALING_ORIG_CTX_LEN      = "{arch}.rope.scaling.original_context_length"
         SCALING_FINETUNED         = "{arch}.rope.scaling.finetuned"
@@ -471,6 +472,7 @@ class MODEL_ARCH(IntEnum):
     ERNIE4_5_MOE     = auto()
     HUNYUAN_MOE      = auto()
     HUNYUAN_DENSE    = auto()
+    HUNYUAN_VL       = auto()
     SMOLLM3          = auto()
     GPT_OSS          = auto()
     LFM2             = auto()
@@ -957,6 +959,7 @@ class MODEL_TENSOR(IntEnum):
     MODEL_ARCH.FALCON_H1:        "falcon-h1",
     MODEL_ARCH.HUNYUAN_MOE:      "hunyuan-moe",
     MODEL_ARCH.HUNYUAN_DENSE:    "hunyuan-dense",
+    MODEL_ARCH.HUNYUAN_VL:       "hunyuan_vl",
     MODEL_ARCH.SMOLLM3:          "smollm3",
     MODEL_ARCH.GPT_OSS:          "gpt-oss",
     MODEL_ARCH.LFM2:             "lfm2",
@@ -3489,6 +3492,22 @@ class MODEL_TENSOR(IntEnum):
         MODEL_TENSOR.FFN_DOWN,
         MODEL_TENSOR.FFN_UP,
     ],
+    MODEL_ARCH.HUNYUAN_VL: [
+        MODEL_TENSOR.TOKEN_EMBD,
+        MODEL_TENSOR.OUTPUT_NORM,
+        MODEL_TENSOR.OUTPUT,
+        MODEL_TENSOR.ATTN_NORM,
+        MODEL_TENSOR.ATTN_Q,
+        MODEL_TENSOR.ATTN_Q_NORM,
+        MODEL_TENSOR.ATTN_K,
+        MODEL_TENSOR.ATTN_K_NORM,
+        MODEL_TENSOR.ATTN_V,
+        MODEL_TENSOR.ATTN_OUT,
+        MODEL_TENSOR.FFN_NORM,
+        MODEL_TENSOR.FFN_GATE,
+        MODEL_TENSOR.FFN_DOWN,
+        MODEL_TENSOR.FFN_UP,
+    ],
     MODEL_ARCH.SMOLLM3: [
         MODEL_TENSOR.TOKEN_EMBD,
         MODEL_TENSOR.OUTPUT_NORM,
@@ -4138,6 +4157,7 @@ class VisionProjectorType:
     YOUTUVL = "youtuvl"
     NEMOTRON_V2_VL = "nemotron_v2_vl"
     HUNYUANOCR     = "hunyuanocr"
+    HUNYUANVL      = "hunyuanvl_merger"
 
 
 # Items here are (block size, type size)
diff --git a/gguf-py/gguf/gguf_writer.py b/gguf-py/gguf/gguf_writer.py
index 90d500dc771..6a81ca37d8c 100644
--- a/gguf-py/gguf/gguf_writer.py
+++ b/gguf-py/gguf/gguf_writer.py
@@ -973,6 +973,9 @@ def add_rope_scaling_type(self, value: RopeScalingType) -> None:
     def add_rope_scaling_factor(self, value: float) -> None:
         self.add_float32(Keys.Rope.SCALING_FACTOR.format(arch=self.arch), value)
 
+    def add_rope_scaling_alpha(self, value: float) -> None:  # writes the float32 key "{arch}.rope.scaling.alpha"
+        self.add_float32(Keys.Rope.SCALING_ALPHA.format(arch=self.arch), value)
+
     def add_rope_scaling_attn_factors(self, value: float) -> None:
         self.add_float32(Keys.Rope.SCALING_ATTN_FACTOR.format(arch=self.arch), value)
 
diff --git a/src/llama-arch.cpp b/src/llama-arch.cpp
index 6904b9c1a64..633a66fc665 100644
--- a/src/llama-arch.cpp
+++ b/src/llama-arch.cpp
@@ -109,6 +109,7 @@ static const std::map<llm_arch, const char *> LLM_ARCH_NAMES = {
     { LLM_ARCH_ERNIE4_5_MOE,     "ernie4_5-moe"     },
     { LLM_ARCH_HUNYUAN_MOE,      "hunyuan-moe"      },
     { LLM_ARCH_HUNYUAN_DENSE,    "hunyuan-dense"    },
+    { LLM_ARCH_HUNYUAN_VL,       "hunyuan_vl"       },
     { LLM_ARCH_SMOLLM3,          "smollm3"          },
     { LLM_ARCH_OPENAI_MOE,       "gpt-oss"          },
     { LLM_ARCH_LFM2,             "lfm2"             },
@@ -250,6 +251,7 @@ static const std::map<llm_kv, const char *> LLM_KV_NAMES = {
     { LLM_KV_ROPE_SCALE_LINEAR,              "%s.rope.scale_linear"                    },
     { LLM_KV_ROPE_SCALING_TYPE,              "%s.rope.scaling.type"                    },
     { LLM_KV_ROPE_SCALING_FACTOR,            "%s.rope.scaling.factor"                  },
+    { LLM_KV_ROPE_SCALING_ALPHA,             "%s.rope.scaling.alpha"                   },
     { LLM_KV_ROPE_SCALING_ATTN_FACTOR,       "%s.rope.scaling.attn_factor"             },
     { LLM_KV_ROPE_SCALING_ORIG_CTX_LEN,      "%s.rope.scaling.original_context_length" },
     { LLM_KV_ROPE_SCALING_FINETUNED,         "%s.rope.scaling.finetuned"               },
diff --git a/src/llama-arch.h b/src/llama-arch.h
index c4aabab7e0c..8f335f5c7b3 100644
--- a/src/llama-arch.h
+++ b/src/llama-arch.h
@@ -113,6 +113,7 @@ enum llm_arch {
     LLM_ARCH_ERNIE4_5_MOE,
     LLM_ARCH_HUNYUAN_MOE,
     LLM_ARCH_HUNYUAN_DENSE,
+    LLM_ARCH_HUNYUAN_VL,
     LLM_ARCH_SMOLLM3,
     LLM_ARCH_OPENAI_MOE,
     LLM_ARCH_LFM2,
@@ -254,6 +255,7 @@ enum llm_kv {
     LLM_KV_ROPE_SCALE_LINEAR,
     LLM_KV_ROPE_SCALING_TYPE,
     LLM_KV_ROPE_SCALING_FACTOR,
+    LLM_KV_ROPE_SCALING_ALPHA,
     LLM_KV_ROPE_SCALING_ATTN_FACTOR,
     LLM_KV_ROPE_SCALING_ORIG_CTX_LEN,
     LLM_KV_ROPE_SCALING_FINETUNED,
diff --git a/src/llama-hparams.cpp b/src/llama-hparams.cpp
index 002d15d415f..c50a647b426 100644
--- a/src/llama-hparams.cpp
+++ b/src/llama-hparams.cpp
@@ -256,3 +256,7 @@ uint32_t llama_hparams::n_layer_kv() const {
 bool llama_hparams::use_mrope() const {
     return rope_sections[0] > 0 && rope_sections[1] > 0;
 }
+
+bool llama_hparams::use_xdrope() const { // stricter than use_mrope(): all four rope sections must be > 0
+    return use_mrope() && rope_sections[2] > 0 && rope_sections[3] > 0;
+}
diff --git a/src/llama-hparams.h b/src/llama-hparams.h
index c2000c77c37..27141310221 100644
--- a/src/llama-hparams.h
+++ b/src/llama-hparams.h
@@ -116,6 +116,7 @@ struct llama_hparams {
     float    rope_freq_base_train_swa  = 10000.0f;
     float    rope_freq_scale_train;
     float    rope_freq_scale_train_swa = 1.0f;
+    float    rope_scaling_alpha        = 0.0f;  // NTK-aware alpha for XDRoPE
 
     uint32_t n_ctx_orig_yarn;
     float    rope_yarn_log_mul = 0.0f;
@@ -351,6 +352,8 @@ struct llama_hparams {
 
 
     bool use_mrope() const;
+
+    bool use_xdrope() const;
 };
 
 static_assert(std::is_trivially_copyable<llama_hparams>::value, "llama_hparams must be trivially copyable");
diff --git a/src/llama-model.cpp b/src/llama-model.cpp
index edbaf52a2f8..e8c272d2d14 100644
--- a/src/llama-model.cpp
+++ b/src/llama-model.cpp
@@ -722,6 +722,13 @@ void llama_model::load_hparams(llama_model_loader & ml) {
     ml.get_key(LLM_KV_EXPERT_GROUP_COUNT,      hparams.n_expert_groups, false);
     ml.get_key(LLM_KV_EXPERT_GROUP_USED_COUNT, hparams.n_group_used,    false);
 
+    if (arch == LLM_ARCH_HUNYUAN_VL || arch == LLM_ARCH_HUNYUAN_DENSE) {
+        if (hparams.n_expert <= 1) {
+            hparams.n_expert      = 0;
+            hparams.n_expert_used = 0;
+        }
+    }
+
     if (arch == LLM_ARCH_WAVTOKENIZER_DEC) {
         ml.get_key(LLM_KV_FEATURES_LENGTH,  hparams.n_embd);
         ml.get_key(LLM_KV_EMBEDDING_LENGTH, hparams.n_embd_out_impl);
@@ -800,6 +807,7 @@ void llama_model::load_hparams(llama_model_loader & ml) {
     hparams.rope_freq_scale_train = ropescale == 0.0f ? 1.0f : 1.0f/ropescale;
 
     ml.get_key(LLM_KV_ROPE_SCALING_ATTN_FACTOR, hparams.rope_attn_factor, false);
+    ml.get_key(LLM_KV_ROPE_SCALING_ALPHA,       hparams.rope_scaling_alpha, false);
 
     // non-transformer models do not have attention heads
     if (hparams.n_head() > 0) {
@@ -2575,9 +2583,23 @@ void llama_model::load_hparams(llama_model_loader & ml) {
                     default: type = LLM_TYPE_UNKNOWN;
                 }
             } break;
+        case LLM_ARCH_HUNYUAN_VL:
         case LLM_ARCH_HUNYUAN_DENSE:
             {
                 ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
+                ml.get_key_or_arr(LLM_KV_ROPE_DIMENSION_SECTIONS, hparams.rope_sections, 4, false);
+
+                // XDRoPE / NTK-aware scaling: base = rope_theta * alpha^(dim / (dim - 2))
+                if (hparams.rope_scaling_alpha > 0.0f) {
+                    const int dim = hparams.n_embd_head_k();
+                    hparams.rope_freq_base_train = hparams.rope_freq_base_train
+                        * powf(hparams.rope_scaling_alpha, (float)dim / (float)(dim - 2));
+                }
+
+                if (hparams.n_expert <= 1) {
+                    hparams.n_expert      = 0;
+                    hparams.n_expert_used = 0;
+                }
 
                 switch (hparams.n_embd) {
                     case 1024: type = LLM_TYPE_0_5B; break;
@@ -6938,6 +6960,7 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
                         layer.ffn_down_shexp = create_tensor(tn(LLM_TENSOR_FFN_DOWN_SHEXP, "weight", i), {n_ff_shexp, n_embd}, 0);
                     }
                 } break;
+            case LLM_ARCH_HUNYUAN_VL:
             case LLM_ARCH_HUNYUAN_DENSE:
                 {
                     tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
@@ -8958,6 +8981,7 @@ ggml_cgraph * llama_model::build_graph(const llm_graph_params & params) const {
             {
                 llm = std::make_unique<llm_build_hunyuan_moe>(*this, params);
             } break;
+        case LLM_ARCH_HUNYUAN_VL:
         case LLM_ARCH_HUNYUAN_DENSE:
             {
                 llm = std::make_unique<llm_build_hunyuan_dense>(*this, params);
@@ -9307,6 +9331,9 @@ llama_rope_type llama_model_rope_type(const llama_model * model) {
         case LLM_ARCH_GLM4_MOE:
             return model->hparams.use_mrope() ? LLAMA_ROPE_TYPE_MROPE : LLAMA_ROPE_TYPE_NEOX;
 
+        case LLM_ARCH_HUNYUAN_VL:
+            return model->hparams.use_mrope() ? LLAMA_ROPE_TYPE_MROPE : LLAMA_ROPE_TYPE_NEOX;
+
         // all model arches should be listed explicitly here
         case LLM_ARCH_UNKNOWN:
             GGML_ABORT("unknown architecture");
diff --git a/src/models/hunyuan-dense.cpp b/src/models/hunyuan-dense.cpp
index 97f5da8ee90..fc954a35067 100644
--- a/src/models/hunyuan-dense.cpp
+++ b/src/models/hunyuan-dense.cpp
@@ -6,6 +6,11 @@ llm_build_hunyuan_dense::llm_build_hunyuan_dense(const llama_model & model, cons
     GGML_ASSERT(n_embd_head == hparams.n_embd_head_k());
     GGML_ASSERT(n_embd_head == n_rot);
 
+    const bool use_xdrope = hparams.use_xdrope();
+
+    int sections[4];
+    std::copy(std::begin(hparams.rope_sections), std::begin(hparams.rope_sections) + 4, sections);
+
     ggml_tensor * cur;
     ggml_tensor * inpL;
 
@@ -37,22 +42,36 @@ llm_build_hunyuan_dense::llm_build_hunyuan_dense(const llama_model & model, cons
             auto [Qcur, Kcur, Vcur] = build_qkv(model.layers[il], cur,
                     n_embd_head, n_head, n_head_kv, il);
 
-            Qcur = ggml_rope_ext(
-                        ctx0, Qcur, inp_pos, rope_factors,
-                        n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
-                        ext_factor, attn_factor, beta_fast, beta_slow
-                        );
+            if (use_xdrope) {
+                Qcur = ggml_rope_multi(
+                            ctx0, Qcur, inp_pos, rope_factors,
+                            n_rot, sections, rope_type, n_ctx_orig, freq_base, freq_scale,
+                            ext_factor, attn_factor, beta_fast, beta_slow
+                            );
+
+                Kcur = ggml_rope_multi(
+                            ctx0, Kcur, inp_pos, rope_factors,
+                            n_rot, sections, rope_type, n_ctx_orig, freq_base, freq_scale,
+                            ext_factor, attn_factor, beta_fast, beta_slow
+                            );
+            } else {
+                Qcur = ggml_rope_ext(
+                            ctx0, Qcur, inp_pos, rope_factors,
+                            n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
+                            ext_factor, attn_factor, beta_fast, beta_slow
+                            );
+
+                Kcur = ggml_rope_ext(
+                            ctx0, Kcur, inp_pos, rope_factors,
+                            n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
+                            ext_factor, attn_factor, beta_fast, beta_slow
+                            );
+            }
 
             cb(Qcur, "Qcur", il);
             cb(Kcur, "Kcur", il);
             cb(Vcur, "Vcur", il);
 
-            Kcur = ggml_rope_ext(
-                        ctx0, Kcur, inp_pos, rope_factors,
-                        n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
-                        ext_factor, attn_factor, beta_fast, beta_slow
-                        );
-
             Kcur = build_norm(Kcur,
                         model.layers[il].attn_k_norm, nullptr,
                         LLM_NORM_RMS, il);
diff --git a/tests/test-backend-ops.cpp b/tests/test-backend-ops.cpp
index 828a9c14a45..d82f3abe128 100644
--- a/tests/test-backend-ops.cpp
+++ b/tests/test-backend-ops.cpp
@@ -5778,6 +5778,40 @@ struct test_interpolate : public test_case {
     }
 };
 
+// GGML_OP_UPSCALE (via ggml_interpolate_sf) - custom scale factors
+struct test_interpolate_sf : public test_case {
+    const ggml_type type;                 // element type of the source tensor
+    const std::array<int64_t, 4> ne;      // shape of the source tensor
+    const std::array<int64_t, 4> ne_tgt;  // requested output shape
+    const ggml_scale_mode mode;           // scale mode, optionally OR-ed with ggml_scale_flag bits
+    const float sf0;                      // explicit scale factor passed for dim 0
+    const float sf1;                      // explicit scale factor passed for dim 1
+
+    std::string vars() override {
+        return VARS_TO_STR6(type, ne, ne_tgt, mode, sf0, sf1);
+    }
+
+    test_interpolate_sf(ggml_type type = GGML_TYPE_F32,
+            std::array<int64_t, 4> ne     = {2, 5, 7, 11},
+            std::array<int64_t, 4> ne_tgt = {5, 7, 11, 13},
+            ggml_scale_mode mode = GGML_SCALE_MODE_BILINEAR,
+            float sf0 = 1.0f,
+            float sf1 = 1.0f)
+        : type(type), ne(ne), ne_tgt(ne_tgt), mode(mode), sf0(sf0), sf1(sf1) {}
+
+    ggml_tensor * build_graph(ggml_context * ctx) override { // graph: a -> ggml_interpolate_sf -> out
+        ggml_tensor * a = ggml_new_tensor(ctx, type, 4, ne.data());
+        ggml_set_name(a, "a");
+
+        ggml_tensor * out = ggml_interpolate_sf(ctx, a,
+            ne_tgt[0], ne_tgt[1], ne_tgt[2], ne_tgt[3],
+            mode, sf0, sf1); // sf0/sf1 are passed separately from the ne_tgt sizes
+        ggml_set_name(out, "out");
+
+        return out;
+    }
+};
+
 // GGML_OP_GROUP_NORM
 struct test_group_norm : public test_case {
     const ggml_type type;
@@ -8443,6 +8477,27 @@ static std::vector<std::unique_ptr<test_case>> make_test_cases_eval() {
         test_cases.emplace_back(new test_interpolate(GGML_TYPE_F32, {4, 1, 3, 2}, {1, 1, 3, 2}, (ggml_scale_mode)(mode | GGML_SCALE_FLAG_ALIGN_CORNERS)));
     }
 
+    // ggml_interpolate_sf - explicit (custom) scale factors for dim 0 and dim 1
+    for (ggml_scale_mode mode : {GGML_SCALE_MODE_NEAREST, GGML_SCALE_MODE_BILINEAR, GGML_SCALE_MODE_BICUBIC, ggml_scale_mode(GGML_SCALE_MODE_BILINEAR | GGML_SCALE_FLAG_ANTIALIAS)}) {
+        test_cases.emplace_back(new test_interpolate_sf(GGML_TYPE_F32, {2, 5,  7, 11}, {5, 7, 11, 13}, mode, 5.0f/2.0f, 7.0f/5.0f));
+        test_cases.emplace_back(new test_interpolate_sf(GGML_TYPE_F32, {5, 7, 11, 13}, {2, 5,  7, 11}, mode, 2.0f/5.0f, 5.0f/7.0f));
+    }
+    for (ggml_scale_mode mode : {GGML_SCALE_MODE_BILINEAR, GGML_SCALE_MODE_BICUBIC}) {
+        test_cases.emplace_back(new test_interpolate_sf(GGML_TYPE_F32, {2, 5, 7, 11}, {5, 7, 11, 13}, (ggml_scale_mode)(mode | GGML_SCALE_FLAG_ALIGN_CORNERS), 5.0f/2.0f, 7.0f/5.0f));
+        test_cases.emplace_back(new test_interpolate_sf(GGML_TYPE_F32, {1, 4, 3, 2}, {2, 8, 3, 2}, (ggml_scale_mode)(mode | GGML_SCALE_FLAG_ALIGN_CORNERS), 2.0f/1.0f, 8.0f/4.0f));
+        test_cases.emplace_back(new test_interpolate_sf(GGML_TYPE_F32, {4, 1, 3, 2}, {1, 1, 3, 2}, (ggml_scale_mode)(mode | GGML_SCALE_FLAG_ALIGN_CORNERS), 1.0f/4.0f, 1.0f/1.0f));
+    }
+    for (ggml_scale_mode mode : {GGML_SCALE_MODE_NEAREST, GGML_SCALE_MODE_BILINEAR, GGML_SCALE_MODE_BICUBIC, ggml_scale_mode(GGML_SCALE_MODE_BILINEAR | GGML_SCALE_FLAG_ANTIALIAS)}) {
+        test_cases.emplace_back(new test_interpolate_sf(GGML_TYPE_F32, {14, 14, 1152, 1}, {28, 28, 1152, 1}, mode, (28.0f+0.1f)/14.0f, (28.0f+0.1f)/14.0f));
+        test_cases.emplace_back(new test_interpolate_sf(GGML_TYPE_F32, {14, 14, 768, 1},  {20, 16,  768, 1}, mode, (20.0f+0.1f)/14.0f, (16.0f+0.1f)/14.0f));
+        test_cases.emplace_back(new test_interpolate_sf(GGML_TYPE_F32, {8, 8, 64, 1}, {8, 8, 64, 1}, mode, 1.0f, 1.0f));
+        test_cases.emplace_back(new test_interpolate_sf(GGML_TYPE_F32, {28, 28, 768, 1}, {14, 14, 768, 1}, mode, (14.0f+0.1f)/28.0f, (14.0f+0.1f)/28.0f));
+    }
+    for (ggml_scale_mode mode : {GGML_SCALE_MODE_NEAREST, GGML_SCALE_MODE_BILINEAR, GGML_SCALE_MODE_BICUBIC, ggml_scale_mode(GGML_SCALE_MODE_BILINEAR | GGML_SCALE_FLAG_ANTIALIAS)}) {
+        test_cases.emplace_back(new test_interpolate_sf(GGML_TYPE_F32, {10, 10, 64, 1}, {20, 20, 64, 1}, mode, 1.5f, 1.5f));
+        test_cases.emplace_back(new test_interpolate_sf(GGML_TYPE_F32, {10, 10, 64, 1}, {20, 16, 64, 1}, mode, 1.5f, 1.2f));
+    }
+
     test_cases.emplace_back(new test_sum());
     test_cases.emplace_back(new test_sum(GGML_TYPE_F32, {11, 5, 6, 3}, {0, 2, 1, 3}));  // row-contiguous but non-contiguous
     test_cases.emplace_back(new test_sum(GGML_TYPE_F32, {11, 5, 6, 3}, {0, 3, 2, 1}));
diff --git a/tools/mtmd/clip-impl.h b/tools/mtmd/clip-impl.h
index 17cb703f7fb..b94deadd47a 100644
--- a/tools/mtmd/clip-impl.h
+++ b/tools/mtmd/clip-impl.h
@@ -150,7 +150,7 @@
 #define TN_TOK_BOI         "v.boi"
 #define TN_TOK_EOI         "v.eoi"
 
-// hunyuanocr
+// hunyuanocr / hunyuanvl (shared GGUF tensor names)
 #define TN_MM_PRE_NORM     "mm.pre_norm.%s"
 #define TN_TOK_IMG_BEGIN   "mm.image_begin"
 #define TN_TOK_IMG_END     "mm.image_end"
@@ -293,6 +293,7 @@ enum projector_type {
     PROJECTOR_TYPE_KIMIK25,
     PROJECTOR_TYPE_NEMOTRON_V2_VL,
     PROJECTOR_TYPE_HUNYUANOCR,
+    PROJECTOR_TYPE_HUNYUANVL,
     PROJECTOR_TYPE_UNKNOWN,
 };
 
@@ -338,6 +339,7 @@ static std::map<projector_type, std::string> PROJECTOR_TYPE_NAMES = {
     { PROJECTOR_TYPE_KIMIK25,   "kimik25"},
     { PROJECTOR_TYPE_NEMOTRON_V2_VL, "nemotron_v2_vl"},
     { PROJECTOR_TYPE_HUNYUANOCR, "hunyuanocr"},
+    { PROJECTOR_TYPE_HUNYUANVL,  "hunyuanvl_merger"},
 };
 
 static projector_type clip_projector_type_from_string(const std::string & str) {
diff --git a/tools/mtmd/clip.cpp b/tools/mtmd/clip.cpp
index f0e8786b660..616f590ab21 100644
--- a/tools/mtmd/clip.cpp
+++ b/tools/mtmd/clip.cpp
@@ -912,6 +912,7 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32
                 builder = std::make_unique<clip_graph_cogvlm>(ctx, img);
             } break;
         case PROJECTOR_TYPE_HUNYUANOCR:
+        case PROJECTOR_TYPE_HUNYUANVL:
             {
                 builder = std::make_unique<clip_graph_hunyuanocr>(ctx, img);
             } break;
@@ -1459,6 +1460,16 @@ struct clip_model_loader {
                         get_u32(KEY_IMAGE_MAX_PIXELS, hparams.image_max_pixels);
                         hparams.set_warmup_n_tokens(28*28);
                     } break;
+                case PROJECTOR_TYPE_HUNYUANVL:
+                    {
+                        hparams.n_merge = 2;
+                        hparams.image_resize_algo = RESIZE_ALGO_BICUBIC_PILLOW;
+                        hparams.image_resize_pad = false;
+                        hparams.ffn_op = FFN_GELU;
+                        get_u32(KEY_SPATIAL_MERGE_SIZE, hparams.n_merge, false);
+                        hparams.set_limit_image_tokens(256, 16384);
+                        hparams.set_warmup_n_tokens(32*32);
+                    } break;
                 case PROJECTOR_TYPE_LFM2A:
                     {
                         // audio preprocessing params
@@ -2159,6 +2170,7 @@ struct clip_model_loader {
                     model.mm_eoi            = get_tensor(TN_TOK_EOI);
                 } break;
             case PROJECTOR_TYPE_HUNYUANOCR:
+            case PROJECTOR_TYPE_HUNYUANVL:
                 {
                     // proj.0 -> mm.0 (conv1), proj.2 -> mm.2 (conv2), mlp -> mm.model.fc (linear)
                     model.mm_0_w            = get_tensor(string_format(TN_LLAVA_PROJ, 0, "weight"));
@@ -2797,6 +2809,7 @@ int clip_n_output_tokens_x(const struct clip_ctx * ctx, struct clip_image_f32 *
         case PROJECTOR_TYPE_GLM4V:
         case PROJECTOR_TYPE_PADDLEOCR:
         case PROJECTOR_TYPE_HUNYUANOCR:
+        case PROJECTOR_TYPE_HUNYUANVL:
         case PROJECTOR_TYPE_YOUTUVL:
             return (img->nx / params.patch_size) / 2;
         case PROJECTOR_TYPE_STEP3VL:
@@ -2816,6 +2829,7 @@ int clip_n_output_tokens_y(const struct clip_ctx * ctx, struct clip_image_f32 *
         case PROJECTOR_TYPE_QWEN3VL:
         case PROJECTOR_TYPE_GLM4V:
         case PROJECTOR_TYPE_PADDLEOCR:
+        case PROJECTOR_TYPE_HUNYUANVL:
         case PROJECTOR_TYPE_YOUTUVL:
             return (img->ny / params.patch_size) / 2;
         case PROJECTOR_TYPE_STEP3VL:
@@ -3003,6 +3017,7 @@ int clip_n_output_tokens(const struct clip_ctx * ctx, struct clip_image_f32 * im
             n_patches = h * (h + 1) + 1;
         } break;
         case PROJECTOR_TYPE_HUNYUANOCR:
+        case PROJECTOR_TYPE_HUNYUANVL:
             {
                 int merge = ctx->model.hparams.n_merge;
                 int ow = (img->nx / patch_size) / merge;
@@ -3463,6 +3478,7 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima
         case PROJECTOR_TYPE_PHI4:
         case PROJECTOR_TYPE_COGVLM:
         case PROJECTOR_TYPE_HUNYUANOCR:
+        case PROJECTOR_TYPE_HUNYUANVL:
             {
                 // do nothing
             } break;
@@ -3691,6 +3707,7 @@ int clip_n_mmproj_embd(const struct clip_ctx * ctx) {
         case PROJECTOR_TYPE_KIMIK25:
             return ctx->model.mm_2_w->ne[1];
         case PROJECTOR_TYPE_HUNYUANOCR:
+        case PROJECTOR_TYPE_HUNYUANVL:
             return ctx->model.mm_model_proj->ne[1];
         case PROJECTOR_TYPE_COGVLM:
             return ctx->model.mm_4h_to_h_w->ne[1];
diff --git a/tools/mtmd/models/hunyuanocr.cpp b/tools/mtmd/models/hunyuanocr.cpp
index 37d1e2b86a9..f13ad45d88c 100644
--- a/tools/mtmd/models/hunyuanocr.cpp
+++ b/tools/mtmd/models/hunyuanocr.cpp
@@ -1,11 +1,35 @@
 #include "models.h"
+#include <cmath>
 
 ggml_cgraph * clip_graph_hunyuanocr::build() {
     const int merge = hparams.n_merge;
     const int pw    = n_patches_x;
     const int ph    = n_patches_y;
 
-    ggml_tensor * pos_embd = resize_position_embeddings(GGML_SCALE_MODE_BILINEAR);
+    // Position embedding interpolation.
+    // HunyuanVL uses explicit scale factors (target+0.1)/n_grid to match Python's behavior.
+    // HunyuanOCR uses the same square layout and the standard ratio-based interpolation.
+    ggml_tensor * pos_embd = nullptr;
+    if (proj_type == PROJECTOR_TYPE_HUNYUANVL && model.position_embeddings) {
+        const int64_t n_pos  = model.position_embeddings->ne[1]; // n_grid * n_grid
+        const int     n_grid = (int)std::round(std::sqrt((double)n_pos));
+        ggml_tensor * pos_patch = model.position_embeddings;
+        if (ph == n_grid && pw == n_grid) {
+            pos_embd = pos_patch; // no interpolation needed
+        } else {
+            pos_patch = ggml_reshape_3d(ctx0, pos_patch, n_embd, n_grid, n_grid);
+            pos_patch = ggml_permute(ctx0, pos_patch, 2, 0, 1, 3);
+            pos_patch = ggml_cont(ctx0, pos_patch);
+            pos_patch = ggml_interpolate_sf(ctx0, pos_patch, pw, ph, n_embd, 1,
+                                            GGML_SCALE_MODE_BILINEAR,
+                                            (float)(pw + 0.1f) / n_grid,
+                                            (float)(ph + 0.1f) / n_grid);
+            pos_patch = ggml_permute(ctx0, pos_patch, 1, 2, 0, 3);
+            pos_embd  = ggml_cont_2d(ctx0, pos_patch, n_embd, ph * pw);
+        }
+    } else {
+        pos_embd = resize_position_embeddings(GGML_SCALE_MODE_BILINEAR);
+    }
 
     ggml_tensor * inp = build_inp();
     ggml_tensor * cur = build_vit(inp, n_patches, NORM_TYPE_NORMAL, hparams.ffn_op, pos_embd, nullptr);
diff --git a/tools/mtmd/mtmd-helper.cpp b/tools/mtmd/mtmd-helper.cpp
index 145b88cea44..90ccb545952 100644
--- a/tools/mtmd/mtmd-helper.cpp
+++ b/tools/mtmd/mtmd-helper.cpp
@@ -180,6 +180,30 @@ struct decode_embd_batch {
         }
     }
 
+    // XD-RoPE positions for one image chunk (HunyuanVL); rel_pos holds one entry per token.
+    // The t section is always offset by pos_0. The x/y/z sections are offset by pos_0 only
+    // for the first and last token of the chunk; all tokens in between keep rel_pos as-is.
+    void set_position_xdrope_2d(llama_pos pos_0, const std::vector<mtmd_decoder_pos> & rel_pos, llama_seq_id seq_id) {
+        GGML_ASSERT(n_pos_per_embd == 4);
+        GGML_ASSERT(!rel_pos.empty() && (int32_t)rel_pos.size() == batch.n_tokens);
+        seq_id_0[0] = seq_id;
+        const int32_t n    = batch.n_tokens;
+        const int32_t last = n - 1;
+        for (int32_t k = 0; k < n; k++) {
+            const mtmd_decoder_pos & rp = rel_pos[k];
+            // pos is laid out as four consecutive sections of n entries: t, x, y, z
+            const llama_pos base = (k == 0 || k == last) ? pos_0 : 0;
+            pos[k        ] = pos_0 + rp.t;
+            pos[k + n    ] = base  + rp.x;
+            pos[k + n * 2] = base  + rp.y;
+            pos[k + n * 3] = base  + rp.z;
+            // every token is assigned to the single sequence seq_id and requests no logits
+            batch.n_seq_id[k] = 1;
+            batch.seq_id  [k] = seq_id_0.data();
+            batch.logits  [k] = false;
+        }
+    }
+
     // M-RoPE for audio
     void set_position_mrope_1d(llama_pos pos_0, llama_seq_id seq_id) {
         GGML_ASSERT(n_pos_per_embd == 4);
@@ -252,7 +276,7 @@ int32_t mtmd_helper_decode_image_chunk(
 
     const llama_model * model = llama_get_model(lctx);
     int n_mmproj_embd = llama_model_n_embd_inp(model);
-    int n_pos_per_embd = mtmd_decode_use_mrope(ctx) ? 4 : 1;
+    int n_pos_per_embd = (mtmd_decode_use_mrope(ctx) || mtmd_decode_use_xdrope(ctx)) ? 4 : 1;
 
     int32_t n_tokens = mtmd_input_chunk_get_n_tokens(chunk);
     int32_t i_batch = 0;
@@ -275,6 +299,20 @@ int32_t mtmd_helper_decode_image_chunk(
         } else {
             GGML_ABORT("invalid chunk type for M-RoPE");
         }
+    } else if (mtmd_decode_use_xdrope(ctx)) { // XD-RoPE: any chunk type other than image aborts below
+        if (chunk_type == MTMD_INPUT_CHUNK_TYPE_IMAGE) {
+            const auto image_tokens = mtmd_input_chunk_get_tokens_image(chunk);
+            if (!image_tokens) {
+                LOG_ERR("failed to decode chunk: image tokens are null\n");
+                return -1;
+            }
+            const auto n_img_tokens = mtmd_image_tokens_get_n_tokens(image_tokens);
+            std::vector<mtmd_decoder_pos> rel_pos(n_img_tokens);
+            mtmd_helper_image_get_decoder_pos(image_tokens, rel_pos.data());
+            batch_embd.set_position_xdrope_2d(n_past, rel_pos, seq_id);
+        } else {
+            GGML_ABORT("invalid chunk type for XD-RoPE");
+        }
     } else {
         batch_embd.set_position_normal(n_past, seq_id);
     }
diff --git a/tools/mtmd/mtmd.cpp b/tools/mtmd/mtmd.cpp
index d0a0a4865ef..c736d7c5506 100644
--- a/tools/mtmd/mtmd.cpp
+++ b/tools/mtmd/mtmd.cpp
@@ -37,7 +37,12 @@ struct mtmd_image_tokens {
     uint32_t nx; // number of tokens in x direction
     uint32_t ny; // number of tokens in y direction
     bool use_mrope_pos = false; // use M-RoPE position counting (the whole image is 1 temporal position)
-    uint32_t n_tokens() const { return nx * ny; }
+    bool use_xdrope_pos = false; // use XD-RoPE position counting
+    uint32_t n_boi = 0; // number of BOI tokens, for xdrope
+    uint32_t n_eoi = 0; // number of EOI tokens, for xdrope
+    uint32_t n_newline = 0; // number of image newline tokens, for xdrope
+    uint32_t image_idx = 0; // image index, for xdrope
+    uint32_t n_tokens() const { return nx * ny + n_newline + n_boi + n_eoi; }
     clip_image_f32_batch batch_f32; // preprocessed image patches
     std::string id; // optional user-defined ID, useful for KV cache tracking
 
@@ -46,6 +51,11 @@ struct mtmd_image_tokens {
             nx,
             ny,
             use_mrope_pos,
+            use_xdrope_pos,
+            n_boi,
+            n_eoi,
+            n_newline,
+            image_idx,
             batch_f32.clone(),
             id
         };
@@ -430,6 +440,7 @@ struct mtmd_context {
                     image_preproc = std::make_unique<mtmd_image_preprocessor_deepseekocr>(ctx_v);
                 } break;
             case PROJECTOR_TYPE_HUNYUANOCR:
+            case PROJECTOR_TYPE_HUNYUANVL:
                 {
                     // note: these use fullwidth ｜ (U+FF5C) and ▁ (U+2581) to match the tokenizer vocabulary
                     img_beg = "<｜hy_place▁holder▁no▁100｜>";
@@ -778,6 +789,16 @@ struct mtmd_tokenizer {
                     image_tokens->nx = clip_n_output_tokens_x(ctx->ctx_v, batch_f32.entries[0].get());
                     image_tokens->ny = clip_n_output_tokens_y(ctx->ctx_v, batch_f32.entries[0].get());
                     image_tokens->use_mrope_pos = true;
+                } else if(mtmd_decode_use_xdrope(ctx)) {
+                    // XD-RoPE models (e.g. HunyuanVL) wrap the patch grid with extra tokens:
+                    // 1 BOI + ny rows × (nx tokens + 1 newline) + 1 EOI
+                    image_tokens->nx = clip_n_output_tokens_x(ctx->ctx_v, batch_f32.entries[0].get());
+                    image_tokens->ny = clip_n_output_tokens_y(ctx->ctx_v, batch_f32.entries[0].get());
+                    image_tokens->n_boi = 1;
+                    image_tokens->n_eoi = 1;
+                    image_tokens->n_newline = image_tokens->ny;
+                    image_tokens->image_idx = 0;
+                    image_tokens->use_xdrope_pos = true;
                 } else {
                     // other models, we only need the total number of tokens
                     image_tokens->nx = n_tokens;
@@ -1045,6 +1066,15 @@ bool mtmd_decode_use_mrope(mtmd_context * ctx) {
     }
 }
 
+// Tell callers whether decoder positions must follow the XD-RoPE layout.
+//
+// The decision depends solely on the vision projector type: HunyuanVL is
+// the only projector that answers true; every other type answers false.
+bool mtmd_decode_use_xdrope(mtmd_context * ctx) {
+    const bool is_hunyuanvl = ctx->proj_type_v() == PROJECTOR_TYPE_HUNYUANVL;
+    return is_hunyuanvl;
+}
+
 bool mtmd_support_vision(mtmd_context * ctx) {
     return ctx->ctx_v != nullptr;
 }
@@ -1248,9 +1278,39 @@ size_t mtmd_image_tokens_get_ny(const mtmd_image_tokens * image_tokens) {
 
 mtmd_decoder_pos mtmd_image_tokens_get_decoder_pos(const mtmd_image_tokens * image_tokens, size_t i) {
     mtmd_decoder_pos pos;
-    pos.t = 0;
-    pos.x = i % image_tokens->nx;
-    pos.y = i / image_tokens->nx;
+    const uint32_t nx = image_tokens->nx;
+
+    if (!image_tokens->use_xdrope_pos) {
+        // standard 2D grid (Qwen2VL, etc.): i is a row-major index into nx columns,
+        // t and z are not used and stay 0
+        pos.t = 0;
+        pos.x = i % nx;
+        pos.y = i / nx;
+        pos.z = 0;
+        return pos;
+    }
+
+    // XD-RoPE (HunyuanVL) token layout, n_total = 2 + ny * (nx + 1):
+    //   [BOI] [token(0,0)...token(nx-1,0)] [newline(0)] ...
+    //         [token(0,ny-1)...token(nx-1,ny-1)] [newline(ny-1)] [EOI]
+    const uint32_t i_last = image_tokens->n_tokens() - 1;
+
+    // t carries the sequential index for every kind of token
+    pos.t = i;
+    if (i == 0 || i == i_last) {
+        // BOI / EOI: the other three dims repeat the sequential index
+        pos.x = i;
+        pos.y = i;
+        pos.z = i;
+    } else {
+        // patch token or row newline: skip BOI, then split into (row, col) over
+        // rows of nx + 1 slots; the newline sits in the last slot, i.e. col == nx
+        const uint32_t stride = nx + 1;
+        const uint32_t slot   = (uint32_t) i - 1;
+        pos.x = slot % stride;
+        pos.y = slot / stride;
+        pos.z = image_tokens->image_idx;
+    }
     return pos;
 }
 
@@ -1264,6 +1324,11 @@ llama_pos mtmd_image_tokens_get_n_pos(const mtmd_image_tokens * image_tokens) {
         // t is omitted as we don't support video input
         return std::max(image_tokens->nx, image_tokens->ny);
     }
+    if (image_tokens->use_xdrope_pos) {
+        // HunyuanVL: the sequential (dim-0) position advances by the full token count
+        // (includes BOI/EOI and row newline tokens), not by max(nx, ny)
+        return image_tokens->n_tokens();
+    }
     return image_tokens->n_tokens();
 }
 
diff --git a/tools/mtmd/mtmd.h b/tools/mtmd/mtmd.h
index a6fd8efa5d0..dbba9d59278 100644
--- a/tools/mtmd/mtmd.h
+++ b/tools/mtmd/mtmd.h
@@ -117,6 +117,10 @@ MTMD_API bool mtmd_decode_use_non_causal(mtmd_context * ctx, const mtmd_input_ch
 // whether the current model use M-RoPE for llama_decode
 MTMD_API bool mtmd_decode_use_mrope(mtmd_context * ctx);
 
+// whether the current model uses XD-RoPE: HunyuanVL-style M-RoPE
+// (token layout differs from standard 2D grid: BOI + rows-with-newlines + EOI)
+MTMD_API bool mtmd_decode_use_xdrope(mtmd_context * ctx);
+
 // whether the current model supports vision input
 MTMD_API bool mtmd_support_vision(mtmd_context * ctx);
 
@@ -196,10 +200,11 @@ struct mtmd_decoder_pos {
     uint32_t t;
     uint32_t x;
     uint32_t y;
+    uint32_t z;
 };
 // get position for decoder attention, to be used by M-RoPE models
 // i is the index of the embedding token, ranging from 0 to mtmd_image_tokens_get_n_tokens() - 1
-// return relative position (for example, embedding 0 will have position (0, 0, 0); remember to adjust it to the current absolute position)
+// return relative position (for example, embedding 0 will have position (0, 0, 0, 0); remember to adjust it to the current absolute position)
 MTMD_API struct mtmd_decoder_pos mtmd_image_tokens_get_decoder_pos(const mtmd_image_tokens * image_tokens, size_t i);
 
 // tokenize an input text prompt and a list of bitmaps (images/audio)
diff --git a/tools/mtmd/tests.sh b/tools/mtmd/tests.sh
index 5da48d61bfd..83416fb272b 100755
--- a/tools/mtmd/tests.sh
+++ b/tools/mtmd/tests.sh
@@ -91,6 +91,7 @@ add_test_vision "ggml-org/LightOnOCR-1B-1025-GGUF:Q8_0"
 add_test_vision "ggml-org/DeepSeek-OCR-GGUF:Q8_0" -p "Free OCR." --chat-template deepseek-ocr
 add_test_vision "ggml-org/dots.ocr-GGUF:Q8_0" -p "OCR"
 add_test_vision "ggml-org/HunyuanOCR-GGUF:Q8_0" -p "OCR"
+add_test_vision "ggml-org/HunyuanVL-4B-GGUF:Q8_0"
 add_test_vision "ggml-org/gemma-4-E2B-it-GGUF:Q8_0" --jinja
 
 add_test_audio  "ggml-org/ultravox-v0_5-llama-3_2-1b-GGUF:Q8_0"