ggml-org · ManaEstras · Apr 17, 2026 · Apr 18, 2026 · Apr 18, 2026 · Apr 18, 2026
@@ -11847,7 +11847,7 @@ def prepare_tensors(self):
                 raise ValueError(f"Unprocessed experts: {experts}")
 
 
-@ModelBase.register("HunYuanDenseV1ForCausalLM", "HunYuanVLForConditionalGeneration")
+@ModelBase.register("HunYuanDenseV1ForCausalLM")
 class HunYuanModel(TextModel):
     model_arch = gguf.MODEL_ARCH.HUNYUAN_DENSE
 
@@ -12020,6 +12020,84 @@ def tensor_force_quant(self, name, new_name, bid, n_dims):
         return super().tensor_force_quant(name, new_name, bid, n_dims)
 
 
+@ModelBase.register("HunYuanVLForConditionalGeneration")
+class HunyuanVLVisionModel(MmprojModel):  # vision/projector (mmproj) side of a HunYuan-VL checkpoint
+    def __init__(self, *args, **kwargs):
+        super().__init__(*args, **kwargs)
+        assert self.hparams_vision is not None  # a vision config is required from here on
+        # Fall back to max_image_size (or 2048 if that is absent too) when the vision config has no image_size
+        if "image_size" not in self.hparams_vision:
+            self.hparams_vision["image_size"] = self.hparams_vision.get("max_image_size", 2048)
+
+    def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
+        # Skip text-model tensors: yield nothing for them (they go into the LLM gguf file)
+        if name.startswith("model."):
+            return  # bare return inside a generator -> no tensors emitted for this name
+
+        if name.startswith("vit."):
+            if "position_embedding" in name:
+                data_torch = data_torch[1:]  # drop row 0: [16385, n_embd] -> [16384, n_embd] (row 0 presumably a non-patch slot; TODO confirm)
+            yield from super().modify_tensors(data_torch, name, bid)
+            return
+
+        # Fallback: any tensor that is neither "model.*" nor "vit.*" goes through the base-class mapping unchanged
+        yield from super().modify_tensors(data_torch, name, bid)
+
+    def tensor_force_quant(self, name: str, new_name: str, bid: int | None, n_dims: int):
+        # Keep the final linear projection (mm.mlp.weight) in F16 to preserve precision, whatever self.ftype is
+        if new_name == "mm.mlp.weight":
+            return gguf.GGMLQuantizationType.F16
+        if ("mm.proj." in new_name) and new_name.endswith(".weight"):  # mm.proj.* weights: F16 only for an F16 output file, otherwise F32
+            return gguf.GGMLQuantizationType.F16 if self.ftype == gguf.LlamaFileType.MOSTLY_F16 else gguf.GGMLQuantizationType.F32
+        return super().tensor_force_quant(name, new_name, bid, n_dims)  # everything else: defer to the base-class policy
+
+    def set_gguf_parameters(self):
+        super().set_gguf_parameters()
+        assert self.hparams_vision is not None
+        hparams = self.hparams_vision
+
+        self.gguf_writer.add_clip_projector_type(gguf.VisionProjectorType.HUNYUANVL)  # projector id string "hunyuanvl_merger"
+        self.gguf_writer.add_vision_use_gelu(True)
+
+        if (rms_norm_eps := hparams.get("rms_norm_eps")) is not None:  # optional keys: written only when present in the vision config
+            self.gguf_writer.add_vision_attention_layernorm_eps(rms_norm_eps)
+        if (merge_size := hparams.get("spatial_merge_size")) is not None:
+            self.gguf_writer.add_vision_spatial_merge_size(int(merge_size))
+
+
+@ModelBase.register("HunYuanVLForConditionalGeneration")
+class HunyuanVLTextModel(HunYuanModel):  # text (LLM) side of a HunYuan-VL checkpoint; builds on the HunYuanModel conversion
+    model_arch = gguf.MODEL_ARCH.HUNYUAN_VL
+
+    def set_gguf_parameters(self):
+        super().set_gguf_parameters()
+
+        if self.rope_parameters.get("rope_type") == "xdrope":  # everything below is written only for xdrope checkpoints
+            alpha = float(self.rope_parameters.get("alpha", 50))  # 50 is used when the config omits alpha
+            base  = float(self.rope_parameters.get("rope_theta", 10000.0))
+
+            # Write raw (unscaled) base and alpha; the C++ loader computes: freq_base = base * alpha^(dim/(dim-2))
+            self.gguf_writer.add_rope_freq_base(base)
+            self.gguf_writer.add_rope_scaling_alpha(alpha)
+            self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.NONE)
+            self.gguf_writer.add_rope_scaling_factor(1)
+            self.gguf_writer.add_rope_scaling_orig_ctx_len(256 * 1024)  # hard-coded 256K tokens, not read from the config
+            self.gguf_writer.add_context_length(256 * 1024)  # same hard-coded 256K value
+
+            # xdrope_section defines which head-dim slices use each positional axis
+            # Reuse the M-RoPE rope_dimension_sections mechanism (a list of exactly 4 ints)
+            xdrope_section = list(self.rope_parameters.get("xdrope_section", []))
+            while len(xdrope_section) < 4:  # zero-pad short (or missing) lists up to 4 entries
+                xdrope_section.append(0)
+            self.gguf_writer.add_rope_dimension_sections(xdrope_section[:4])  # entries beyond the 4th are dropped
+
+    def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
+        # Skip vision tensors (yield nothing) — they are written by HunyuanVLVisionModel
+        if name.startswith("vit."):
+            return
+        yield from super().modify_tensors(data_torch, name, bid)
+
+
 @ModelBase.register("SmolLM3ForCausalLM")
 class SmolLM3Model(LlamaModel):
     model_arch = gguf.MODEL_ARCH.SMOLLM3

@@ -2222,6 +2222,7 @@ extern "C" {
     enum ggml_scale_flag {
         GGML_SCALE_FLAG_ALIGN_CORNERS = (1 << 8),
         GGML_SCALE_FLAG_ANTIALIAS     = (1 << 9),
+        GGML_SCALE_FLAG_CUSTOM_SF     = (1 << 10), // use explicit scale factors stored in op_params[1], op_params[2]
     };
 
     // interpolate
@@ -2255,6 +2256,22 @@ extern "C" {
             int64_t               ne3,
             uint32_t              mode); // ggml_scale_mode [ | ggml_scale_flag...]
 
+    // Like ggml_interpolate but with explicit scale factors sf0 and sf1 for the first two
+    // dimensions instead of deriving them from the ratios ne0/src.ne0 and ne1/src.ne1.
+    // Useful when the desired coordinate mapping differs from the simple ratio
+    // (e.g. PyTorch scale_factor=(H+0.1)/n_grid instead of H/n_grid).
+    // Sets GGML_SCALE_FLAG_CUSTOM_SF internally; sf0 corresponds to dim0, sf1 to dim1; both must be > 0 (asserted).
+    GGML_API struct ggml_tensor * ggml_interpolate_sf(
+            struct ggml_context * ctx,
+            struct ggml_tensor  * a,
+            int64_t               ne0,
+            int64_t               ne1,
+            int64_t               ne2,
+            int64_t               ne3,
+            uint32_t              mode, // ggml_scale_mode [ | ggml_scale_flag...]
+            float                 sf0,
+            float                 sf1);
+
     // pad each dimension with zeros: [x, ..., x] -> [x, ..., x, 0, ..., 0]
     GGML_API struct ggml_tensor * ggml_pad(
             struct ggml_context * ctx,

@@ -7551,6 +7551,11 @@ static void ggml_compute_forward_upscale_f32(
     const int32_t mode_flags = ggml_get_op_params_i32(dst, 0);
     const ggml_scale_mode mode = (ggml_scale_mode) (mode_flags & 0xFF);
 
+    if (mode_flags & GGML_SCALE_FLAG_CUSTOM_SF) {
+        sf0 = ggml_get_op_params_f32(dst, 1);
+        sf1 = ggml_get_op_params_f32(dst, 2);
+    }
+
     if (mode_flags & GGML_SCALE_FLAG_ALIGN_CORNERS) {
         pixel_offset = 0.0f;
         sf0 = ne0 > 1 && ne00 > 1 ? (float)(ne0 - 1) / (ne00 - 1) : sf0;
@@ -7559,13 +7564,13 @@ static void ggml_compute_forward_upscale_f32(
 
     if (mode == GGML_SCALE_MODE_NEAREST) {
         for (int64_t i3 = 0; i3 < ne3; i3++) {
-            const int64_t i03 = i3 / sf3;
+            const int64_t i03 = MIN((int64_t)(i3 / sf3), ne03 - 1);
             for (int64_t i2 = ith; i2 < ne2; i2 += nth) {
-                const int64_t i02 = i2 / sf2;
+                const int64_t i02 = MIN((int64_t)(i2 / sf2), ne02 - 1);
                 for (int64_t i1 = 0; i1 < ne1; i1++) {
-                    const int64_t i01 = i1 / sf1;
+                    const int64_t i01 = MIN((int64_t)(i1 / sf1), ne01 - 1);
                     for (int64_t i0 = 0; i0 < ne0; i0++) {
-                        const int64_t i00 = i0 / sf0;
+                        const int64_t i00 = MIN((int64_t)(i0 / sf0), ne00 - 1);
 
                         const float * x = (float *)((char *) src0->data + i00*nb00 + i01*nb01 + i02*nb02 + i03*nb03);
                               float * y = (float *)((char *)  dst->data +  i0*nb0  +  i1*nb1  +  i2*nb2  +  i3*nb3);

@@ -5024,6 +5024,25 @@ struct ggml_tensor * ggml_interpolate(
     return ggml_interpolate_impl(ctx, a, ne0, ne1, ne2, ne3, mode);
 }
 
+struct ggml_tensor * ggml_interpolate_sf( // ggml_interpolate with caller-supplied scale factors for dim 0 and dim 1
+        struct ggml_context * ctx,
+        struct ggml_tensor  * a,
+        int64_t               ne0,
+        int64_t               ne1,
+        int64_t               ne2,
+        int64_t               ne3,
+        uint32_t              mode,
+        float                 sf0,
+        float                 sf1) {
+    GGML_ASSERT(sf0 > 0.0f && "ggml_interpolate_sf: sf0 must be positive"); // the CPU kernel divides indices by the scale factor
+    GGML_ASSERT(sf1 > 0.0f && "ggml_interpolate_sf: sf1 must be positive");
+    struct ggml_tensor * result = ggml_interpolate_impl(ctx, a, ne0, ne1, ne2, ne3,
+                                                        mode | GGML_SCALE_FLAG_CUSTOM_SF); // flag makes the compute kernel read sf0/sf1 from op_params
+    ggml_set_op_params_f32(result, 1, sf0); // op_params[1] = sf0 (op_params[0] carries the mode/flags word read by the kernel)
+    ggml_set_op_params_f32(result, 2, sf1); // op_params[2] = sf1
+    return result;
+}
+
 // ggml_pad
 
 struct ggml_tensor * ggml_pad(

@@ -197,6 +197,7 @@ class Rope:
         FREQ_BASE_SWA             = "{arch}.rope.freq_base_swa"
         SCALING_TYPE              = "{arch}.rope.scaling.type"
         SCALING_FACTOR            = "{arch}.rope.scaling.factor"
+        SCALING_ALPHA             = "{arch}.rope.scaling.alpha"
         SCALING_ATTN_FACTOR       = "{arch}.rope.scaling.attn_factor"
         SCALING_ORIG_CTX_LEN      = "{arch}.rope.scaling.original_context_length"
         SCALING_FINETUNED         = "{arch}.rope.scaling.finetuned"
@@ -471,6 +472,7 @@ class MODEL_ARCH(IntEnum):
     ERNIE4_5_MOE     = auto()
     HUNYUAN_MOE      = auto()
     HUNYUAN_DENSE    = auto()
+    HUNYUAN_VL       = auto()
     SMOLLM3          = auto()
     GPT_OSS          = auto()
     LFM2             = auto()
@@ -957,6 +959,7 @@ class MODEL_TENSOR(IntEnum):
     MODEL_ARCH.FALCON_H1:        "falcon-h1",
     MODEL_ARCH.HUNYUAN_MOE:      "hunyuan-moe",
     MODEL_ARCH.HUNYUAN_DENSE:    "hunyuan-dense",
+    MODEL_ARCH.HUNYUAN_VL:       "hunyuan_vl",
     MODEL_ARCH.SMOLLM3:          "smollm3",
     MODEL_ARCH.GPT_OSS:          "gpt-oss",
     MODEL_ARCH.LFM2:             "lfm2",
@@ -3489,6 +3492,22 @@ class MODEL_TENSOR(IntEnum):
         MODEL_TENSOR.FFN_DOWN,
         MODEL_TENSOR.FFN_UP,
     ],
+    MODEL_ARCH.HUNYUAN_VL: [
+        MODEL_TENSOR.TOKEN_EMBD,
+        MODEL_TENSOR.OUTPUT_NORM,
+        MODEL_TENSOR.OUTPUT,
+        MODEL_TENSOR.ATTN_NORM,
+        MODEL_TENSOR.ATTN_Q,
+        MODEL_TENSOR.ATTN_Q_NORM,
+        MODEL_TENSOR.ATTN_K,
+        MODEL_TENSOR.ATTN_K_NORM,
+        MODEL_TENSOR.ATTN_V,
+        MODEL_TENSOR.ATTN_OUT,
+        MODEL_TENSOR.FFN_NORM,
+        MODEL_TENSOR.FFN_GATE,
+        MODEL_TENSOR.FFN_DOWN,
+        MODEL_TENSOR.FFN_UP,
+    ],
     MODEL_ARCH.SMOLLM3: [
         MODEL_TENSOR.TOKEN_EMBD,
         MODEL_TENSOR.OUTPUT_NORM,
@@ -4138,6 +4157,7 @@ class VisionProjectorType:
     YOUTUVL = "youtuvl"
     NEMOTRON_V2_VL = "nemotron_v2_vl"
     HUNYUANOCR     = "hunyuanocr"
+    HUNYUANVL      = "hunyuanvl_merger"
 
 
 # Items here are (block size, type size)

@@ -973,6 +973,9 @@ def add_rope_scaling_type(self, value: RopeScalingType) -> None:
     def add_rope_scaling_factor(self, value: float) -> None:
         self.add_float32(Keys.Rope.SCALING_FACTOR.format(arch=self.arch), value)
 
+    def add_rope_scaling_alpha(self, value: float) -> None:  # writes the "{arch}.rope.scaling.alpha" key as float32
+        self.add_float32(Keys.Rope.SCALING_ALPHA.format(arch=self.arch), value)
+
     def add_rope_scaling_attn_factors(self, value: float) -> None:
         self.add_float32(Keys.Rope.SCALING_ATTN_FACTOR.format(arch=self.arch), value)
 

@@ -109,6 +109,7 @@ static const std::map<llm_arch, const char *> LLM_ARCH_NAMES = {
     { LLM_ARCH_ERNIE4_5_MOE,     "ernie4_5-moe"     },
     { LLM_ARCH_HUNYUAN_MOE,      "hunyuan-moe"      },
     { LLM_ARCH_HUNYUAN_DENSE,    "hunyuan-dense"    },
+    { LLM_ARCH_HUNYUAN_VL,       "hunyuan_vl"       },
     { LLM_ARCH_SMOLLM3,          "smollm3"          },
     { LLM_ARCH_OPENAI_MOE,       "gpt-oss"          },
     { LLM_ARCH_LFM2,             "lfm2"             },
@@ -250,6 +251,7 @@ static const std::map<llm_kv, const char *> LLM_KV_NAMES = {
     { LLM_KV_ROPE_SCALE_LINEAR,              "%s.rope.scale_linear"                    },
     { LLM_KV_ROPE_SCALING_TYPE,              "%s.rope.scaling.type"                    },
     { LLM_KV_ROPE_SCALING_FACTOR,            "%s.rope.scaling.factor"                  },
+    { LLM_KV_ROPE_SCALING_ALPHA,             "%s.rope.scaling.alpha"                   },
     { LLM_KV_ROPE_SCALING_ATTN_FACTOR,       "%s.rope.scaling.attn_factor"             },
     { LLM_KV_ROPE_SCALING_ORIG_CTX_LEN,      "%s.rope.scaling.original_context_length" },
     { LLM_KV_ROPE_SCALING_FINETUNED,         "%s.rope.scaling.finetuned"               },

@@ -113,6 +113,7 @@ enum llm_arch {
     LLM_ARCH_ERNIE4_5_MOE,
     LLM_ARCH_HUNYUAN_MOE,
     LLM_ARCH_HUNYUAN_DENSE,
+    LLM_ARCH_HUNYUAN_VL,
     LLM_ARCH_SMOLLM3,
     LLM_ARCH_OPENAI_MOE,
     LLM_ARCH_LFM2,
@@ -254,6 +255,7 @@ enum llm_kv {
     LLM_KV_ROPE_SCALE_LINEAR,
     LLM_KV_ROPE_SCALING_TYPE,
     LLM_KV_ROPE_SCALING_FACTOR,
+    LLM_KV_ROPE_SCALING_ALPHA,
     LLM_KV_ROPE_SCALING_ATTN_FACTOR,
     LLM_KV_ROPE_SCALING_ORIG_CTX_LEN,
     LLM_KV_ROPE_SCALING_FINETUNED,

@@ -256,3 +256,7 @@ uint32_t llama_hparams::n_layer_kv() const {
 bool llama_hparams::use_mrope() const {
     return rope_sections[0] > 0 && rope_sections[1] > 0;
 }
+
+bool llama_hparams::use_xdrope() const { // true only when all four rope_sections entries are positive
+    return rope_sections[0] > 0 && rope_sections[1] > 0 && rope_sections[2] > 0 && rope_sections[3] > 0;
+}
@@ -116,6 +116,7 @@ struct llama_hparams {
     float    rope_freq_base_train_swa  = 10000.0f;
     float    rope_freq_scale_train;
     float    rope_freq_scale_train_swa = 1.0f;
+    float    rope_scaling_alpha        = 0.0f;  // NTK-aware alpha for XDRoPE
 
     uint32_t n_ctx_orig_yarn;
     float    rope_yarn_log_mul = 0.0f;
@@ -351,6 +352,8 @@ struct llama_hparams {
 
 
     bool use_mrope() const;
+
+    bool use_xdrope() const;
 };
 
 static_assert(std::is_trivially_copyable<llama_hparams>::value, "llama_hparams must be trivially copyable");
@@ -722,6 +722,13 @@ void llama_model::load_hparams(llama_model_loader & ml) {
     ml.get_key(LLM_KV_EXPERT_GROUP_COUNT,      hparams.n_expert_groups, false);
     ml.get_key(LLM_KV_EXPERT_GROUP_USED_COUNT, hparams.n_group_used,    false);
 
+    if (arch == LLM_ARCH_HUNYUAN_VL || arch == LLM_ARCH_HUNYUAN_DENSE) {
+        if (hparams.n_expert <= 1) {
+            hparams.n_expert      = 0;
+            hparams.n_expert_used = 0;
+        }
+    }
+
     if (arch == LLM_ARCH_WAVTOKENIZER_DEC) {
         ml.get_key(LLM_KV_FEATURES_LENGTH,  hparams.n_embd);
         ml.get_key(LLM_KV_EMBEDDING_LENGTH, hparams.n_embd_out_impl);
@@ -800,6 +807,7 @@ void llama_model::load_hparams(llama_model_loader & ml) {
     hparams.rope_freq_scale_train = ropescale == 0.0f ? 1.0f : 1.0f/ropescale;
 
     ml.get_key(LLM_KV_ROPE_SCALING_ATTN_FACTOR, hparams.rope_attn_factor, false);
+    ml.get_key(LLM_KV_ROPE_SCALING_ALPHA,       hparams.rope_scaling_alpha, false);
 
     // non-transformer models do not have attention heads
     if (hparams.n_head() > 0) {
@@ -2575,9 +2583,23 @@ void llama_model::load_hparams(llama_model_loader & ml) {
                     default: type = LLM_TYPE_UNKNOWN;
                 }
             } break;
+        case LLM_ARCH_HUNYUAN_VL:
         case LLM_ARCH_HUNYUAN_DENSE:
             {
                 ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
+                ml.get_key_or_arr(LLM_KV_ROPE_DIMENSION_SECTIONS, hparams.rope_sections, 4, false);
+
+                // XDRoPE / NTK-aware scaling: base = rope_theta * alpha^(dim / (dim - 2))
+                if (hparams.rope_scaling_alpha > 0.0f) {
+                    const int dim = hparams.n_embd_head_k();
+                    hparams.rope_freq_base_train = hparams.rope_freq_base_train
+                        * powf(hparams.rope_scaling_alpha, (float)dim / (float)(dim - 2));
+                }
+
+                if (hparams.n_expert <= 1) {
+                    hparams.n_expert      = 0;
+                    hparams.n_expert_used = 0;
+                }
 
                 switch (hparams.n_embd) {
                     case 1024: type = LLM_TYPE_0_5B; break;
@@ -6938,6 +6960,7 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
                         layer.ffn_down_shexp = create_tensor(tn(LLM_TENSOR_FFN_DOWN_SHEXP, "weight", i), {n_ff_shexp, n_embd}, 0);
                     }
                 } break;
+            case LLM_ARCH_HUNYUAN_VL:
             case LLM_ARCH_HUNYUAN_DENSE:
                 {
                     tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
@@ -8958,6 +8981,7 @@ ggml_cgraph * llama_model::build_graph(const llm_graph_params & params) const {
             {
                 llm = std::make_unique<llm_build_hunyuan_moe>(*this, params);
             } break;
+        case LLM_ARCH_HUNYUAN_VL:
         case LLM_ARCH_HUNYUAN_DENSE:
             {
                 llm = std::make_unique<llm_build_hunyuan_dense>(*this, params);
@@ -9307,6 +9331,9 @@ llama_rope_type llama_model_rope_type(const llama_model * model) {
         case LLM_ARCH_GLM4_MOE:
             return model->hparams.use_mrope() ? LLAMA_ROPE_TYPE_MROPE : LLAMA_ROPE_TYPE_NEOX;
 
+        case LLM_ARCH_HUNYUAN_VL:
+            return model->hparams.use_mrope() ? LLAMA_ROPE_TYPE_MROPE : LLAMA_ROPE_TYPE_NEOX;
+
         // all model arches should be listed explicitly here
         case LLM_ARCH_UNKNOWN:
             GGML_ABORT("unknown architecture");