diff --git a/convert_hf_to_gguf.py b/convert_hf_to_gguf.py index 42d559dfecf..1ef3262afb0 100755 --- a/convert_hf_to_gguf.py +++ b/convert_hf_to_gguf.py @@ -11847,7 +11847,7 @@ def prepare_tensors(self): raise ValueError(f"Unprocessed experts: {experts}") -@ModelBase.register("HunYuanDenseV1ForCausalLM", "HunYuanVLForConditionalGeneration") +@ModelBase.register("HunYuanDenseV1ForCausalLM") class HunYuanModel(TextModel): model_arch = gguf.MODEL_ARCH.HUNYUAN_DENSE @@ -12020,6 +12020,84 @@ def tensor_force_quant(self, name, new_name, bid, n_dims): return super().tensor_force_quant(name, new_name, bid, n_dims) +@ModelBase.register("HunYuanVLForConditionalGeneration") +class HunyuanVLVisionModel(MmprojModel): + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + assert self.hparams_vision is not None + # Compute image_size from max_image_size if not explicitly set + if "image_size" not in self.hparams_vision: + self.hparams_vision["image_size"] = self.hparams_vision.get("max_image_size", 2048) + + def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: + # Skip text-model tensors (they go into the LLM gguf file) + if name.startswith("model."): + return + + if name.startswith("vit."): + if "position_embedding" in name: + data_torch = data_torch[1:] # [16385, n_embd] -> [16384, n_embd] + yield from super().modify_tensors(data_torch, name, bid) + return + + # Fallback for any remaining tensors + yield from super().modify_tensors(data_torch, name, bid) + + def tensor_force_quant(self, name: str, new_name: str, bid: int | None, n_dims: int): + # Keep the final linear projection (mm.mlp.weight) in F16 to preserve precision + if new_name == "mm.mlp.weight": + return gguf.GGMLQuantizationType.F16 + if ("mm.proj." 
in new_name) and new_name.endswith(".weight"): + return gguf.GGMLQuantizationType.F16 if self.ftype == gguf.LlamaFileType.MOSTLY_F16 else gguf.GGMLQuantizationType.F32 + return super().tensor_force_quant(name, new_name, bid, n_dims) + + def set_gguf_parameters(self): + super().set_gguf_parameters() + assert self.hparams_vision is not None + hparams = self.hparams_vision + + self.gguf_writer.add_clip_projector_type(gguf.VisionProjectorType.HUNYUANVL) + self.gguf_writer.add_vision_use_gelu(True) + + if (rms_norm_eps := hparams.get("rms_norm_eps")) is not None: + self.gguf_writer.add_vision_attention_layernorm_eps(rms_norm_eps) + if (merge_size := hparams.get("spatial_merge_size")) is not None: + self.gguf_writer.add_vision_spatial_merge_size(int(merge_size)) + + +@ModelBase.register("HunYuanVLForConditionalGeneration") +class HunyuanVLTextModel(HunYuanModel): + model_arch = gguf.MODEL_ARCH.HUNYUAN_VL + + def set_gguf_parameters(self): + super().set_gguf_parameters() + + if self.rope_parameters.get("rope_type") == "xdrope": + alpha = float(self.rope_parameters.get("alpha", 50)) + base = float(self.rope_parameters.get("rope_theta", 10000.0)) + + # Write raw values; C++ computes: freq_base = base * alpha^(dim/(dim-2)) + self.gguf_writer.add_rope_freq_base(base) + self.gguf_writer.add_rope_scaling_alpha(alpha) + self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.NONE) + self.gguf_writer.add_rope_scaling_factor(1) + self.gguf_writer.add_rope_scaling_orig_ctx_len(256 * 1024) + self.gguf_writer.add_context_length(256 * 1024) + + # xdrope_section defines which head-dim slices use each positional axis + # Reuse the M-RoPE rope_dimension_sections mechanism + xdrope_section = list(self.rope_parameters.get("xdrope_section", [])) + while len(xdrope_section) < 4: + xdrope_section.append(0) + self.gguf_writer.add_rope_dimension_sections(xdrope_section[:4]) + + def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: + # Skip 
vision tensors — they are written by HunyuanVLVisionModel + if name.startswith("vit."): + return + yield from super().modify_tensors(data_torch, name, bid) + + @ModelBase.register("SmolLM3ForCausalLM") class SmolLM3Model(LlamaModel): model_arch = gguf.MODEL_ARCH.SMOLLM3 diff --git a/ggml/include/ggml.h b/ggml/include/ggml.h index 703e3783136..e6cac1cd5c9 100644 --- a/ggml/include/ggml.h +++ b/ggml/include/ggml.h @@ -2222,6 +2222,7 @@ extern "C" { enum ggml_scale_flag { GGML_SCALE_FLAG_ALIGN_CORNERS = (1 << 8), GGML_SCALE_FLAG_ANTIALIAS = (1 << 9), + GGML_SCALE_FLAG_CUSTOM_SF = (1 << 10), // use explicit scale factors stored in op_params[1], op_params[2] }; // interpolate @@ -2255,6 +2256,22 @@ extern "C" { int64_t ne3, uint32_t mode); // ggml_scale_mode [ | ggml_scale_flag...] + // Like ggml_interpolate but with explicit scale factors sf0 and sf1 for the first two + // dimensions instead of deriving them from ne0/ne1 / src.ne0/src.ne1. + // Useful when the desired coordinate mapping differs from the simple ratio + // (e.g. PyTorch scale_factor=(H+0.1)/n_grid instead of H/n_grid). + // Sets GGML_SCALE_FLAG_CUSTOM_SF internally; sf0 corresponds to dim0, sf1 to dim1. + GGML_API struct ggml_tensor * ggml_interpolate_sf( + struct ggml_context * ctx, + struct ggml_tensor * a, + int64_t ne0, + int64_t ne1, + int64_t ne2, + int64_t ne3, + uint32_t mode, // ggml_scale_mode [ | ggml_scale_flag...] 
+ float sf0, + float sf1); + // pad each dimension with zeros: [x, ..., x] -> [x, ..., x, 0, ..., 0] GGML_API struct ggml_tensor * ggml_pad( struct ggml_context * ctx, diff --git a/ggml/src/ggml-cpu/ops.cpp b/ggml/src/ggml-cpu/ops.cpp index a9bc21da6f0..09be07b8922 100644 --- a/ggml/src/ggml-cpu/ops.cpp +++ b/ggml/src/ggml-cpu/ops.cpp @@ -7551,6 +7551,11 @@ static void ggml_compute_forward_upscale_f32( const int32_t mode_flags = ggml_get_op_params_i32(dst, 0); const ggml_scale_mode mode = (ggml_scale_mode) (mode_flags & 0xFF); + if (mode_flags & GGML_SCALE_FLAG_CUSTOM_SF) { + sf0 = ggml_get_op_params_f32(dst, 1); + sf1 = ggml_get_op_params_f32(dst, 2); + } + if (mode_flags & GGML_SCALE_FLAG_ALIGN_CORNERS) { pixel_offset = 0.0f; sf0 = ne0 > 1 && ne00 > 1 ? (float)(ne0 - 1) / (ne00 - 1) : sf0; @@ -7559,13 +7564,13 @@ static void ggml_compute_forward_upscale_f32( if (mode == GGML_SCALE_MODE_NEAREST) { for (int64_t i3 = 0; i3 < ne3; i3++) { - const int64_t i03 = i3 / sf3; + const int64_t i03 = MIN((int64_t)(i3 / sf3), ne03 - 1); for (int64_t i2 = ith; i2 < ne2; i2 += nth) { - const int64_t i02 = i2 / sf2; + const int64_t i02 = MIN((int64_t)(i2 / sf2), ne02 - 1); for (int64_t i1 = 0; i1 < ne1; i1++) { - const int64_t i01 = i1 / sf1; + const int64_t i01 = MIN((int64_t)(i1 / sf1), ne01 - 1); for (int64_t i0 = 0; i0 < ne0; i0++) { - const int64_t i00 = i0 / sf0; + const int64_t i00 = MIN((int64_t)(i0 / sf0), ne00 - 1); const float * x = (float *)((char *) src0->data + i00*nb00 + i01*nb01 + i02*nb02 + i03*nb03); float * y = (float *)((char *) dst->data + i0*nb0 + i1*nb1 + i2*nb2 + i3*nb3); diff --git a/ggml/src/ggml.c b/ggml/src/ggml.c index eda041f4518..a13fda54832 100644 --- a/ggml/src/ggml.c +++ b/ggml/src/ggml.c @@ -5024,6 +5024,25 @@ struct ggml_tensor * ggml_interpolate( return ggml_interpolate_impl(ctx, a, ne0, ne1, ne2, ne3, mode); } +struct ggml_tensor * ggml_interpolate_sf( + struct ggml_context * ctx, + struct ggml_tensor * a, + int64_t ne0, + int64_t ne1, + 
int64_t ne2, + int64_t ne3, + uint32_t mode, + float sf0, + float sf1) { + GGML_ASSERT(sf0 > 0.0f && "ggml_interpolate_sf: sf0 must be positive"); + GGML_ASSERT(sf1 > 0.0f && "ggml_interpolate_sf: sf1 must be positive"); + struct ggml_tensor * result = ggml_interpolate_impl(ctx, a, ne0, ne1, ne2, ne3, + mode | GGML_SCALE_FLAG_CUSTOM_SF); + ggml_set_op_params_f32(result, 1, sf0); + ggml_set_op_params_f32(result, 2, sf1); + return result; +} + // ggml_pad struct ggml_tensor * ggml_pad( diff --git a/gguf-py/gguf/constants.py b/gguf-py/gguf/constants.py index c5297a2f440..4bee58c7e57 100644 --- a/gguf-py/gguf/constants.py +++ b/gguf-py/gguf/constants.py @@ -197,6 +197,7 @@ class Rope: FREQ_BASE_SWA = "{arch}.rope.freq_base_swa" SCALING_TYPE = "{arch}.rope.scaling.type" SCALING_FACTOR = "{arch}.rope.scaling.factor" + SCALING_ALPHA = "{arch}.rope.scaling.alpha" SCALING_ATTN_FACTOR = "{arch}.rope.scaling.attn_factor" SCALING_ORIG_CTX_LEN = "{arch}.rope.scaling.original_context_length" SCALING_FINETUNED = "{arch}.rope.scaling.finetuned" @@ -471,6 +472,7 @@ class MODEL_ARCH(IntEnum): ERNIE4_5_MOE = auto() HUNYUAN_MOE = auto() HUNYUAN_DENSE = auto() + HUNYUAN_VL = auto() SMOLLM3 = auto() GPT_OSS = auto() LFM2 = auto() @@ -957,6 +959,7 @@ class MODEL_TENSOR(IntEnum): MODEL_ARCH.FALCON_H1: "falcon-h1", MODEL_ARCH.HUNYUAN_MOE: "hunyuan-moe", MODEL_ARCH.HUNYUAN_DENSE: "hunyuan-dense", + MODEL_ARCH.HUNYUAN_VL: "hunyuan_vl", MODEL_ARCH.SMOLLM3: "smollm3", MODEL_ARCH.GPT_OSS: "gpt-oss", MODEL_ARCH.LFM2: "lfm2", @@ -3489,6 +3492,22 @@ class MODEL_TENSOR(IntEnum): MODEL_TENSOR.FFN_DOWN, MODEL_TENSOR.FFN_UP, ], + MODEL_ARCH.HUNYUAN_VL: [ + MODEL_TENSOR.TOKEN_EMBD, + MODEL_TENSOR.OUTPUT_NORM, + MODEL_TENSOR.OUTPUT, + MODEL_TENSOR.ATTN_NORM, + MODEL_TENSOR.ATTN_Q, + MODEL_TENSOR.ATTN_Q_NORM, + MODEL_TENSOR.ATTN_K, + MODEL_TENSOR.ATTN_K_NORM, + MODEL_TENSOR.ATTN_V, + MODEL_TENSOR.ATTN_OUT, + MODEL_TENSOR.FFN_NORM, + MODEL_TENSOR.FFN_GATE, + MODEL_TENSOR.FFN_DOWN, + MODEL_TENSOR.FFN_UP, + 
], MODEL_ARCH.SMOLLM3: [ MODEL_TENSOR.TOKEN_EMBD, MODEL_TENSOR.OUTPUT_NORM, @@ -4138,6 +4157,7 @@ class VisionProjectorType: YOUTUVL = "youtuvl" NEMOTRON_V2_VL = "nemotron_v2_vl" HUNYUANOCR = "hunyuanocr" + HUNYUANVL = "hunyuanvl_merger" # Items here are (block size, type size) diff --git a/gguf-py/gguf/gguf_writer.py b/gguf-py/gguf/gguf_writer.py index 90d500dc771..6a81ca37d8c 100644 --- a/gguf-py/gguf/gguf_writer.py +++ b/gguf-py/gguf/gguf_writer.py @@ -973,6 +973,9 @@ def add_rope_scaling_type(self, value: RopeScalingType) -> None: def add_rope_scaling_factor(self, value: float) -> None: self.add_float32(Keys.Rope.SCALING_FACTOR.format(arch=self.arch), value) + def add_rope_scaling_alpha(self, value: float) -> None: + self.add_float32(Keys.Rope.SCALING_ALPHA.format(arch=self.arch), value) + def add_rope_scaling_attn_factors(self, value: float) -> None: self.add_float32(Keys.Rope.SCALING_ATTN_FACTOR.format(arch=self.arch), value) diff --git a/src/llama-arch.cpp b/src/llama-arch.cpp index 6904b9c1a64..633a66fc665 100644 --- a/src/llama-arch.cpp +++ b/src/llama-arch.cpp @@ -109,6 +109,7 @@ static const std::map LLM_ARCH_NAMES = { { LLM_ARCH_ERNIE4_5_MOE, "ernie4_5-moe" }, { LLM_ARCH_HUNYUAN_MOE, "hunyuan-moe" }, { LLM_ARCH_HUNYUAN_DENSE, "hunyuan-dense" }, + { LLM_ARCH_HUNYUAN_VL, "hunyuan_vl" }, { LLM_ARCH_SMOLLM3, "smollm3" }, { LLM_ARCH_OPENAI_MOE, "gpt-oss" }, { LLM_ARCH_LFM2, "lfm2" }, @@ -250,6 +251,7 @@ static const std::map LLM_KV_NAMES = { { LLM_KV_ROPE_SCALE_LINEAR, "%s.rope.scale_linear" }, { LLM_KV_ROPE_SCALING_TYPE, "%s.rope.scaling.type" }, { LLM_KV_ROPE_SCALING_FACTOR, "%s.rope.scaling.factor" }, + { LLM_KV_ROPE_SCALING_ALPHA, "%s.rope.scaling.alpha" }, { LLM_KV_ROPE_SCALING_ATTN_FACTOR, "%s.rope.scaling.attn_factor" }, { LLM_KV_ROPE_SCALING_ORIG_CTX_LEN, "%s.rope.scaling.original_context_length" }, { LLM_KV_ROPE_SCALING_FINETUNED, "%s.rope.scaling.finetuned" }, diff --git a/src/llama-arch.h b/src/llama-arch.h index c4aabab7e0c..8f335f5c7b3 100644 
--- a/src/llama-arch.h +++ b/src/llama-arch.h @@ -113,6 +113,7 @@ enum llm_arch { LLM_ARCH_ERNIE4_5_MOE, LLM_ARCH_HUNYUAN_MOE, LLM_ARCH_HUNYUAN_DENSE, + LLM_ARCH_HUNYUAN_VL, LLM_ARCH_SMOLLM3, LLM_ARCH_OPENAI_MOE, LLM_ARCH_LFM2, @@ -254,6 +255,7 @@ enum llm_kv { LLM_KV_ROPE_SCALE_LINEAR, LLM_KV_ROPE_SCALING_TYPE, LLM_KV_ROPE_SCALING_FACTOR, + LLM_KV_ROPE_SCALING_ALPHA, LLM_KV_ROPE_SCALING_ATTN_FACTOR, LLM_KV_ROPE_SCALING_ORIG_CTX_LEN, LLM_KV_ROPE_SCALING_FINETUNED, diff --git a/src/llama-hparams.cpp b/src/llama-hparams.cpp index 002d15d415f..c50a647b426 100644 --- a/src/llama-hparams.cpp +++ b/src/llama-hparams.cpp @@ -256,3 +256,7 @@ uint32_t llama_hparams::n_layer_kv() const { bool llama_hparams::use_mrope() const { return rope_sections[0] > 0 && rope_sections[1] > 0; } + +bool llama_hparams::use_xdrope() const { + return rope_sections[0] > 0 && rope_sections[1] > 0 && rope_sections[2] > 0 && rope_sections[3] > 0; +} diff --git a/src/llama-hparams.h b/src/llama-hparams.h index c2000c77c37..27141310221 100644 --- a/src/llama-hparams.h +++ b/src/llama-hparams.h @@ -116,6 +116,7 @@ struct llama_hparams { float rope_freq_base_train_swa = 10000.0f; float rope_freq_scale_train; float rope_freq_scale_train_swa = 1.0f; + float rope_scaling_alpha = 0.0f; // NTK-aware alpha for XDRoPE uint32_t n_ctx_orig_yarn; float rope_yarn_log_mul = 0.0f; @@ -351,6 +352,8 @@ struct llama_hparams { bool use_mrope() const; + + bool use_xdrope() const; }; static_assert(std::is_trivially_copyable::value, "llama_hparams must be trivially copyable"); diff --git a/src/llama-model.cpp b/src/llama-model.cpp index edbaf52a2f8..e8c272d2d14 100644 --- a/src/llama-model.cpp +++ b/src/llama-model.cpp @@ -722,6 +722,13 @@ void llama_model::load_hparams(llama_model_loader & ml) { ml.get_key(LLM_KV_EXPERT_GROUP_COUNT, hparams.n_expert_groups, false); ml.get_key(LLM_KV_EXPERT_GROUP_USED_COUNT, hparams.n_group_used, false); + if (arch == LLM_ARCH_HUNYUAN_VL || arch == LLM_ARCH_HUNYUAN_DENSE) { + if 
(hparams.n_expert <= 1) { + hparams.n_expert = 0; + hparams.n_expert_used = 0; + } + } + if (arch == LLM_ARCH_WAVTOKENIZER_DEC) { ml.get_key(LLM_KV_FEATURES_LENGTH, hparams.n_embd); ml.get_key(LLM_KV_EMBEDDING_LENGTH, hparams.n_embd_out_impl); @@ -800,6 +807,7 @@ void llama_model::load_hparams(llama_model_loader & ml) { hparams.rope_freq_scale_train = ropescale == 0.0f ? 1.0f : 1.0f/ropescale; ml.get_key(LLM_KV_ROPE_SCALING_ATTN_FACTOR, hparams.rope_attn_factor, false); + ml.get_key(LLM_KV_ROPE_SCALING_ALPHA, hparams.rope_scaling_alpha, false); // non-transformer models do not have attention heads if (hparams.n_head() > 0) { @@ -2575,9 +2583,23 @@ void llama_model::load_hparams(llama_model_loader & ml) { default: type = LLM_TYPE_UNKNOWN; } } break; + case LLM_ARCH_HUNYUAN_VL: case LLM_ARCH_HUNYUAN_DENSE: { ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps); + ml.get_key_or_arr(LLM_KV_ROPE_DIMENSION_SECTIONS, hparams.rope_sections, 4, false); + + // XDRoPE / NTK-aware scaling: base = rope_theta * alpha^(dim / (dim - 2)) + if (hparams.rope_scaling_alpha > 0.0f) { + const int dim = hparams.n_embd_head_k(); + hparams.rope_freq_base_train = hparams.rope_freq_base_train + * powf(hparams.rope_scaling_alpha, (float)dim / (float)(dim - 2)); + } + + if (hparams.n_expert <= 1) { + hparams.n_expert = 0; + hparams.n_expert_used = 0; + } switch (hparams.n_embd) { case 1024: type = LLM_TYPE_0_5B; break; @@ -6938,6 +6960,7 @@ bool llama_model::load_tensors(llama_model_loader & ml) { layer.ffn_down_shexp = create_tensor(tn(LLM_TENSOR_FFN_DOWN_SHEXP, "weight", i), {n_ff_shexp, n_embd}, 0); } } break; + case LLM_ARCH_HUNYUAN_VL: case LLM_ARCH_HUNYUAN_DENSE: { tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0); @@ -8958,6 +8981,7 @@ ggml_cgraph * llama_model::build_graph(const llm_graph_params & params) const { { llm = std::make_unique(*this, params); } break; + case LLM_ARCH_HUNYUAN_VL: case LLM_ARCH_HUNYUAN_DENSE: { llm = 
std::make_unique(*this, params); @@ -9307,6 +9331,9 @@ llama_rope_type llama_model_rope_type(const llama_model * model) { case LLM_ARCH_GLM4_MOE: return model->hparams.use_mrope() ? LLAMA_ROPE_TYPE_MROPE : LLAMA_ROPE_TYPE_NEOX; + case LLM_ARCH_HUNYUAN_VL: + return model->hparams.use_mrope() ? LLAMA_ROPE_TYPE_MROPE : LLAMA_ROPE_TYPE_NEOX; + // all model arches should be listed explicitly here case LLM_ARCH_UNKNOWN: GGML_ABORT("unknown architecture"); diff --git a/src/models/hunyuan-dense.cpp b/src/models/hunyuan-dense.cpp index 97f5da8ee90..fc954a35067 100644 --- a/src/models/hunyuan-dense.cpp +++ b/src/models/hunyuan-dense.cpp @@ -6,6 +6,11 @@ llm_build_hunyuan_dense::llm_build_hunyuan_dense(const llama_model & model, cons GGML_ASSERT(n_embd_head == hparams.n_embd_head_k()); GGML_ASSERT(n_embd_head == n_rot); + const bool use_xdrope = hparams.use_xdrope(); + + int sections[4]; + std::copy(std::begin(hparams.rope_sections), std::begin(hparams.rope_sections) + 4, sections); + ggml_tensor * cur; ggml_tensor * inpL; @@ -37,22 +42,36 @@ llm_build_hunyuan_dense::llm_build_hunyuan_dense(const llama_model & model, cons auto [Qcur, Kcur, Vcur] = build_qkv(model.layers[il], cur, n_embd_head, n_head, n_head_kv, il); - Qcur = ggml_rope_ext( - ctx0, Qcur, inp_pos, rope_factors, - n_rot, rope_type, n_ctx_orig, freq_base, freq_scale, - ext_factor, attn_factor, beta_fast, beta_slow - ); + if (use_xdrope) { + Qcur = ggml_rope_multi( + ctx0, Qcur, inp_pos, rope_factors, + n_rot, sections, rope_type, n_ctx_orig, freq_base, freq_scale, + ext_factor, attn_factor, beta_fast, beta_slow + ); + + Kcur = ggml_rope_multi( + ctx0, Kcur, inp_pos, rope_factors, + n_rot, sections, rope_type, n_ctx_orig, freq_base, freq_scale, + ext_factor, attn_factor, beta_fast, beta_slow + ); + } else { + Qcur = ggml_rope_ext( + ctx0, Qcur, inp_pos, rope_factors, + n_rot, rope_type, n_ctx_orig, freq_base, freq_scale, + ext_factor, attn_factor, beta_fast, beta_slow + ); + + Kcur = ggml_rope_ext( + ctx0, Kcur, 
inp_pos, rope_factors, + n_rot, rope_type, n_ctx_orig, freq_base, freq_scale, + ext_factor, attn_factor, beta_fast, beta_slow + ); + } cb(Qcur, "Qcur", il); cb(Kcur, "Kcur", il); cb(Vcur, "Vcur", il); - Kcur = ggml_rope_ext( - ctx0, Kcur, inp_pos, rope_factors, - n_rot, rope_type, n_ctx_orig, freq_base, freq_scale, - ext_factor, attn_factor, beta_fast, beta_slow - ); - Kcur = build_norm(Kcur, model.layers[il].attn_k_norm, nullptr, LLM_NORM_RMS, il); diff --git a/tests/test-backend-ops.cpp b/tests/test-backend-ops.cpp index 828a9c14a45..d82f3abe128 100644 --- a/tests/test-backend-ops.cpp +++ b/tests/test-backend-ops.cpp @@ -5778,6 +5778,40 @@ struct test_interpolate : public test_case { } }; +// GGML_OP_UPSCALE (via ggml_interpolate_sf) - custom scale factors +struct test_interpolate_sf : public test_case { + const ggml_type type; + const std::array ne; + const std::array ne_tgt; + const ggml_scale_mode mode; + const float sf0; + const float sf1; + + std::string vars() override { + return VARS_TO_STR6(type, ne, ne_tgt, mode, sf0, sf1); + } + + test_interpolate_sf(ggml_type type = GGML_TYPE_F32, + std::array ne = {2, 5, 7, 11}, + std::array ne_tgt = {5, 7, 11, 13}, + ggml_scale_mode mode = GGML_SCALE_MODE_BILINEAR, + float sf0 = 1.0f, + float sf1 = 1.0f) + : type(type), ne(ne), ne_tgt(ne_tgt), mode(mode), sf0(sf0), sf1(sf1) {} + + ggml_tensor * build_graph(ggml_context * ctx) override { + ggml_tensor * a = ggml_new_tensor(ctx, type, 4, ne.data()); + ggml_set_name(a, "a"); + + ggml_tensor * out = ggml_interpolate_sf(ctx, a, + ne_tgt[0], ne_tgt[1], ne_tgt[2], ne_tgt[3], + mode, sf0, sf1); + ggml_set_name(out, "out"); + + return out; + } +}; + // GGML_OP_GROUP_NORM struct test_group_norm : public test_case { const ggml_type type; @@ -8443,6 +8477,27 @@ static std::vector> make_test_cases_eval() { test_cases.emplace_back(new test_interpolate(GGML_TYPE_F32, {4, 1, 3, 2}, {1, 1, 3, 2}, (ggml_scale_mode)(mode | GGML_SCALE_FLAG_ALIGN_CORNERS))); } + // ggml_interpolate_sf - 
custom scale factors + for (ggml_scale_mode mode : {GGML_SCALE_MODE_NEAREST, GGML_SCALE_MODE_BILINEAR, GGML_SCALE_MODE_BICUBIC, ggml_scale_mode(GGML_SCALE_MODE_BILINEAR | GGML_SCALE_FLAG_ANTIALIAS)}) { + test_cases.emplace_back(new test_interpolate_sf(GGML_TYPE_F32, {2, 5, 7, 11}, {5, 7, 11, 13}, mode, 5.0f/2.0f, 7.0f/5.0f)); + test_cases.emplace_back(new test_interpolate_sf(GGML_TYPE_F32, {5, 7, 11, 13}, {2, 5, 7, 11}, mode, 2.0f/5.0f, 5.0f/7.0f)); + } + for (ggml_scale_mode mode : {GGML_SCALE_MODE_BILINEAR, GGML_SCALE_MODE_BICUBIC}) { + test_cases.emplace_back(new test_interpolate_sf(GGML_TYPE_F32, {2, 5, 7, 11}, {5, 7, 11, 13}, (ggml_scale_mode)(mode | GGML_SCALE_FLAG_ALIGN_CORNERS), 5.0f/2.0f, 7.0f/5.0f)); + test_cases.emplace_back(new test_interpolate_sf(GGML_TYPE_F32, {1, 4, 3, 2}, {2, 8, 3, 2}, (ggml_scale_mode)(mode | GGML_SCALE_FLAG_ALIGN_CORNERS), 2.0f/1.0f, 8.0f/4.0f)); + test_cases.emplace_back(new test_interpolate_sf(GGML_TYPE_F32, {4, 1, 3, 2}, {1, 1, 3, 2}, (ggml_scale_mode)(mode | GGML_SCALE_FLAG_ALIGN_CORNERS), 1.0f/4.0f, 1.0f/1.0f)); + } + for (ggml_scale_mode mode : {GGML_SCALE_MODE_NEAREST, GGML_SCALE_MODE_BILINEAR, GGML_SCALE_MODE_BICUBIC, ggml_scale_mode(GGML_SCALE_MODE_BILINEAR | GGML_SCALE_FLAG_ANTIALIAS)}) { + test_cases.emplace_back(new test_interpolate_sf(GGML_TYPE_F32, {14, 14, 1152, 1}, {28, 28, 1152, 1}, mode, (28.0f+0.1f)/14.0f, (28.0f+0.1f)/14.0f)); + test_cases.emplace_back(new test_interpolate_sf(GGML_TYPE_F32, {14, 14, 768, 1}, {20, 16, 768, 1}, mode, (20.0f+0.1f)/14.0f, (16.0f+0.1f)/14.0f)); + test_cases.emplace_back(new test_interpolate_sf(GGML_TYPE_F32, {8, 8, 64, 1}, {8, 8, 64, 1}, mode, 1.0f, 1.0f)); + test_cases.emplace_back(new test_interpolate_sf(GGML_TYPE_F32, {28, 28, 768, 1}, {14, 14, 768, 1}, mode, (14.0f+0.1f)/28.0f, (14.0f+0.1f)/28.0f)); + } + for (ggml_scale_mode mode : {GGML_SCALE_MODE_NEAREST, GGML_SCALE_MODE_BILINEAR, GGML_SCALE_MODE_BICUBIC, ggml_scale_mode(GGML_SCALE_MODE_BILINEAR | GGML_SCALE_FLAG_ANTIALIAS)}) 
{ + test_cases.emplace_back(new test_interpolate_sf(GGML_TYPE_F32, {10, 10, 64, 1}, {20, 20, 64, 1}, mode, 1.5f, 1.5f)); + test_cases.emplace_back(new test_interpolate_sf(GGML_TYPE_F32, {10, 10, 64, 1}, {20, 16, 64, 1}, mode, 1.5f, 1.2f)); + } + test_cases.emplace_back(new test_sum()); test_cases.emplace_back(new test_sum(GGML_TYPE_F32, {11, 5, 6, 3}, {0, 2, 1, 3})); // row-contiguous but non-contiguous test_cases.emplace_back(new test_sum(GGML_TYPE_F32, {11, 5, 6, 3}, {0, 3, 2, 1})); diff --git a/tools/mtmd/clip-impl.h b/tools/mtmd/clip-impl.h index 17cb703f7fb..b94deadd47a 100644 --- a/tools/mtmd/clip-impl.h +++ b/tools/mtmd/clip-impl.h @@ -150,7 +150,7 @@ #define TN_TOK_BOI "v.boi" #define TN_TOK_EOI "v.eoi" -// hunyuanocr +// hunyuanocr / hunyuanvl (shared GGUF tensor names) #define TN_MM_PRE_NORM "mm.pre_norm.%s" #define TN_TOK_IMG_BEGIN "mm.image_begin" #define TN_TOK_IMG_END "mm.image_end" @@ -293,6 +293,7 @@ enum projector_type { PROJECTOR_TYPE_KIMIK25, PROJECTOR_TYPE_NEMOTRON_V2_VL, PROJECTOR_TYPE_HUNYUANOCR, + PROJECTOR_TYPE_HUNYUANVL, PROJECTOR_TYPE_UNKNOWN, }; @@ -338,6 +339,7 @@ static std::map PROJECTOR_TYPE_NAMES = { { PROJECTOR_TYPE_KIMIK25, "kimik25"}, { PROJECTOR_TYPE_NEMOTRON_V2_VL, "nemotron_v2_vl"}, { PROJECTOR_TYPE_HUNYUANOCR, "hunyuanocr"}, + { PROJECTOR_TYPE_HUNYUANVL, "hunyuanvl_merger"}, }; static projector_type clip_projector_type_from_string(const std::string & str) { diff --git a/tools/mtmd/clip.cpp b/tools/mtmd/clip.cpp index f0e8786b660..616f590ab21 100644 --- a/tools/mtmd/clip.cpp +++ b/tools/mtmd/clip.cpp @@ -912,6 +912,7 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32 builder = std::make_unique(ctx, img); } break; case PROJECTOR_TYPE_HUNYUANOCR: + case PROJECTOR_TYPE_HUNYUANVL: { builder = std::make_unique(ctx, img); } break; @@ -1459,6 +1460,16 @@ struct clip_model_loader { get_u32(KEY_IMAGE_MAX_PIXELS, hparams.image_max_pixels); hparams.set_warmup_n_tokens(28*28); } break; + case 
PROJECTOR_TYPE_HUNYUANVL: + { + hparams.n_merge = 2; + hparams.image_resize_algo = RESIZE_ALGO_BICUBIC_PILLOW; + hparams.image_resize_pad = false; + hparams.ffn_op = FFN_GELU; + get_u32(KEY_SPATIAL_MERGE_SIZE, hparams.n_merge, false); + hparams.set_limit_image_tokens(256, 16384); + hparams.set_warmup_n_tokens(32*32); + } break; case PROJECTOR_TYPE_LFM2A: { // audio preprocessing params @@ -2159,6 +2170,7 @@ struct clip_model_loader { model.mm_eoi = get_tensor(TN_TOK_EOI); } break; case PROJECTOR_TYPE_HUNYUANOCR: + case PROJECTOR_TYPE_HUNYUANVL: { // proj.0 -> mm.0 (conv1), proj.2 -> mm.2 (conv2), mlp -> mm.model.fc (linear) model.mm_0_w = get_tensor(string_format(TN_LLAVA_PROJ, 0, "weight")); @@ -2797,6 +2809,7 @@ int clip_n_output_tokens_x(const struct clip_ctx * ctx, struct clip_image_f32 * case PROJECTOR_TYPE_GLM4V: case PROJECTOR_TYPE_PADDLEOCR: case PROJECTOR_TYPE_HUNYUANOCR: + case PROJECTOR_TYPE_HUNYUANVL: case PROJECTOR_TYPE_YOUTUVL: return (img->nx / params.patch_size) / 2; case PROJECTOR_TYPE_STEP3VL: @@ -2816,6 +2829,7 @@ int clip_n_output_tokens_y(const struct clip_ctx * ctx, struct clip_image_f32 * case PROJECTOR_TYPE_QWEN3VL: case PROJECTOR_TYPE_GLM4V: case PROJECTOR_TYPE_PADDLEOCR: + case PROJECTOR_TYPE_HUNYUANVL: case PROJECTOR_TYPE_YOUTUVL: return (img->ny / params.patch_size) / 2; case PROJECTOR_TYPE_STEP3VL: @@ -3003,6 +3017,7 @@ int clip_n_output_tokens(const struct clip_ctx * ctx, struct clip_image_f32 * im n_patches = h * (h + 1) + 1; } break; case PROJECTOR_TYPE_HUNYUANOCR: + case PROJECTOR_TYPE_HUNYUANVL: { int merge = ctx->model.hparams.n_merge; int ow = (img->nx / patch_size) / merge; @@ -3463,6 +3478,7 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima case PROJECTOR_TYPE_PHI4: case PROJECTOR_TYPE_COGVLM: case PROJECTOR_TYPE_HUNYUANOCR: + case PROJECTOR_TYPE_HUNYUANVL: { // do nothing } break; @@ -3691,6 +3707,7 @@ int clip_n_mmproj_embd(const struct clip_ctx * ctx) { case PROJECTOR_TYPE_KIMIK25: return 
ctx->model.mm_2_w->ne[1]; case PROJECTOR_TYPE_HUNYUANOCR: + case PROJECTOR_TYPE_HUNYUANVL: return ctx->model.mm_model_proj->ne[1]; case PROJECTOR_TYPE_COGVLM: return ctx->model.mm_4h_to_h_w->ne[1]; diff --git a/tools/mtmd/models/hunyuanocr.cpp b/tools/mtmd/models/hunyuanocr.cpp index 37d1e2b86a9..f13ad45d88c 100644 --- a/tools/mtmd/models/hunyuanocr.cpp +++ b/tools/mtmd/models/hunyuanocr.cpp @@ -1,11 +1,35 @@ #include "models.h" +#include <cmath> ggml_cgraph * clip_graph_hunyuanocr::build() { const int merge = hparams.n_merge; const int pw = n_patches_x; const int ph = n_patches_y; - ggml_tensor * pos_embd = resize_position_embeddings(GGML_SCALE_MODE_BILINEAR); + // Position embedding interpolation. + // HunyuanVL uses explicit scale factors (target+0.1)/n_grid to match Python's behavior. + // HunyuanOCR uses the same square layout and the standard ratio-based interpolation. + ggml_tensor * pos_embd = nullptr; + if (proj_type == PROJECTOR_TYPE_HUNYUANVL && model.position_embeddings) { + const int64_t n_pos = model.position_embeddings->ne[1]; // n_grid * n_grid + const int n_grid = (int)std::round(std::sqrt((double)n_pos)); + ggml_tensor * pos_patch = model.position_embeddings; + if (ph == n_grid && pw == n_grid) { + pos_embd = pos_patch; // no interpolation needed + } else { + pos_patch = ggml_reshape_3d(ctx0, pos_patch, n_embd, n_grid, n_grid); + pos_patch = ggml_permute(ctx0, pos_patch, 2, 0, 1, 3); + pos_patch = ggml_cont(ctx0, pos_patch); + pos_patch = ggml_interpolate_sf(ctx0, pos_patch, pw, ph, n_embd, 1, + GGML_SCALE_MODE_BILINEAR, + (float)(pw + 0.1f) / n_grid, + (float)(ph + 0.1f) / n_grid); + pos_patch = ggml_permute(ctx0, pos_patch, 1, 2, 0, 3); + pos_embd = ggml_cont_2d(ctx0, pos_patch, n_embd, ph * pw); + } + } else { + pos_embd = resize_position_embeddings(GGML_SCALE_MODE_BILINEAR); + } ggml_tensor * inp = build_inp(); ggml_tensor * cur = build_vit(inp, n_patches, NORM_TYPE_NORMAL, hparams.ffn_op, pos_embd, nullptr); diff --git a/tools/mtmd/mtmd-helper.cpp 
b/tools/mtmd/mtmd-helper.cpp index 145b88cea44..90ccb545952 100644 --- a/tools/mtmd/mtmd-helper.cpp +++ b/tools/mtmd/mtmd-helper.cpp @@ -180,6 +180,30 @@ struct decode_embd_batch { } } + void set_position_xdrope_2d(llama_pos pos_0, const std::vector<mtmd_decoder_pos> & rel_pos, llama_seq_id seq_id) { + GGML_ASSERT(n_pos_per_embd == 4); + GGML_ASSERT(!rel_pos.empty() && (int32_t)rel_pos.size() == batch.n_tokens); + seq_id_0[0] = seq_id; + for (int32_t i = 0; i < batch.n_tokens; i++) { + if (i == 0 || i == batch.n_tokens - 1) { + pos[i ] = pos_0 + rel_pos[i].t; + pos[i + batch.n_tokens ] = pos_0 + rel_pos[i].x; + pos[i + batch.n_tokens * 2] = pos_0 + rel_pos[i].y; + pos[i + batch.n_tokens * 3] = pos_0 + rel_pos[i].z; + } else { + pos[i ] = pos_0 + rel_pos[i].t; + pos[i + batch.n_tokens ] = rel_pos[i].x; + pos[i + batch.n_tokens * 2] = rel_pos[i].y; + pos[i + batch.n_tokens * 3] = rel_pos[i].z; + } + } + for (int j = 0; j < batch.n_tokens; j++) { + batch.n_seq_id[j] = 1; + batch.seq_id [j] = seq_id_0.data(); + batch.logits [j] = false; + } + } + // M-RoPE for audio void set_position_mrope_1d(llama_pos pos_0, llama_seq_id seq_id) { GGML_ASSERT(n_pos_per_embd == 4); @@ -252,7 +276,7 @@ int32_t mtmd_helper_decode_image_chunk( const llama_model * model = llama_get_model(lctx); int n_mmproj_embd = llama_model_n_embd_inp(model); - int n_pos_per_embd = mtmd_decode_use_mrope(ctx) ? 4 : 1; + int n_pos_per_embd = (mtmd_decode_use_mrope(ctx) || mtmd_decode_use_xdrope(ctx)) ? 
4 : 1; int32_t n_tokens = mtmd_input_chunk_get_n_tokens(chunk); int32_t i_batch = 0; @@ -275,6 +299,20 @@ int32_t mtmd_helper_decode_image_chunk( } else { GGML_ABORT("invalid chunk type for M-RoPE"); } + } else if (mtmd_decode_use_xdrope(ctx)) { + if (chunk_type == MTMD_INPUT_CHUNK_TYPE_IMAGE) { + const auto image_tokens = mtmd_input_chunk_get_tokens_image(chunk); + if (!image_tokens) { + LOG_ERR("failed to decode chunk: image tokens are null\n"); + return -1; + } + const auto n_tokens = mtmd_image_tokens_get_n_tokens(image_tokens); + std::vector<mtmd_decoder_pos> rel_pos(n_tokens); + mtmd_helper_image_get_decoder_pos(image_tokens, rel_pos.data()); + batch_embd.set_position_xdrope_2d(n_past, rel_pos, seq_id); + } else { + GGML_ABORT("invalid chunk type for XD-RoPE"); + } } else { batch_embd.set_position_normal(n_past, seq_id); } diff --git a/tools/mtmd/mtmd.cpp b/tools/mtmd/mtmd.cpp index d0a0a4865ef..c736d7c5506 100644 --- a/tools/mtmd/mtmd.cpp +++ b/tools/mtmd/mtmd.cpp @@ -37,7 +37,12 @@ struct mtmd_image_tokens { uint32_t nx; // number of tokens in x direction uint32_t ny; // number of tokens in y direction bool use_mrope_pos = false; // use M-RoPE position counting (the whole image is 1 temporal position) - uint32_t n_tokens() const { return nx * ny; } + bool use_xdrope_pos = false; // use XD-RoPE position counting + uint32_t n_boi = 0; // number of BOI tokens, for xdrope + uint32_t n_eoi = 0; // number of EOI tokens, for xdrope + uint32_t n_newline = 0; // number of image newline tokens, for xdrope + uint32_t image_idx = 0; // image index, for xdrope + uint32_t n_tokens() const { return nx * ny + n_newline + n_boi + n_eoi; } clip_image_f32_batch batch_f32; // preprocessed image patches std::string id; // optional user-defined ID, useful for KV cache tracking @@ -46,6 +51,11 @@ struct mtmd_image_tokens { nx, ny, use_mrope_pos, + use_xdrope_pos, + n_boi, + n_eoi, + n_newline, + image_idx, batch_f32.clone(), id }; @@ -430,6 +440,7 @@ struct mtmd_context { image_preproc = 
std::make_unique(ctx_v); } break; case PROJECTOR_TYPE_HUNYUANOCR: + case PROJECTOR_TYPE_HUNYUANVL: { // note: these use fullwidth | (U+FF5C) and ▁ (U+2581) to match the tokenizer vocabulary img_beg = "<|hy_place▁holder▁no▁100|>"; @@ -778,6 +789,16 @@ struct mtmd_tokenizer { image_tokens->nx = clip_n_output_tokens_x(ctx->ctx_v, batch_f32.entries[0].get()); image_tokens->ny = clip_n_output_tokens_y(ctx->ctx_v, batch_f32.entries[0].get()); image_tokens->use_mrope_pos = true; + } else if(mtmd_decode_use_xdrope(ctx)) { + // (e.g. HunyuanVL adds row newlines + BOI/EOI), + // HunyuanVL: 1 BOI + ny rows × (nx tokens + 1 newline) + 1 EOI + image_tokens->nx = clip_n_output_tokens_x(ctx->ctx_v, batch_f32.entries[0].get()); + image_tokens->ny = clip_n_output_tokens_y(ctx->ctx_v, batch_f32.entries[0].get()); + image_tokens->n_boi = 1; + image_tokens->n_eoi = 1; + image_tokens->n_newline = image_tokens->ny; + image_tokens->image_idx = 0; + image_tokens->use_xdrope_pos = true; } else { // other models, we only need the total number of tokens image_tokens->nx = n_tokens; @@ -1045,6 +1066,15 @@ bool mtmd_decode_use_mrope(mtmd_context * ctx) { } } +bool mtmd_decode_use_xdrope(mtmd_context * ctx) { + switch (ctx->proj_type_v()) { + case PROJECTOR_TYPE_HUNYUANVL: + return true; + default: + return false; + } +} + bool mtmd_support_vision(mtmd_context * ctx) { return ctx->ctx_v != nullptr; } @@ -1248,9 +1278,39 @@ size_t mtmd_image_tokens_get_ny(const mtmd_image_tokens * image_tokens) { mtmd_decoder_pos mtmd_image_tokens_get_decoder_pos(const mtmd_image_tokens * image_tokens, size_t i) { mtmd_decoder_pos pos; - pos.t = 0; - pos.x = i % image_tokens->nx; - pos.y = i / image_tokens->nx; + if (image_tokens->use_xdrope_pos == true) { + // HunyuanVL: BOI + rows with newlines + EOI + const uint32_t nx = image_tokens->nx; + const uint32_t n_total = image_tokens->n_tokens(); + const uint32_t image_idx = image_tokens->image_idx; + // Layout: [BOI] [token(0,0)...token(nx-1,0)] [newline(0)] ... 
[token(0,ny-1)...token(nx-1,ny-1)] [newline(ny-1)] [EOI] + // n_total = 2 + ny * (nx + 1) + if (i == 0) { + // BOI token - all 4 dims = sequential index + pos.t = i; pos.x = i; pos.y = i; pos.z = i; + } else if (i == n_total - 1) { + // EOI token - all 4 dims = sequential index + pos.t = i; pos.x = i; pos.y = i; pos.z = i; + } else { + // content token or newline + uint32_t offset = (uint32_t)i - 1; + uint32_t row = offset / (nx + 1); + uint32_t col = offset % (nx + 1); + if (col < nx) { + // regular token at (row, col) + pos.t = i; pos.x = col; pos.y = row; pos.z = image_idx; + } else { + // newline token at end of row + pos.t = i; pos.x = nx; pos.y = row; pos.z = image_idx; + } + } + } else { + // standard 2D grid (Qwen2VL, etc.) + pos.t = 0; + pos.x = i % image_tokens->nx; + pos.y = i / image_tokens->nx; + pos.z = 0; + } return pos; } @@ -1264,6 +1324,11 @@ llama_pos mtmd_image_tokens_get_n_pos(const mtmd_image_tokens * image_tokens) { // t is omitted as we don't support video input return std::max(image_tokens->nx, image_tokens->ny); } + if (image_tokens->use_xdrope_pos) { + // HunyuanVL: the sequential (dim-0) position advances by the full token count + // (includes BOI/EOI and row newline tokens), not by max(nx, ny) + return image_tokens->n_tokens(); + } return image_tokens->n_tokens(); } diff --git a/tools/mtmd/mtmd.h b/tools/mtmd/mtmd.h index a6fd8efa5d0..dbba9d59278 100644 --- a/tools/mtmd/mtmd.h +++ b/tools/mtmd/mtmd.h @@ -117,6 +117,10 @@ MTMD_API bool mtmd_decode_use_non_causal(mtmd_context * ctx, const mtmd_input_ch // whether the current model use M-RoPE for llama_decode MTMD_API bool mtmd_decode_use_mrope(mtmd_context * ctx); +// whether the current model uses XD-RoPE: HunyuanVL-style M-RoPE +// (token layout differs from standard 2D grid: BOI + rows-with-newlines + EOI) +MTMD_API bool mtmd_decode_use_xdrope(mtmd_context * ctx); + // whether the current model supports vision input MTMD_API bool mtmd_support_vision(mtmd_context * ctx); @@ -196,10 
+200,11 @@ struct mtmd_decoder_pos { uint32_t t; uint32_t x; uint32_t y; + uint32_t z; }; // get position for decoder attention, to be used by M-RoPE models // i is the index of the embedding token, ranging from 0 to mtmd_image_tokens_get_n_tokens() - 1 -// return relative position (for example, embedding 0 will have position (0, 0, 0); remember to adjust it to the current absolute position) +// return relative position (for example, embedding 0 will have position (0, 0, 0, 0); remember to adjust it to the current absolute position) MTMD_API struct mtmd_decoder_pos mtmd_image_tokens_get_decoder_pos(const mtmd_image_tokens * image_tokens, size_t i); // tokenize an input text prompt and a list of bitmaps (images/audio) diff --git a/tools/mtmd/tests.sh b/tools/mtmd/tests.sh index 5da48d61bfd..83416fb272b 100755 --- a/tools/mtmd/tests.sh +++ b/tools/mtmd/tests.sh @@ -91,6 +91,7 @@ add_test_vision "ggml-org/LightOnOCR-1B-1025-GGUF:Q8_0" add_test_vision "ggml-org/DeepSeek-OCR-GGUF:Q8_0" -p "Free OCR." --chat-template deepseek-ocr add_test_vision "ggml-org/dots.ocr-GGUF:Q8_0" -p "OCR" add_test_vision "ggml-org/HunyuanOCR-GGUF:Q8_0" -p "OCR" +add_test_vision "ggml-org/HunyuanVL-4B-GGUF:Q8_0" add_test_vision "ggml-org/gemma-4-E2B-it-GGUF:Q8_0" --jinja add_test_audio "ggml-org/ultravox-v0_5-llama-3_2-1b-GGUF:Q8_0"