diff --git a/conversion/hunyuan.py b/conversion/hunyuan.py index be54f5810b0..537f023aa01 100644 --- a/conversion/hunyuan.py +++ b/conversion/hunyuan.py @@ -189,7 +189,8 @@ def set_vocab(self): self.gguf_writer.add_token_list(tokens) self.gguf_writer.add_token_types(toktypes) - # HunyuanOCR has pad_token_id=-1 in config.json; exclude pad from SpecialVocab + # Some HunYuanVL variants (e.g. OCR-style configs) have pad_token_id=-1; + # guard SpecialVocab so it doesn't try to emit an invalid pad id. token_types = None if (self.hparams.get("pad_token_id") or 0) < 0: token_types = ('bos', 'eos', 'unk', 'sep', 'cls', 'mask') @@ -250,7 +251,8 @@ def set_vocab(self): self._fix_special_tokens() def set_gguf_parameters(self): - # HunyuanOCR has num_experts=1 which is not MoE, prevent parent from writing it + # Some HunYuanVL variants set num_experts=1 (not real MoE); + # prevent the parent class from emitting expert_count metadata in that case. saved_num_experts = self.hparams.pop("num_experts", None) super().set_gguf_parameters() if saved_num_experts is not None and saved_num_experts > 1: @@ -288,51 +290,21 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter @ModelBase.register("HunYuanVLForConditionalGeneration") class HunyuanVLVisionModel(MmprojModel): - # Handles both HunyuanOCR and HunyuanVL, which share the HF architecture name - # "HunYuanVLForConditionalGeneration" and the `vit.perceive.*` vision layout. - # Each variant maps to a different projector type in clip.cpp so image - # preprocessing follows the correct code path. - def __init__(self, *args, **kwargs): super().__init__(*args, **kwargs) assert self.hparams_vision is not None - # HunyuanOCR / HunyuanVL uses max_image_size instead of image_size + # HunyuanVL uses max_image_size instead of image_size if "image_size" not in self.hparams_vision: self.hparams_vision["image_size"] = self.hparams_vision.get("max_image_size", 2048) - @staticmethod - def is_ocr_variant(hparams: dict) -> bool: - """Return True for HunyuanOCR, False for HunyuanVL. - - The projector's output dim must equal the text model's hidden_size by - construction (that's what "projector" means). HunyuanOCR pairs a 1B text - backbone (hidden=1024); HunyuanVL pairs a 4B one (hidden=3072). So the - ViT -> LLM projection dim is a hard architectural signature, not a - magic number. - """ - vision_out = int((hparams.get("vision_config") or {}).get("out_hidden_size", 0)) - return vision_out == 1024 - def set_gguf_parameters(self): super().set_gguf_parameters() assert self.hparams_vision is not None vcfg = self.hparams_vision - - if self.is_ocr_variant(self.global_config): - # --- HunyuanOCR --- - self.gguf_writer.add_clip_projector_type(gguf.VisionProjectorType.HUNYUANOCR) - self.gguf_writer.add_vision_use_gelu(True) - self.gguf_writer.add_vision_attention_layernorm_eps(vcfg.get("rms_norm_eps", 1e-5)) - self.gguf_writer.add_vision_spatial_merge_size(vcfg.get("spatial_merge_size", 2)) - self.gguf_writer.add_vision_min_pixels(self.preprocessor_config["min_pixels"]) - self.gguf_writer.add_vision_max_pixels(self.preprocessor_config["max_pixels"]) - return - - # --- HunyuanVL --- self.gguf_writer.add_clip_projector_type(gguf.VisionProjectorType.HUNYUANVL) - self.gguf_writer.add_vision_use_gelu(str(vcfg["hidden_act"]).lower() == "gelu") - self.gguf_writer.add_vision_attention_layernorm_eps(float(vcfg["rms_norm_eps"])) - self.gguf_writer.add_vision_spatial_merge_size(int(vcfg["spatial_merge_size"])) + self.gguf_writer.add_vision_use_gelu(True) + self.gguf_writer.add_vision_attention_layernorm_eps(vcfg.get("rms_norm_eps", 1e-5)) + self.gguf_writer.add_vision_spatial_merge_size(vcfg.get("spatial_merge_size", 2)) self.gguf_writer.add_vision_min_pixels(int(self.preprocessor_config["min_pixels"])) self.gguf_writer.add_vision_max_pixels(int(self.preprocessor_config["max_pixels"])) @@ -353,7 +325,7 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter def tensor_force_quant(self, name, new_name, bid, n_dims): # force conv weights to F32 or F16 to avoid BF16 IM2COL issues on Metal - # Both HunyuanOCR and HunyuanVL emit the ViT -> LLM projection as mm.0/mm.2. + # HunyuanVL emit the ViT -> LLM projection as mm.0/mm.2. if ("mm.0." in new_name or "mm.2." in new_name) and new_name.endswith(".weight"): return gguf.GGMLQuantizationType.F16 if self.ftype == gguf.LlamaFileType.MOSTLY_F16 else gguf.GGMLQuantizationType.F32 return super().tensor_force_quant(name, new_name, bid, n_dims) @@ -361,40 +333,18 @@ def tensor_force_quant(self, name, new_name, bid, n_dims): @ModelBase.register("HunYuanVLForConditionalGeneration") class HunyuanVLTextModel(HunYuanModel): - # The "HunYuanVLForConditionalGeneration" HF architecture covers both HunyuanOCR - # and HunyuanVL. HunyuanOCR reuses the HunYuan-Dense text backbone (standard RoPE), - # while HunyuanVL introduces a new LLM arch with XD-RoPE. Detect the variant from - # the config and pick the matching GGUF architecture. model_arch = gguf.MODEL_ARCH.HUNYUAN_VL - @staticmethod - def _is_ocr_config(hparams: dict) -> bool: - # OCR pairs a 1B text backbone (hidden=1024) with a ViT projector that - # outputs 1024-d; HunyuanVL uses 3072-d. Keep in sync with - # HunyuanVLVisionModel.is_ocr_variant. - return int((hparams.get("vision_config") or {}).get("out_hidden_size", 0)) == 1024 - def __init__(self, dir_model: Path, *args, **kwargs): - raw_hparams = kwargs.get("hparams") or ModelBase.load_hparams(dir_model, is_mistral_format=False) - if self._is_ocr_config(raw_hparams): - self.model_arch = gguf.MODEL_ARCH.HUNYUAN_DENSE - else: - self.model_arch = gguf.MODEL_ARCH.HUNYUAN_VL super().__init__(dir_model, *args, **kwargs) def set_gguf_parameters(self): super().set_gguf_parameters() - # Only emit XD-RoPE metadata for the HunyuanVL backbone; HunyuanOCR uses - # the HunYuan-Dense arch which already handles standard rope in super(). - if self.model_arch != gguf.MODEL_ARCH.HUNYUAN_VL: - return - + # XD-RoPE metadata for the HunyuanVL; if self.rope_parameters.get("rope_type") != "xdrope": return - # defaults for HunyuanVL. The C++ side later computes: - # freq_base = rope_theta * alpha ** (head_dim / (head_dim - 2)) self.gguf_writer.add_rope_freq_base(float(self.rope_parameters["rope_theta"])) self.gguf_writer.add_rope_scaling_alpha(float(self.rope_parameters["alpha"])) self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.NONE) diff --git a/gguf-py/gguf/constants.py b/gguf-py/gguf/constants.py index c25f217f990..7fdcf03d7d1 100644 --- a/gguf-py/gguf/constants.py +++ b/gguf-py/gguf/constants.py @@ -747,7 +747,7 @@ class MODEL_TENSOR(IntEnum): V_LAYER_OUT_SCALE = auto() V_PRE_NORM = auto() V_POST_NORM = auto() - V_MM_PRE_NORM = auto() # hunyuanocr + V_MM_PRE_NORM = auto() # hunyuanvl V_MM_POST_NORM = auto() V_MM_INP_NORM = auto() V_MM_INP_PROJ = auto() # gemma3 @@ -791,8 +791,8 @@ class MODEL_TENSOR(IntEnum): V_MM_GATE = auto() # cogvlm V_TOK_BOI = auto() # cogvlm V_TOK_EOI = auto() # cogvlm - V_TOK_IMG_BEGIN = auto() # hunyuanocr - V_TOK_IMG_END = auto() # hunyuanocr + V_TOK_IMG_BEGIN = auto() # hunyuanvl + V_TOK_IMG_END = auto() # hunyuanvl V_STD_BIAS = auto() # gemma4 V_STD_SCALE = auto() # gemma4 V_SAM_POS_EMBD = auto() # Deepseek-OCR @@ -4273,7 +4273,6 @@ class VisionProjectorType: GLM4V = "glm4v" YOUTUVL = "youtuvl" NEMOTRON_V2_VL = "nemotron_v2_vl" - HUNYUANOCR = "hunyuanocr" HUNYUANVL = "hunyuanvl" MINICPMV4_6 = "minicpmv4_6" GRANITE_SPEECH = "granite_speech" # audio diff --git a/gguf-py/gguf/tensor_mapping.py b/gguf-py/gguf/tensor_mapping.py index f40cb828201..c2235cb3b61 100644 --- a/gguf-py/gguf/tensor_mapping.py +++ b/gguf-py/gguf/tensor_mapping.py @@ -1366,7 +1366,7 @@ class TensorNameMap: "mlp_AR.linear_{bid}", # PaddleOCR-VL "merger.mlp.{bid}", "vision_tower.merger.mlp.{bid}", # dots.ocr - "vit.perceive.proj.{bid}", # HunyuanOCR (proj.0 = conv1, proj.2 = conv2) + "vit.perceive.proj.{bid}", # HunyuanVL (proj.0 = conv1, proj.2 = conv2) ), MODEL_TENSOR.V_MMPROJ_FC: ( @@ -1374,7 +1374,7 @@ class TensorNameMap: "model.vision.linear_proj.linear_proj", # cogvlm "model.projector.layers", # Deepseek-OCR "visual.merger.proj", # glm4v - "vit.perceive.mlp", # HunyuanOCR + "vit.perceive.mlp", # HunyuanVL ), MODEL_TENSOR.V_MMPROJ_MLP: ( @@ -1403,7 +1403,7 @@ class TensorNameMap: "model.vision_tower.embeddings.patch_embeddings.projection", # Intern-S1 "vpm.embeddings.patch_embedding", "model.vision_model.embeddings.patch_embedding", # SmolVLM - "vit.embeddings.patch_embedding", # HunyuanOCR + "vit.embeddings.patch_embedding", # HunyuanVL "vision_tower.patch_conv", # pixtral-hf "vision_encoder.patch_conv", # pixtral "vision_model.patch_embedding.linear", # llama 4 @@ -1429,7 +1429,7 @@ class TensorNameMap: "model.vision_tower.embeddings.position_embeddings", # Intern-S1 "vpm.embeddings.position_embedding", "model.vision_model.embeddings.position_embedding", # SmolVLM - "vit.embeddings.position_embedding", # HunyuanOCR + "vit.embeddings.position_embedding", # HunyuanVL "vision_model.positional_embedding_vlm", # llama 4 "vision_tower.patch_embed.pos_emb", # kimi-vl "visual.pos_embed", # qwen3vl @@ -1442,12 +1442,12 @@ class TensorNameMap: MODEL_TENSOR.V_ENC_EMBD_IMGNL: ( "model.image_newline", # Deepseek-OCR - "vit.perceive.image_newline", # HunyuanOCR + "vit.perceive.image_newline", # HunyuanVL ), MODEL_TENSOR.V_ENC_EMBD_VSEP: ( "model.view_seperator", # Deepseek-OCR - "vit.perceive.image_sep", # HunyuanOCR + "vit.perceive.image_sep", # HunyuanVL ), MODEL_TENSOR.V_ENC_ATTN_QKV: ( @@ -1466,7 +1466,7 @@ class TensorNameMap: "model.vision_tower.encoder.layer.{bid}.attention.q_proj", # Intern-S1 "vpm.encoder.layers.{bid}.self_attn.q_proj", "model.vision_model.encoder.layers.{bid}.self_attn.q_proj", # SmolVLM - "vit.layers.{bid}.self_attn.q_proj", # HunyuanOCR + "vit.layers.{bid}.self_attn.q_proj", # HunyuanVL "vision_model.model.layers.{bid}.self_attn.q_proj", # llama4 "vision_tower.transformer.layers.{bid}.attention.q_proj", # pixtral-hf "vision_encoder.transformer.layers.{bid}.attention.wq", # pixtral @@ -1490,7 +1490,7 @@ class TensorNameMap: "model.vision_tower.encoder.layer.{bid}.attention.k_proj", # Intern-S1 "vpm.encoder.layers.{bid}.self_attn.k_proj", "model.vision_model.encoder.layers.{bid}.self_attn.k_proj", # SmolVLM - "vit.layers.{bid}.self_attn.k_proj", # HunyuanOCR + "vit.layers.{bid}.self_attn.k_proj", # HunyuanVL "vision_model.model.layers.{bid}.self_attn.k_proj", # llama4 "vision_tower.transformer.layers.{bid}.attention.k_proj", # pixtral-hf "vision_encoder.transformer.layers.{bid}.attention.wk", # pixtral @@ -1514,7 +1514,7 @@ class TensorNameMap: "model.vision_tower.encoder.layer.{bid}.attention.v_proj", # Intern-S1 "vpm.encoder.layers.{bid}.self_attn.v_proj", "model.vision_model.encoder.layers.{bid}.self_attn.v_proj", # SmolVLM - "vit.layers.{bid}.self_attn.v_proj", # HunyuanOCR + "vit.layers.{bid}.self_attn.v_proj", # HunyuanVL "vision_model.model.layers.{bid}.self_attn.v_proj", # llama4 "vision_tower.transformer.layers.{bid}.attention.v_proj", # pixtral-hf "vision_encoder.transformer.layers.{bid}.attention.wv", # pixtral @@ -1532,7 +1532,7 @@ class TensorNameMap: "model.vision_tower.encoder.layer.{bid}.layernorm_before", # Intern-S1 "vpm.encoder.layers.{bid}.layer_norm1", "model.vision_model.encoder.layers.{bid}.layer_norm1", # SmolVLM - "vit.layers.{bid}.input_layernorm", # HunyuanOCR + "vit.layers.{bid}.input_layernorm", # HunyuanVL "vision_tower.transformer.layers.{bid}.attention_norm", # pixtral-hf "vision_encoder.transformer.layers.{bid}.attention_norm", # pixtral "vision_model.model.layers.{bid}.input_layernorm", # llama4, gemma4 @@ -1553,7 +1553,7 @@ class TensorNameMap: "model.vision_tower.encoder.layer.{bid}.attention.projection_layer", # Intern-S1 "vpm.encoder.layers.{bid}.self_attn.out_proj", "model.vision_model.encoder.layers.{bid}.self_attn.out_proj", # SmolVLM - "vit.layers.{bid}.self_attn.o_proj", # HunyuanOCR + "vit.layers.{bid}.self_attn.o_proj", # HunyuanVL "model.vision_model.encoder.layers.{bid}.self_attn.projection_layer", # Janus Pro "vision_model.model.layers.{bid}.self_attn.o_proj", # llama4 "vision_tower.transformer.layers.{bid}.attention.o_proj", # pixtral-hf @@ -1580,7 +1580,7 @@ class TensorNameMap: "model.vision_tower.encoder.layer.{bid}.layernorm_after", # Intern-S1 "vpm.encoder.layers.{bid}.layer_norm2", "model.vision_model.encoder.layers.{bid}.layer_norm2", # SmolVLM - "vit.layers.{bid}.post_attention_layernorm", # HunyuanOCR + "vit.layers.{bid}.post_attention_layernorm", # HunyuanVL "vision_model.model.layers.{bid}.post_attention_layernorm", # llama4 "vision_tower.transformer.layers.{bid}.ffn_norm", # pixtral-hf "vision_encoder.transformer.layers.{bid}.ffn_norm", # pixtral @@ -1601,7 +1601,7 @@ class TensorNameMap: "model.vision_tower.encoder.layer.{bid}.mlp.fc1", # Intern-S1 "vpm.encoder.layers.{bid}.mlp.fc1", "model.vision_model.encoder.layers.{bid}.mlp.fc1", # SmolVLM, gemma3 - "vit.layers.{bid}.mlp.dense_h_to_4h", # HunyuanOCR + "vit.layers.{bid}.mlp.dense_h_to_4h", # HunyuanVL "vision_tower.transformer.layers.{bid}.feed_forward.up_proj", # pixtral-hf "vision_encoder.transformer.layers.{bid}.feed_forward.w3", # pixtral "vision_model.model.layers.{bid}.mlp.fc1", # llama4 @@ -1630,7 +1630,7 @@ class TensorNameMap: "model.vision_tower.encoder.layer.{bid}.mlp.fc2", # Intern-S1 "vpm.encoder.layers.{bid}.mlp.fc2", "model.vision_model.encoder.layers.{bid}.mlp.fc2", # SmolVLM, gemma3 - "vit.layers.{bid}.mlp.dense_4h_to_h", # HunyuanOCR + "vit.layers.{bid}.mlp.dense_4h_to_h", # HunyuanVL "vision_tower.transformer.layers.{bid}.feed_forward.down_proj", # pixtral-hf "vision_encoder.transformer.layers.{bid}.feed_forward.w2", # pixtral "vision_model.model.layers.{bid}.mlp.fc2", # llama4 @@ -1694,7 +1694,7 @@ class TensorNameMap: MODEL_TENSOR.V_MM_POST_NORM: ( "visual.merger.post_projection_norm", # glm4v "vision_tower.post_trunk_norm", # dots.ocr - "vit.perceive.after_rms", # HunyuanOCR + "vit.perceive.after_rms", # HunyuanVL ), MODEL_TENSOR.V_MM_INP_PROJ: ( @@ -1899,15 +1899,15 @@ class TensorNameMap: ), MODEL_TENSOR.V_MM_PRE_NORM: ( - "vit.perceive.before_rms", # HunyuanOCR + "vit.perceive.before_rms", # HunyuanVL ), MODEL_TENSOR.V_TOK_IMG_BEGIN: ( - "vit.perceive.image_begin", # HunyuanOCR + "vit.perceive.image_begin", # HunyuanVL ), MODEL_TENSOR.V_TOK_IMG_END: ( - "vit.perceive.image_end", # HunyuanOCR + "vit.perceive.image_end", # HunyuanVL ), MODEL_TENSOR.V_STD_BIAS: ( diff --git a/src/llama-chat.cpp b/src/llama-chat.cpp index 6554a89b28a..f10397747b0 100644 --- a/src/llama-chat.cpp +++ b/src/llama-chat.cpp @@ -73,7 +73,7 @@ static const std::map LLM_CHAT_TEMPLATES = { { "hunyuan-moe", LLM_CHAT_TEMPLATE_HUNYUAN_MOE }, { "gpt-oss", LLM_CHAT_TEMPLATE_OPENAI_MOE }, { "hunyuan-dense", LLM_CHAT_TEMPLATE_HUNYUAN_DENSE }, - { "hunyuan-ocr", LLM_CHAT_TEMPLATE_HUNYUAN_OCR }, + { "hunyuan-vl", LLM_CHAT_TEMPLATE_HUNYUAN_VL }, { "kimi-k2", LLM_CHAT_TEMPLATE_KIMI_K2 }, { "seed_oss", LLM_CHAT_TEMPLATE_SEED_OSS }, { "grok-2", LLM_CHAT_TEMPLATE_GROK_2 }, @@ -218,7 +218,7 @@ llm_chat_template llm_chat_detect_template(const std::string & tmpl) { } else if (tmpl_contains("<|start|>") && tmpl_contains("<|channel|>")) { return LLM_CHAT_TEMPLATE_OPENAI_MOE; } else if (tmpl_contains("<|hy_Assistant|>") && tmpl_contains("<|hy_begin▁of▁sentence|>")) { - return LLM_CHAT_TEMPLATE_HUNYUAN_OCR; + return LLM_CHAT_TEMPLATE_HUNYUAN_VL; } else if (tmpl_contains("<|hy_Assistant|>") && tmpl_contains("<|hy_place▁holder▁no▁3|>")) { return LLM_CHAT_TEMPLATE_HUNYUAN_DENSE; } else if (tmpl_contains("<|im_assistant|>assistant<|im_middle|>")) { @@ -825,8 +825,8 @@ int32_t llm_chat_apply_template( ss << "<|hy_User|>" << chat[i]->content << "<|hy_Assistant|>"; } } - } else if (tmpl == LLM_CHAT_TEMPLATE_HUNYUAN_OCR) { - // tencent/HunyuanOCR + } else if (tmpl == LLM_CHAT_TEMPLATE_HUNYUAN_VL) { + // tencent/HunyuanOCR & tencent/HunyuanVL ss << "<|hy_begin▁of▁sentence|>"; for (size_t i = 0; i < chat.size(); i++) { std::string role(chat[i]->role); diff --git a/src/llama-chat.h b/src/llama-chat.h index 13f936a946c..ea6540c0be7 100644 --- a/src/llama-chat.h +++ b/src/llama-chat.h @@ -53,7 +53,7 @@ enum llm_chat_template { LLM_CHAT_TEMPLATE_HUNYUAN_MOE, LLM_CHAT_TEMPLATE_OPENAI_MOE, LLM_CHAT_TEMPLATE_HUNYUAN_DENSE, - LLM_CHAT_TEMPLATE_HUNYUAN_OCR, + LLM_CHAT_TEMPLATE_HUNYUAN_VL, LLM_CHAT_TEMPLATE_KIMI_K2, LLM_CHAT_TEMPLATE_SEED_OSS, LLM_CHAT_TEMPLATE_GROK_2, diff --git a/tools/cli/README.md b/tools/cli/README.md index c40b5a21cc0..629e641914b 100644 --- a/tools/cli/README.md +++ b/tools/cli/README.md @@ -172,8 +172,8 @@ | `-rea, --reasoning [on\|off\|auto]` | Use reasoning/thinking in the chat ('on', 'off', or 'auto', default: 'auto' (detect from template))
(env: LLAMA_ARG_REASONING) | | `--reasoning-budget N` | token budget for thinking: -1 for unrestricted, 0 for immediate end, N>0 for token budget (default: -1)
(env: LLAMA_ARG_THINK_BUDGET) | | `--reasoning-budget-message MESSAGE` | message injected before the end-of-thinking tag when reasoning budget is exhausted (default: none)
(env: LLAMA_ARG_THINK_BUDGET_MESSAGE) | -| `--chat-template JINJA_TEMPLATE` | set custom jinja chat template (default: template taken from model's metadata)
if suffix/prefix are specified, template will be disabled
only commonly used templates are accepted (unless --jinja is set before this flag):
list of built-in templates:
bailing, bailing-think, bailing2, chatglm3, chatglm4, chatml, command-r, deepseek, deepseek-ocr, deepseek2, deepseek3, exaone-moe, exaone3, exaone4, falcon3, gemma, gigachat, glmedge, gpt-oss, granite, granite-4.0, grok-2, hunyuan-dense, hunyuan-moe, hunyuan-ocr, kimi-k2, llama2, llama2-sys, llama2-sys-bos, llama2-sys-strip, llama3, llama4, megrez, minicpm, mistral-v1, mistral-v3, mistral-v3-tekken, mistral-v7, mistral-v7-tekken, monarch, openchat, orion, pangu-embedded, phi3, phi4, rwkv-world, seed_oss, smolvlm, solar-open, vicuna, vicuna-orca, yandex, zephyr
(env: LLAMA_ARG_CHAT_TEMPLATE) | -| `--chat-template-file JINJA_TEMPLATE_FILE` | set custom jinja chat template file (default: template taken from model's metadata)
if suffix/prefix are specified, template will be disabled
only commonly used templates are accepted (unless --jinja is set before this flag):
list of built-in templates:
bailing, bailing-think, bailing2, chatglm3, chatglm4, chatml, command-r, deepseek, deepseek-ocr, deepseek2, deepseek3, exaone-moe, exaone3, exaone4, falcon3, gemma, gigachat, glmedge, gpt-oss, granite, granite-4.0, grok-2, hunyuan-dense, hunyuan-moe, hunyuan-ocr, kimi-k2, llama2, llama2-sys, llama2-sys-bos, llama2-sys-strip, llama3, llama4, megrez, minicpm, mistral-v1, mistral-v3, mistral-v3-tekken, mistral-v7, mistral-v7-tekken, monarch, openchat, orion, pangu-embedded, phi3, phi4, rwkv-world, seed_oss, smolvlm, solar-open, vicuna, vicuna-orca, yandex, zephyr
(env: LLAMA_ARG_CHAT_TEMPLATE_FILE) | +| `--chat-template JINJA_TEMPLATE` | set custom jinja chat template (default: template taken from model's metadata)
if suffix/prefix are specified, template will be disabled
only commonly used templates are accepted (unless --jinja is set before this flag):
list of built-in templates:
bailing, bailing-think, bailing2, chatglm3, chatglm4, chatml, command-r, deepseek, deepseek-ocr, deepseek2, deepseek3, exaone-moe, exaone3, exaone4, falcon3, gemma, gigachat, glmedge, gpt-oss, granite, granite-4.0, grok-2, hunyuan-dense, hunyuan-moe, hunyuan-vl, kimi-k2, llama2, llama2-sys, llama2-sys-bos, llama2-sys-strip, llama3, llama4, megrez, minicpm, mistral-v1, mistral-v3, mistral-v3-tekken, mistral-v7, mistral-v7-tekken, monarch, openchat, orion, pangu-embedded, phi3, phi4, rwkv-world, seed_oss, smolvlm, solar-open, vicuna, vicuna-orca, yandex, zephyr
(env: LLAMA_ARG_CHAT_TEMPLATE) | +| `--chat-template-file JINJA_TEMPLATE_FILE` | set custom jinja chat template file (default: template taken from model's metadata)
if suffix/prefix are specified, template will be disabled
only commonly used templates are accepted (unless --jinja is set before this flag):
list of built-in templates:
bailing, bailing-think, bailing2, chatglm3, chatglm4, chatml, command-r, deepseek, deepseek-ocr, deepseek2, deepseek3, exaone-moe, exaone3, exaone4, falcon3, gemma, gigachat, glmedge, gpt-oss, granite, granite-4.0, grok-2, hunyuan-dense, hunyuan-moe, hunyuan-vl, kimi-k2, llama2, llama2-sys, llama2-sys-bos, llama2-sys-strip, llama3, llama4, megrez, minicpm, mistral-v1, mistral-v3, mistral-v3-tekken, mistral-v7, mistral-v7-tekken, monarch, openchat, orion, pangu-embedded, phi3, phi4, rwkv-world, seed_oss, smolvlm, solar-open, vicuna, vicuna-orca, yandex, zephyr
(env: LLAMA_ARG_CHAT_TEMPLATE_FILE) | | `--skip-chat-parsing, --no-skip-chat-parsing` | force a pure content parser, even if a Jinja template is specified; model will output everything in the content section, including any reasoning and/or tool calls (default: disabled)
(env: LLAMA_ARG_SKIP_CHAT_PARSING) | | `--simple-io` | use basic IO for better compatibility in subprocesses and limited consoles | | `--spec-draft-hf, -hfd, -hfrd, --hf-repo-draft /[:quant]` | Same as --hf-repo, but for the draft model (default: unused)
(env: LLAMA_ARG_SPEC_DRAFT_HF_REPO) | diff --git a/tools/completion/README.md b/tools/completion/README.md index e5dd7f6f4e7..edb5da8068d 100644 --- a/tools/completion/README.md +++ b/tools/completion/README.md @@ -254,8 +254,8 @@ llama-completion.exe -m models\gemma-1.1-7b-it.Q4_K_M.gguf --ignore-eos -n -1 | `-rea, --reasoning [on\|off\|auto]` | Use reasoning/thinking in the chat ('on', 'off', or 'auto', default: 'auto' (detect from template))
(env: LLAMA_ARG_REASONING) | | `--reasoning-budget N` | token budget for thinking: -1 for unrestricted, 0 for immediate end, N>0 for token budget (default: -1)
(env: LLAMA_ARG_THINK_BUDGET) | | `--reasoning-budget-message MESSAGE` | message injected before the end-of-thinking tag when reasoning budget is exhausted (default: none)
(env: LLAMA_ARG_THINK_BUDGET_MESSAGE) | -| `--chat-template JINJA_TEMPLATE` | set custom jinja chat template (default: template taken from model's metadata)
if suffix/prefix are specified, template will be disabled
only commonly used templates are accepted (unless --jinja is set before this flag):
list of built-in templates:
bailing, bailing-think, bailing2, chatglm3, chatglm4, chatml, command-r, deepseek, deepseek-ocr, deepseek2, deepseek3, exaone-moe, exaone3, exaone4, falcon3, gemma, gigachat, glmedge, gpt-oss, granite, granite-4.0, grok-2, hunyuan-dense, hunyuan-moe, hunyuan-ocr, kimi-k2, llama2, llama2-sys, llama2-sys-bos, llama2-sys-strip, llama3, llama4, megrez, minicpm, mistral-v1, mistral-v3, mistral-v3-tekken, mistral-v7, mistral-v7-tekken, monarch, openchat, orion, pangu-embedded, phi3, phi4, rwkv-world, seed_oss, smolvlm, solar-open, vicuna, vicuna-orca, yandex, zephyr
(env: LLAMA_ARG_CHAT_TEMPLATE) | -| `--chat-template-file JINJA_TEMPLATE_FILE` | set custom jinja chat template file (default: template taken from model's metadata)
if suffix/prefix are specified, template will be disabled
only commonly used templates are accepted (unless --jinja is set before this flag):
list of built-in templates:
bailing, bailing-think, bailing2, chatglm3, chatglm4, chatml, command-r, deepseek, deepseek-ocr, deepseek2, deepseek3, exaone-moe, exaone3, exaone4, falcon3, gemma, gigachat, glmedge, gpt-oss, granite, granite-4.0, grok-2, hunyuan-dense, hunyuan-moe, hunyuan-ocr, kimi-k2, llama2, llama2-sys, llama2-sys-bos, llama2-sys-strip, llama3, llama4, megrez, minicpm, mistral-v1, mistral-v3, mistral-v3-tekken, mistral-v7, mistral-v7-tekken, monarch, openchat, orion, pangu-embedded, phi3, phi4, rwkv-world, seed_oss, smolvlm, solar-open, vicuna, vicuna-orca, yandex, zephyr
(env: LLAMA_ARG_CHAT_TEMPLATE_FILE) | +| `--chat-template JINJA_TEMPLATE` | set custom jinja chat template (default: template taken from model's metadata)
if suffix/prefix are specified, template will be disabled
only commonly used templates are accepted (unless --jinja is set before this flag):
list of built-in templates:
bailing, bailing-think, bailing2, chatglm3, chatglm4, chatml, command-r, deepseek, deepseek-ocr, deepseek2, deepseek3, exaone-moe, exaone3, exaone4, falcon3, gemma, gigachat, glmedge, gpt-oss, granite, granite-4.0, grok-2, hunyuan-dense, hunyuan-moe, hunyuan-vl, kimi-k2, llama2, llama2-sys, llama2-sys-bos, llama2-sys-strip, llama3, llama4, megrez, minicpm, mistral-v1, mistral-v3, mistral-v3-tekken, mistral-v7, mistral-v7-tekken, monarch, openchat, orion, pangu-embedded, phi3, phi4, rwkv-world, seed_oss, smolvlm, solar-open, vicuna, vicuna-orca, yandex, zephyr
(env: LLAMA_ARG_CHAT_TEMPLATE) | +| `--chat-template-file JINJA_TEMPLATE_FILE` | set custom jinja chat template file (default: template taken from model's metadata)
if suffix/prefix are specified, template will be disabled
only commonly used templates are accepted (unless --jinja is set before this flag):
list of built-in templates:
bailing, bailing-think, bailing2, chatglm3, chatglm4, chatml, command-r, deepseek, deepseek-ocr, deepseek2, deepseek3, exaone-moe, exaone3, exaone4, falcon3, gemma, gigachat, glmedge, gpt-oss, granite, granite-4.0, grok-2, hunyuan-dense, hunyuan-moe, hunyuan-vl, kimi-k2, llama2, llama2-sys, llama2-sys-bos, llama2-sys-strip, llama3, llama4, megrez, minicpm, mistral-v1, mistral-v3, mistral-v3-tekken, mistral-v7, mistral-v7-tekken, monarch, openchat, orion, pangu-embedded, phi3, phi4, rwkv-world, seed_oss, smolvlm, solar-open, vicuna, vicuna-orca, yandex, zephyr
(env: LLAMA_ARG_CHAT_TEMPLATE_FILE) | | `--skip-chat-parsing, --no-skip-chat-parsing` | force a pure content parser, even if a Jinja template is specified; model will output everything in the content section, including any reasoning and/or tool calls (default: disabled)
(env: LLAMA_ARG_SKIP_CHAT_PARSING) | | `--simple-io` | use basic IO for better compatibility in subprocesses and limited consoles | diff --git a/tools/mtmd/CMakeLists.txt b/tools/mtmd/CMakeLists.txt index a76adc9b80b..ffd30c7e6a1 100644 --- a/tools/mtmd/CMakeLists.txt +++ b/tools/mtmd/CMakeLists.txt @@ -22,7 +22,7 @@ add_library(mtmd models/gemma4v.cpp models/glm4v.cpp models/granite-speech.cpp - models/hunyuanocr.cpp + models/hunyuanvl.cpp models/internvl.cpp models/kimivl.cpp models/kimik25.cpp diff --git a/tools/mtmd/clip-impl.h b/tools/mtmd/clip-impl.h index c359851999f..ef4c342ba86 100644 --- a/tools/mtmd/clip-impl.h +++ b/tools/mtmd/clip-impl.h @@ -170,7 +170,7 @@ #define TN_TOK_BOI "v.boi" #define TN_TOK_EOI "v.eoi" -// hunyuanocr / hunyuanvl (shared GGUF tensor names) +// hunyuanvl (shared GGUF tensor names) #define TN_MM_PRE_NORM "mm.pre_norm.%s" #define TN_TOK_IMG_BEGIN "mm.image_begin" #define TN_TOK_IMG_END "mm.image_end" @@ -343,7 +343,6 @@ enum projector_type { PROJECTOR_TYPE_YASA2, PROJECTOR_TYPE_KIMIK25, PROJECTOR_TYPE_NEMOTRON_V2_VL, - PROJECTOR_TYPE_HUNYUANOCR, PROJECTOR_TYPE_HUNYUANVL, PROJECTOR_TYPE_MINICPMV4_6, PROJECTOR_TYPE_GRANITE_SPEECH, @@ -393,7 +392,6 @@ static std::map PROJECTOR_TYPE_NAMES = { { PROJECTOR_TYPE_YASA2, "yasa2"}, { PROJECTOR_TYPE_KIMIK25, "kimik25"}, { PROJECTOR_TYPE_NEMOTRON_V2_VL, "nemotron_v2_vl"}, - { PROJECTOR_TYPE_HUNYUANOCR, "hunyuanocr"}, { PROJECTOR_TYPE_HUNYUANVL, "hunyuanvl"}, { PROJECTOR_TYPE_MINICPMV4_6, "minicpmv4_6"}, { PROJECTOR_TYPE_GRANITE_SPEECH, "granite_speech"}, diff --git a/tools/mtmd/clip-model.h b/tools/mtmd/clip-model.h index ce15dbcd11e..da4563dbe80 100644 --- a/tools/mtmd/clip-model.h +++ b/tools/mtmd/clip-model.h @@ -510,7 +510,7 @@ struct clip_model { ggml_tensor * mm_boi = nullptr; ggml_tensor * mm_eoi = nullptr; - // hunyuanocr perceiver + // hunyuanvl perceiver ggml_tensor * mm_pre_norm_w = nullptr; ggml_tensor * mm_img_begin = nullptr; ggml_tensor * mm_img_end = nullptr; diff --git a/tools/mtmd/clip.cpp b/tools/mtmd/clip.cpp index 9727a738ed8..868c728d1f8 100644 --- a/tools/mtmd/clip.cpp +++ b/tools/mtmd/clip.cpp @@ -930,10 +930,9 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32 { builder = std::make_unique(ctx, img); } break; - case PROJECTOR_TYPE_HUNYUANOCR: case PROJECTOR_TYPE_HUNYUANVL: { - builder = std::make_unique(ctx, img); + builder = std::make_unique(ctx, img); } break; case PROJECTOR_TYPE_MLP: case PROJECTOR_TYPE_MLP_NORM: @@ -1519,22 +1518,16 @@ struct clip_model_loader { get_u32(KEY_SAM_N_EMBD, hparams.sam_n_embd, true); get_u32(KEY_ATTN_WINDOW_SIZE, hparams.attn_window_size, true); } break; - case PROJECTOR_TYPE_HUNYUANOCR: - { - hparams.n_merge = 2; - get_u32(KEY_SPATIAL_MERGE_SIZE, hparams.n_merge, false); - get_u32(KEY_IMAGE_MIN_PIXELS, hparams.image_min_pixels); - get_u32(KEY_IMAGE_MAX_PIXELS, hparams.image_max_pixels); - hparams.set_warmup_n_tokens(28*28); - } break; case PROJECTOR_TYPE_HUNYUANVL: { hparams.n_merge = 2; hparams.image_resize_algo = RESIZE_ALGO_BICUBIC_PILLOW; hparams.image_resize_pad = false; hparams.ffn_op = FFN_GELU; - get_u32(KEY_SPATIAL_MERGE_SIZE, hparams.n_merge, false); hparams.set_limit_image_tokens(256, 16384); + get_u32(KEY_SPATIAL_MERGE_SIZE, hparams.n_merge, false); + get_u32(KEY_IMAGE_MIN_PIXELS, hparams.image_min_pixels, false); + get_u32(KEY_IMAGE_MAX_PIXELS, hparams.image_max_pixels, false); hparams.set_warmup_n_tokens(32*32); } break; case PROJECTOR_TYPE_LFM2A: @@ -2337,7 +2330,6 @@ struct clip_model_loader { model.mm_boi = get_tensor(TN_TOK_BOI); model.mm_eoi = get_tensor(TN_TOK_EOI); } break; - case PROJECTOR_TYPE_HUNYUANOCR: case PROJECTOR_TYPE_HUNYUANVL: { // proj.0 -> mm.0 (conv1), proj.2 -> mm.2 (conv2), mlp -> mm.model.fc (linear) @@ -3062,7 +3054,6 @@ int clip_n_output_tokens_x(const struct clip_ctx * ctx, struct clip_image_f32 * case PROJECTOR_TYPE_MIMOVL: case PROJECTOR_TYPE_GLM4V: case PROJECTOR_TYPE_PADDLEOCR: - case PROJECTOR_TYPE_HUNYUANOCR: case PROJECTOR_TYPE_HUNYUANVL: case PROJECTOR_TYPE_YOUTUVL: return (img->nx / params.patch_size) / 2; @@ -3279,7 +3270,6 @@ int clip_n_output_tokens(const struct clip_ctx * ctx, struct clip_image_f32 * im int h = static_cast(std::sqrt(static_cast(n_patches))); n_patches = h * (h + 1) + 1; } break; - case PROJECTOR_TYPE_HUNYUANOCR: case PROJECTOR_TYPE_HUNYUANVL: { int merge = ctx->model.hparams.n_merge; @@ -3915,7 +3905,6 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima case PROJECTOR_TYPE_JANUS_PRO: case PROJECTOR_TYPE_PHI4: case PROJECTOR_TYPE_COGVLM: - case PROJECTOR_TYPE_HUNYUANOCR: case PROJECTOR_TYPE_YASA2: { // do nothing @@ -3925,7 +3914,7 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima // Compute the HunyuanVL 2D position embedding on CPU (with the // custom sf=(target+0.1)/n_grid bilinear sampling that the // reference implementation uses) and upload it to the graph - // input declared in clip_graph_hunyuanocr::build(). + // input declared in clip_graph_hunyuanvl::build(). GGML_ASSERT(model.position_embeddings != nullptr); ggml_tensor * src_t = model.position_embeddings; const int64_t n_embd = src_t->ne[0]; @@ -4246,7 +4235,6 @@ int clip_n_mmproj_embd(const struct clip_ctx * ctx) { case PROJECTOR_TYPE_KIMIK25: case PROJECTOR_TYPE_YASA2: return ctx->model.mm_2_w->ne[1]; - case PROJECTOR_TYPE_HUNYUANOCR: case PROJECTOR_TYPE_HUNYUANVL: return ctx->model.mm_model_proj->ne[1]; case PROJECTOR_TYPE_COGVLM: diff --git a/tools/mtmd/models/hunyuanocr.cpp b/tools/mtmd/models/hunyuanvl.cpp similarity index 70% rename from tools/mtmd/models/hunyuanocr.cpp rename to tools/mtmd/models/hunyuanvl.cpp index 45ed684f70d..2c670979d76 100644 --- a/tools/mtmd/models/hunyuanocr.cpp +++ b/tools/mtmd/models/hunyuanvl.cpp @@ -1,25 +1,15 @@ #include "models.h" -ggml_cgraph * clip_graph_hunyuanocr::build() { +ggml_cgraph * clip_graph_hunyuanvl::build() { const int merge = hparams.n_merge; const int pw = n_patches_x; const int ph = n_patches_y; - // Position embedding interpolation. - // HunyuanVL needs scale factors sf=(target+0.1)/n_grid, which the standard - // ggml_interpolate cannot express. To avoid adding a new ggml op, the - // resize is computed on CPU in clip_image_batch_encode and uploaded here - // as a graph input (named "hunyuanvl_pos_embd"). - // HunyuanOCR uses the same square layout and the standard ratio-based - // interpolation provided by resize_position_embeddings(). - ggml_tensor * pos_embd = nullptr; - if (proj_type == PROJECTOR_TYPE_HUNYUANVL && model.position_embeddings) { - pos_embd = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_embd, ph * pw); - ggml_set_name(pos_embd, "hunyuanvl_pos_embd"); - ggml_set_input(pos_embd); - } else { - pos_embd = resize_position_embeddings(GGML_SCALE_MODE_BILINEAR); - } + // position embedding: declared as a graph input, filled on CPU + // by clip_image_batch_encode (see PROJECTOR_TYPE_HUNYUANVL branch there). + ggml_tensor * pos_embd = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_embd, ph * pw); + ggml_set_name(pos_embd, "hunyuanvl_pos_embd"); + ggml_set_input(pos_embd); ggml_tensor * inp = build_inp(); ggml_tensor * cur = build_vit(inp, n_patches, NORM_TYPE_NORMAL, hparams.ffn_op, pos_embd, nullptr); diff --git a/tools/mtmd/models/models.h b/tools/mtmd/models/models.h index 955daa6d6d3..3cb824966cc 100644 --- a/tools/mtmd/models/models.h +++ b/tools/mtmd/models/models.h @@ -141,8 +141,8 @@ struct clip_graph_glm4v : clip_graph { ggml_cgraph * build() override; }; -struct clip_graph_hunyuanocr : clip_graph { - clip_graph_hunyuanocr(clip_ctx * ctx, const clip_image_f32 & img) : clip_graph(ctx, img) {} +struct clip_graph_hunyuanvl : clip_graph { + clip_graph_hunyuanvl(clip_ctx * ctx, const clip_image_f32 & img) : clip_graph(ctx, img) {} ggml_cgraph * build() override; }; diff --git a/tools/mtmd/mtmd.cpp b/tools/mtmd/mtmd.cpp index 8f12d0b43ea..ca713e59f0a 100644 --- a/tools/mtmd/mtmd.cpp +++ b/tools/mtmd/mtmd.cpp @@ -488,7 +488,6 @@ struct mtmd_context { img_end = "\n"; // prevent empty batch on llama-server image_preproc = std::make_unique(ctx_v); } break; - case PROJECTOR_TYPE_HUNYUANOCR: case PROJECTOR_TYPE_HUNYUANVL: { // note: these use fullwidth | (U+FF5C) and ▁ (U+2581) to match the tokenizer vocabulary diff --git a/tools/server/README.md b/tools/server/README.md index 11098af2883..fac8334a6c9 100644 --- a/tools/server/README.md +++ b/tools/server/README.md @@ -222,8 +222,8 @@ For the full list of features, please refer to [server's changelog](https://gith | `-rea, --reasoning [on\|off\|auto]` | Use reasoning/thinking in the chat ('on', 'off', or 'auto', default: 'auto' (detect from template))
(env: LLAMA_ARG_REASONING) | | `--reasoning-budget N` | token budget for thinking: -1 for unrestricted, 0 for immediate end, N>0 for token budget (default: -1)
(env: LLAMA_ARG_THINK_BUDGET) | | `--reasoning-budget-message MESSAGE` | message injected before the end-of-thinking tag when reasoning budget is exhausted (default: none)
(env: LLAMA_ARG_THINK_BUDGET_MESSAGE) | -| `--chat-template JINJA_TEMPLATE` | set custom jinja chat template (default: template taken from model's metadata)
if suffix/prefix are specified, template will be disabled
only commonly used templates are accepted (unless --jinja is set before this flag):
list of built-in templates:
bailing, bailing-think, bailing2, chatglm3, chatglm4, chatml, command-r, deepseek, deepseek-ocr, deepseek2, deepseek3, exaone-moe, exaone3, exaone4, falcon3, gemma, gigachat, glmedge, gpt-oss, granite, granite-4.0, grok-2, hunyuan-dense, hunyuan-moe, hunyuan-ocr, kimi-k2, llama2, llama2-sys, llama2-sys-bos, llama2-sys-strip, llama3, llama4, megrez, minicpm, mistral-v1, mistral-v3, mistral-v3-tekken, mistral-v7, mistral-v7-tekken, monarch, openchat, orion, pangu-embedded, phi3, phi4, rwkv-world, seed_oss, smolvlm, solar-open, vicuna, vicuna-orca, yandex, zephyr
(env: LLAMA_ARG_CHAT_TEMPLATE) | -| `--chat-template-file JINJA_TEMPLATE_FILE` | set custom jinja chat template file (default: template taken from model's metadata)
if suffix/prefix are specified, template will be disabled
only commonly used templates are accepted (unless --jinja is set before this flag):
list of built-in templates:
bailing, bailing-think, bailing2, chatglm3, chatglm4, chatml, command-r, deepseek, deepseek-ocr, deepseek2, deepseek3, exaone-moe, exaone3, exaone4, falcon3, gemma, gigachat, glmedge, gpt-oss, granite, granite-4.0, grok-2, hunyuan-dense, hunyuan-moe, hunyuan-ocr, kimi-k2, llama2, llama2-sys, llama2-sys-bos, llama2-sys-strip, llama3, llama4, megrez, minicpm, mistral-v1, mistral-v3, mistral-v3-tekken, mistral-v7, mistral-v7-tekken, monarch, openchat, orion, pangu-embedded, phi3, phi4, rwkv-world, seed_oss, smolvlm, solar-open, vicuna, vicuna-orca, yandex, zephyr
(env: LLAMA_ARG_CHAT_TEMPLATE_FILE) | +| `--chat-template JINJA_TEMPLATE` | set custom jinja chat template (default: template taken from model's metadata)
if suffix/prefix are specified, template will be disabled
only commonly used templates are accepted (unless --jinja is set before this flag):
list of built-in templates:
bailing, bailing-think, bailing2, chatglm3, chatglm4, chatml, command-r, deepseek, deepseek-ocr, deepseek2, deepseek3, exaone-moe, exaone3, exaone4, falcon3, gemma, gigachat, glmedge, gpt-oss, granite, granite-4.0, grok-2, hunyuan-dense, hunyuan-moe, hunyuan-vl, kimi-k2, llama2, llama2-sys, llama2-sys-bos, llama2-sys-strip, llama3, llama4, megrez, minicpm, mistral-v1, mistral-v3, mistral-v3-tekken, mistral-v7, mistral-v7-tekken, monarch, openchat, orion, pangu-embedded, phi3, phi4, rwkv-world, seed_oss, smolvlm, solar-open, vicuna, vicuna-orca, yandex, zephyr
(env: LLAMA_ARG_CHAT_TEMPLATE) | +| `--chat-template-file JINJA_TEMPLATE_FILE` | set custom jinja chat template file (default: template taken from model's metadata)
if suffix/prefix are specified, template will be disabled
only commonly used templates are accepted (unless --jinja is set before this flag):
list of built-in templates:
bailing, bailing-think, bailing2, chatglm3, chatglm4, chatml, command-r, deepseek, deepseek-ocr, deepseek2, deepseek3, exaone-moe, exaone3, exaone4, falcon3, gemma, gigachat, glmedge, gpt-oss, granite, granite-4.0, grok-2, hunyuan-dense, hunyuan-moe, hunyuan-vl, kimi-k2, llama2, llama2-sys, llama2-sys-bos, llama2-sys-strip, llama3, llama4, megrez, minicpm, mistral-v1, mistral-v3, mistral-v3-tekken, mistral-v7, mistral-v7-tekken, monarch, openchat, orion, pangu-embedded, phi3, phi4, rwkv-world, seed_oss, smolvlm, solar-open, vicuna, vicuna-orca, yandex, zephyr
(env: LLAMA_ARG_CHAT_TEMPLATE_FILE) | | `--skip-chat-parsing, --no-skip-chat-parsing` | force a pure content parser, even if a Jinja template is specified; model will output everything in the content section, including any reasoning and/or tool calls (default: disabled)
(env: LLAMA_ARG_SKIP_CHAT_PARSING) | | `--prefill-assistant, --no-prefill-assistant` | whether to prefill the assistant's response if the last message is an assistant message (default: prefill enabled)
when this flag is set, if the last message is an assistant message then it will be treated as a full message and not prefilled

(env: LLAMA_ARG_PREFILL_ASSISTANT) | | `-sps, --slot-prompt-similarity SIMILARITY` | how much the prompt of a request must match the prompt of a slot in order to use that slot (default: 0.10, 0.0 = disabled) |