From 951fa5ca65c71690cb351da42f0d0e8e92db9eaa Mon Sep 17 00:00:00 2001 From: Xuan Son Nguyen Date: Wed, 3 Jun 2026 16:34:08 +0200 Subject: [PATCH 1/2] add model --- conversion/__init__.py | 2 + conversion/gemma.py | 79 +++++++++++++++++++++++++++++++++- gguf-py/gguf/constants.py | 6 +++ gguf-py/gguf/gguf_writer.py | 3 ++ gguf-py/gguf/tensor_mapping.py | 6 +++ src/llama-arch.cpp | 1 + src/llama-arch.h | 1 + src/llama-vocab.cpp | 16 +++++++ src/llama-vocab.h | 2 + src/models/gemma4.cpp | 35 +++++++++++++++ tools/mtmd/CMakeLists.txt | 2 + tools/mtmd/clip-impl.h | 5 +++ tools/mtmd/clip-model.h | 8 ++++ tools/mtmd/clip.cpp | 48 ++++++++++++++++++++- tools/mtmd/models/gemma4ua.cpp | 19 ++++++++ tools/mtmd/models/gemma4uv.cpp | 71 ++++++++++++++++++++++++++++++ tools/mtmd/models/models.h | 10 +++++ tools/mtmd/mtmd-audio.cpp | 38 ++++++++++++++++ tools/mtmd/mtmd-audio.h | 6 +++ tools/mtmd/mtmd.cpp | 7 +++ 20 files changed, 362 insertions(+), 3 deletions(-) create mode 100644 tools/mtmd/models/gemma4ua.cpp create mode 100644 tools/mtmd/models/gemma4uv.cpp diff --git a/conversion/__init__.py b/conversion/__init__.py index 8415c65f9432..2c79580f8a36 100644 --- a/conversion/__init__.py +++ b/conversion/__init__.py @@ -77,6 +77,7 @@ "Gemma3nForConditionalGeneration": "gemma", "Gemma4ForConditionalGeneration": "gemma", "Gemma4ForCausalLM": "gemma", + "Gemma4UnifiedForConditionalGeneration": "gemma", "GemmaForCausalLM": "gemma", "Glm4ForCausalLM": "glm", "Glm4MoeForCausalLM": "glm", @@ -247,6 +248,7 @@ "Gemma3ForConditionalGeneration": "gemma", "Gemma3nForConditionalGeneration": "gemma", "Gemma4ForConditionalGeneration": "gemma", + "Gemma4UnifiedForConditionalGeneration": "gemma", "Glm4vForConditionalGeneration": "qwen3vl", "Glm4vMoeForConditionalGeneration": "qwen3vl", "GlmOcrForConditionalGeneration": "qwen3vl", diff --git a/conversion/gemma.py b/conversion/gemma.py index 76beedcf0d34..538498b939ca 100644 --- a/conversion/gemma.py +++ b/conversion/gemma.py @@ -3,7 +3,7 @@ 
import json import re -from typing import Callable, Iterable, TYPE_CHECKING +from typing import Callable, Iterable, TYPE_CHECKING, Sequence import torch @@ -765,6 +765,26 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter yield from super().modify_tensors(data_torch, name, bid) +@ModelBase.register("Gemma4UnifiedForConditionalGeneration") +class Gemma4UnifiedModel(Gemma4Model): + model_arch = gguf.MODEL_ARCH.GEMMA4 + + def _get_suppress_tokens(self) -> Sequence[int] | None: + gen_cfg_path = self.dir_model / "generation_config.json" + if gen_cfg_path.is_file(): + with open(gen_cfg_path, encoding="utf-8") as f: + gen_cfg = json.load(f) + return gen_cfg.get("suppress_tokens") + return None + + def set_gguf_parameters(self): + super().set_gguf_parameters() + + suppress_tokens = self._get_suppress_tokens() + if suppress_tokens is not None: + self.gguf_writer.add_suppress_tokens(suppress_tokens) + + @ModelBase.register("Gemma4ForConditionalGeneration") class Gemma4VisionAudioModel(MmprojModel): has_audio_encoder = True @@ -839,3 +859,60 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter data_torch = data_torch.permute(0, 3, 1, 2).contiguous() mapped_name = self.map_tensor_name(name, (".weight", ".bias", ".input_max", ".input_min", ".output_max", ".output_min")) yield (mapped_name, data_torch) + +@ModelBase.register("Gemma4UnifiedForConditionalGeneration") +class Gemma4UnifiedVisionAudioModel(Gemma4VisionAudioModel): + has_audio_encoder = True + has_vision_encoder = True + + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + assert self.hparams_vision is not None + assert self.hparams_audio is not None + text_embd_dim = self.hparams_vision["mm_embed_dim"] + self.hparams_vision["hidden_size"] = text_embd_dim + self.hparams_audio["hidden_size"] = text_embd_dim + # this is a transformer-less vision tower, the params below are redundant but set to avoid error + 
self.hparams_vision["intermediate_size"] = 0 + self.hparams_vision["num_layers"] = 0 + self.hparams_vision["num_attention_heads"] = 0 + self.hparams_audio["intermediate_size"] = 0 + self.hparams_audio["num_layers"] = 0 + self.hparams_audio["num_attention_heads"] = 0 + + def set_gguf_parameters(self): + super().set_gguf_parameters() + self.gguf_writer.add_clip_vision_projector_type(gguf.VisionProjectorType.GEMMA4UV) + self.gguf_writer.add_clip_audio_projector_type(gguf.VisionProjectorType.GEMMA4UA) + + def modify_tensors(self, data_torch, name, bid): + if name.endswith("pos_embedding"): + name += ".weight" + data_torch = data_torch.permute(1, 0, 2) + elif ".pos_norm." in name: + # rename to patch_ln3 to reuse the tensor name scheme + name = name.replace(".pos_norm.", ".patch_ln3.") + elif "patch_dense.weight" in name: + # ggml im2col outputs in RR..GG..BB.. (CHW) order, but weight expects RGBRGB.. (HWC). + # Permute columns so column i aligns with CHW input position i. + assert self.hparams_vision is not None + p = self.hparams_vision["model_patch_size"] + i = torch.arange(p * p * 3) + ch = i // (p * p) + row = (i % (p * p)) // p + col = i % p + # perm[i] = HWC column index for CHW position i + perm = row * p * 3 + col * 3 + ch + data_torch = data_torch[:, perm] + elif "patch_ln1.weight" in name or "patch_ln1.bias" in name: + # same permutation for patch_ln1 as patch_dense to align with CHW input order + assert self.hparams_vision is not None + p = self.hparams_vision["model_patch_size"] + i = torch.arange(p * p * 3) + ch = i // (p * p) + row = (i % (p * p)) // p + col = i % p + # perm[i] = HWC index for CHW position i + perm = row * p * 3 + col * 3 + ch + data_torch = data_torch[perm] + return super().modify_tensors(data_torch, name, bid) diff --git a/gguf-py/gguf/constants.py b/gguf-py/gguf/constants.py index 207cc2a1933f..ce556ec9b655 100644 --- a/gguf-py/gguf/constants.py +++ b/gguf-py/gguf/constants.py @@ -264,6 +264,7 @@ class Tokenizer: ADD_PREFIX = 
"tokenizer.ggml.add_space_prefix" REMOVE_EXTRA_WS = "tokenizer.ggml.remove_extra_whitespaces" PRECOMPILED_CHARSMAP = "tokenizer.ggml.precompiled_charsmap" + SUPPRESS_TOKENS = "tokenizer.ggml.suppress_tokens" HF_JSON = "tokenizer.huggingface.json" RWKV = "tokenizer.rwkv.world" CHAT_TEMPLATE = "tokenizer.chat_template" @@ -731,6 +732,7 @@ class MODEL_TENSOR(IntEnum): V_ENC_EMBD_CLS = auto() V_ENC_EMBD_PATCH = auto() V_ENC_EMBD_NORM = auto() + V_ENC_EMBD_PATCH_NORM = auto() # allow multiple norms in the same embd, e.g. for gemma4u V_ENC_EMBD_POS = auto() V_ENC_INPUT_NORM = auto() V_ENC_ATTN_QKV = auto() @@ -1250,6 +1252,7 @@ class MODEL_TENSOR(IntEnum): MODEL_TENSOR.V_ENC_EMBD_CLS: "v.class_embd", MODEL_TENSOR.V_ENC_EMBD_PATCH: "v.patch_embd", MODEL_TENSOR.V_ENC_EMBD_NORM: "v.norm_embd", + MODEL_TENSOR.V_ENC_EMBD_PATCH_NORM: "v.patch_norm.{bid}", MODEL_TENSOR.V_ENC_EMBD_POS: "v.position_embd", MODEL_TENSOR.V_ENC_ATTN_QKV: "v.blk.{bid}.attn_qkv", MODEL_TENSOR.V_ENC_ATTN_Q: "v.blk.{bid}.attn_q", @@ -1431,6 +1434,7 @@ class MODEL_TENSOR(IntEnum): MODEL_TENSOR.V_ENC_EMBD_CLS, MODEL_TENSOR.V_ENC_EMBD_PATCH, MODEL_TENSOR.V_ENC_EMBD_NORM, + MODEL_TENSOR.V_ENC_EMBD_PATCH_NORM, MODEL_TENSOR.V_ENC_EMBD_POS, MODEL_TENSOR.V_ENC_EMBD_IMGNL, MODEL_TENSOR.V_ENC_EMBD_VSEP, @@ -4346,6 +4350,8 @@ class VisionProjectorType: GEMMA3NA = "gemma3na" GEMMA4V = "gemma4v" GEMMA4A = "gemma4a" + GEMMA4UV = "gemma4uv" # "unified" variant + GEMMA4UA = "gemma4ua" # "unified" variant PHI4 = "phi4" IDEFICS3 = "idefics3" PIXTRAL = "pixtral" diff --git a/gguf-py/gguf/gguf_writer.py b/gguf-py/gguf/gguf_writer.py index 63cf6debcc91..875d0f73d964 100644 --- a/gguf-py/gguf/gguf_writer.py +++ b/gguf-py/gguf/gguf_writer.py @@ -1113,6 +1113,9 @@ def add_chat_template(self, value: str | Sequence[Mapping[str, str]]) -> None: self.add_string(Keys.Tokenizer.CHAT_TEMPLATE, value) + def add_suppress_tokens(self, tokens: Sequence[int]) -> None: + self.add_array(Keys.Tokenizer.SUPPRESS_TOKENS, tokens) + def 
add_normalizer_lowercase(self, value: bool) -> None: self.add_bool(Keys.Tokenizer.NORMALIZER_LOWERCASE, value) diff --git a/gguf-py/gguf/tensor_mapping.py b/gguf-py/gguf/tensor_mapping.py index 444f0f2855a6..82f26e7b303d 100644 --- a/gguf-py/gguf/tensor_mapping.py +++ b/gguf-py/gguf/tensor_mapping.py @@ -1426,6 +1426,7 @@ class TensorNameMap: "model.vision_tower.patch_embedder.input_proj", # gemma4 "vision_tower.patch_embed.patchifier.proj", # dots.ocr "vision_model.conv1", # Step3-VL + "model.vision_embedder.patch_dense", # gemma4 unified ), MODEL_TENSOR.V_ENC_EMBD_NORM: ( @@ -1433,6 +1434,10 @@ class TensorNameMap: "vision_tower.patch_embed.patchifier.norm", # dots.ocr ), + MODEL_TENSOR.V_ENC_EMBD_PATCH_NORM: ( + "model.vision_embedder.patch_ln{bid}", # gemma4 unified + ), + MODEL_TENSOR.V_ENC_EMBD_POS: ( "vision_tower.vision_model.embeddings.position_embedding", "model.vision_tower.embeddings.position_embedding", # minicpmv4_6 @@ -1448,6 +1453,7 @@ class TensorNameMap: "vision_model.radio_model.model.patch_generator.pos_embed", # Nemotron Nano v2 VL "model.vision_tower.patch_embedder.position_embedding_table", # gemma4 "vision_model.positional_embedding", # Step3-VL + "model.vision_embedder.pos_embedding", # gemma4 unified ), MODEL_TENSOR.V_ENC_EMBD_IMGNL: ( diff --git a/src/llama-arch.cpp b/src/llama-arch.cpp index a1d4e6e51970..fea898deaf2c 100644 --- a/src/llama-arch.cpp +++ b/src/llama-arch.cpp @@ -329,6 +329,7 @@ static const std::map LLM_KV_NAMES = { { LLM_KV_TOKENIZER_FIM_PAD_ID, "tokenizer.ggml.fim_pad_token_id" }, { LLM_KV_TOKENIZER_FIM_REP_ID, "tokenizer.ggml.fim_rep_token_id" }, { LLM_KV_TOKENIZER_FIM_SEP_ID, "tokenizer.ggml.fim_sep_token_id" }, + { LLM_KV_TOKENIZER_SUPPRESS_TOKENS, "tokenizer.ggml.suppress_tokens" }, { LLM_KV_ADAPTER_TYPE, "adapter.type" }, { LLM_KV_ADAPTER_LORA_ALPHA, "adapter.lora.alpha" }, diff --git a/src/llama-arch.h b/src/llama-arch.h index 3b80b2ae19fb..f364f6b0bae1 100644 --- a/src/llama-arch.h +++ b/src/llama-arch.h @@ 
-318,6 +318,7 @@ enum llm_kv { LLM_KV_TOKENIZER_FIM_PAD_ID, LLM_KV_TOKENIZER_FIM_REP_ID, LLM_KV_TOKENIZER_FIM_SEP_ID, + LLM_KV_TOKENIZER_SUPPRESS_TOKENS, LLM_KV_ADAPTER_TYPE, LLM_KV_ADAPTER_LORA_ALPHA, diff --git a/src/llama-vocab.cpp b/src/llama-vocab.cpp index 520502398162..9a4bed49487b 100644 --- a/src/llama-vocab.cpp +++ b/src/llama-vocab.cpp @@ -1815,6 +1815,8 @@ struct llama_vocab::impl { // set of all tokens that cause "end of generation" std::set special_eog_ids; + std::vector suppress_tokens; + std::unique_ptr tokenizer; std::vector precompiled_charsmap; @@ -2533,6 +2535,16 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) { // Lowercase normalizer flag (consulted by WPM / whitespace BPE) ml.get_key(LLM_KV_TOKENIZER_NORMALIZER_LOWERCASE, normalizer_lowercase, false); + // suppress tokens + { + const int suppress_idx = gguf_find_key(ctx, kv(LLM_KV_TOKENIZER_SUPPRESS_TOKENS).c_str()); + if (suppress_idx != -1) { + const int n = gguf_get_arr_n(ctx, suppress_idx); + const int32_t * data = (const int32_t *) gguf_get_arr_data(ctx, suppress_idx); + suppress_tokens.assign(data, data + n); + } + } + // auto-detect special tokens by text // TODO: convert scripts should provide these tokens through the KV metadata LLM_KV_TOKENIZER_... 
// for now, we apply this workaround to find the tokens based on their text @@ -3961,6 +3973,10 @@ bool llama_vocab::get_normalizer_lowercase() const { return pimpl->normalizer_lowercase; } +const std::vector & llama_vocab::get_suppress_tokens() const { + return pimpl->suppress_tokens; +} + int llama_vocab::max_token_len() const { return pimpl->max_token_len; } diff --git a/src/llama-vocab.h b/src/llama-vocab.h index b3991b53228c..2626ae36e33f 100644 --- a/src/llama-vocab.h +++ b/src/llama-vocab.h @@ -143,6 +143,8 @@ struct llama_vocab { bool get_treat_whitespace_as_suffix() const; bool get_normalizer_lowercase () const; + const std::vector & get_suppress_tokens() const; + int max_token_len() const; int find_bpe_rank(const std::string & token_left, const std::string & token_right) const; diff --git a/src/models/gemma4.cpp b/src/models/gemma4.cpp index de3f790c71cf..31906de33d97 100644 --- a/src/models/gemma4.cpp +++ b/src/models/gemma4.cpp @@ -142,6 +142,31 @@ static ggml_tensor * ggml_view_2d_slice(ggml_context * ctx0, ggml_tensor * x, in idx * x->ne[0] * x->ne[1] * ggml_element_size(x)); } +// TODO @ngxson : maybe improve this in the future +class llm_graph_input_logits_bias : public llm_graph_input_i { +public: + llm_graph_input_logits_bias(const llama_vocab & vocab) { + arr.resize(vocab.n_tokens(), 0.0f); + for (llama_token id : vocab.get_suppress_tokens()) { + if (0 <= id && id < (int32_t)vocab.n_tokens()) { + arr[id] = -INFINITY; + } + } + } + virtual ~llm_graph_input_logits_bias() = default; + + void set_input(const llama_ubatch *) override { + const int64_t n_vocab = arr.size(); + ggml_backend_tensor_set(logits_bias, arr.data(), 0, n_vocab*ggml_element_size(logits_bias)); + } + + // bool can_reuse(const llm_graph_params & params) override; + + ggml_tensor * logits_bias = nullptr; // F32 [n_vocab] + + std::vector arr; +}; + llama_model_gemma4::graph::graph(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params), model(model), 
@@ -388,6 +413,16 @@ llama_model_gemma4::graph::graph(const llama_model & model, const llm_graph_para cur = ggml_scale(ctx0, cur, hparams.f_final_logit_softcapping); } + // apply logits bias if needed (e.g. for gemma4_unified patch) + // this mirrors the suppress_tokens patch on transformers, to prevent the model from outputting the tokens listed in suppress_tokens (which is a known issue related to the checkpoint) + // TODO: maybe handle this inside the sampling system in the future + if (!model.vocab.get_suppress_tokens().empty()) { + auto inp_bias = std::make_unique(model.vocab); + inp_bias->logits_bias = ggml_new_tensor_1d(ctx0, GGML_TYPE_F32, inp_bias->arr.size()); + cur = ggml_add(ctx0, cur, inp_bias->logits_bias); + res->add_input(std::move(inp_bias)); + } + cb(cur, "result_output", -1); res->t_logits = cur; diff --git a/tools/mtmd/CMakeLists.txt b/tools/mtmd/CMakeLists.txt index 61510747ccf5..93f005652b7d 100644 --- a/tools/mtmd/CMakeLists.txt +++ b/tools/mtmd/CMakeLists.txt @@ -21,6 +21,8 @@ add_library(mtmd models/exaone4_5.cpp models/gemma4a.cpp models/gemma4v.cpp + models/gemma4ua.cpp + models/gemma4uv.cpp models/glm4v.cpp models/granite-speech.cpp models/hunyuanvl.cpp diff --git a/tools/mtmd/clip-impl.h b/tools/mtmd/clip-impl.h index b7fdcc42f1e2..c055cfb75419 100644 --- a/tools/mtmd/clip-impl.h +++ b/tools/mtmd/clip-impl.h @@ -83,6 +83,7 @@ #define TN_PATCH_EMBD_1 "v.patch_embd.weight.1" #define TN_PATCH_BIAS "v.patch_embd.bias" #define TN_NORM_EMBD "v.norm_embd.%s" +#define TN_PATCH_NORM "v.patch_norm.%d.%s" #define TN_ATTN_QKV "%s.blk.%d.attn_qkv.%s" #define TN_ATTN_K "%s.blk.%d.attn_k.%s" #define TN_ATTN_Q "%s.blk.%d.attn_q.%s" @@ -317,6 +318,8 @@ enum projector_type { PROJECTOR_TYPE_GEMMA3NA, PROJECTOR_TYPE_GEMMA4V, PROJECTOR_TYPE_GEMMA4A, + PROJECTOR_TYPE_GEMMA4UV, + PROJECTOR_TYPE_GEMMA4UA, PROJECTOR_TYPE_PHI4, PROJECTOR_TYPE_IDEFICS3, PROJECTOR_TYPE_PIXTRAL, @@ -369,6 +372,8 @@ static std::map PROJECTOR_TYPE_NAMES = { { PROJECTOR_TYPE_GEMMA3NA, "gemma3na"}, {
PROJECTOR_TYPE_GEMMA4V, "gemma4v"}, { PROJECTOR_TYPE_GEMMA4A, "gemma4a"}, + { PROJECTOR_TYPE_GEMMA4UV, "gemma4uv"}, + { PROJECTOR_TYPE_GEMMA4UA, "gemma4ua"}, { PROJECTOR_TYPE_PHI4, "phi4"}, { PROJECTOR_TYPE_IDEFICS3, "idefics3"}, { PROJECTOR_TYPE_PIXTRAL, "pixtral"}, diff --git a/tools/mtmd/clip-model.h b/tools/mtmd/clip-model.h index 1f3657a85077..238f805a9aae 100644 --- a/tools/mtmd/clip-model.h +++ b/tools/mtmd/clip-model.h @@ -339,6 +339,14 @@ struct clip_model { ggml_tensor * norm_embd_w = nullptr; ggml_tensor * norm_embd_b = nullptr; + // "indexed" patch embedding norms + ggml_tensor * patch_norm_1_w = nullptr; + ggml_tensor * patch_norm_1_b = nullptr; + ggml_tensor * patch_norm_2_w = nullptr; + ggml_tensor * patch_norm_2_b = nullptr; + ggml_tensor * patch_norm_3_w = nullptr; + ggml_tensor * patch_norm_3_b = nullptr; + ggml_tensor * pre_ln_w = nullptr; ggml_tensor * pre_ln_b = nullptr; diff --git a/tools/mtmd/clip.cpp b/tools/mtmd/clip.cpp index 3eeda41155d8..1abde5fb5f37 100644 --- a/tools/mtmd/clip.cpp +++ b/tools/mtmd/clip.cpp @@ -866,6 +866,10 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32 { builder = std::make_unique(ctx, img); } break; + case PROJECTOR_TYPE_GEMMA4UV: + { + builder = std::make_unique(ctx, img); + } break; case PROJECTOR_TYPE_PIXTRAL: case PROJECTOR_TYPE_LIGHTONOCR: { @@ -969,6 +973,10 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32 { builder = std::make_unique(ctx, img); } break; + case PROJECTOR_TYPE_GEMMA4UA: + { + builder = std::make_unique(ctx, img); + } break; case PROJECTOR_TYPE_GRANITE_SPEECH: { builder = std::make_unique(ctx, img); @@ -1386,13 +1394,19 @@ struct clip_model_loader { } break; case PROJECTOR_TYPE_GEMMA4V: + case PROJECTOR_TYPE_GEMMA4UV: { hparams.rope_theta = 100.0f; hparams.n_merge = 3; // pooling_kernel_size hparams.image_resize_algo = RESIZE_ALGO_BILINEAR; get_u32(KEY_PROJ_SCALE_FACTOR, hparams.n_merge, false); + if (model.proj_type == 
PROJECTOR_TYPE_GEMMA4UV) { + // for "unified" variant, we directly use a bigger patch size, because the "token merging" is done directly on conv layer + hparams.patch_size = hparams.patch_size * hparams.n_merge; + hparams.n_merge = 1; + } // @ngxson : the model performs quite poor with small images, we need to bump minimum image tokens to 40 to avoid that - hparams.set_limit_image_tokens(252, 280); + hparams.set_limit_image_tokens(40, 280); hparams.set_warmup_n_tokens(256); // avoid OOM on warmup } break; @@ -1586,6 +1600,14 @@ struct clip_model_loader { // since all gemma4a models use 1e-6, we just hardcode it here to avoid re-conversion hparams.eps = 1e-6f; } break; + case PROJECTOR_TYPE_GEMMA4UA: + { + // Encoder-free: raw 16 kHz waveform chunked into 640-sample frames. + hparams.audio_chunk_len = 0; + hparams.audio_sample_rate = 16000; + hparams.eps = 1e-6f; + hparams.n_mel_bins = 640; + } break; case PROJECTOR_TYPE_GRANITE_SPEECH: { hparams.audio_chunk_len = 0; @@ -2097,6 +2119,16 @@ struct clip_model_loader { } } } break; + case PROJECTOR_TYPE_GEMMA4UV: + { + model.mm_input_proj_w = get_tensor(TN_MM_INP_PROJ); + model.patch_norm_1_w = get_tensor(string_format(TN_PATCH_NORM, 1, "weight")); + model.patch_norm_1_b = get_tensor(string_format(TN_PATCH_NORM, 1, "bias")); + model.patch_norm_2_w = get_tensor(string_format(TN_PATCH_NORM, 2, "weight")); + model.patch_norm_2_b = get_tensor(string_format(TN_PATCH_NORM, 2, "bias")); + model.patch_norm_3_w = get_tensor(string_format(TN_PATCH_NORM, 3, "weight")); // pos_norm + model.patch_norm_3_b = get_tensor(string_format(TN_PATCH_NORM, 3, "bias")); // pos_norm + } break; case PROJECTOR_TYPE_GEMMA3NV: { model.mobilenet_stem_conv_w = get_tensor(TN_MNV5_STEM_CONV, false); @@ -2510,6 +2542,10 @@ struct clip_model_loader { } } } break; + case PROJECTOR_TYPE_GEMMA4UA: + { + model.mm_input_proj_w = get_tensor(string_format(TN_A_MM_INP_PROJ, "weight")); + } break; case PROJECTOR_TYPE_LFM2A: { for (int i : {0, 2, 3, 5, 6}) { @@ 
-3218,6 +3254,7 @@ int clip_n_output_tokens(const struct clip_ctx * ctx, struct clip_image_f32 * im } break; case PROJECTOR_TYPE_GEMMA3: case PROJECTOR_TYPE_GEMMA4V: + case PROJECTOR_TYPE_GEMMA4UV: case PROJECTOR_TYPE_IDEFICS3: case PROJECTOR_TYPE_INTERNVL: case PROJECTOR_TYPE_NEMOTRON_V2_VL: @@ -3350,6 +3387,10 @@ int clip_n_output_tokens(const struct clip_ctx * ctx, struct clip_image_f32 * im } n_patches = n; } break; + case PROJECTOR_TYPE_GEMMA4UA: + { + n_patches = img->nx; // no downsampling: one token per raw waveform frame + } break; case PROJECTOR_TYPE_GRANITE_SPEECH: { const int ws = ctx->model.hparams.audio_proj_window_size; @@ -3917,6 +3958,7 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima set_input_i32("patches", patches); } break; case PROJECTOR_TYPE_GEMMA4V: + case PROJECTOR_TYPE_GEMMA4UV: { // set (col, row) patch positions for learned positional embedding const int n_cols = image_size_width / patch_size; @@ -3998,6 +4040,7 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima case PROJECTOR_TYPE_PHI4: case PROJECTOR_TYPE_COGVLM: case PROJECTOR_TYPE_YASA2: + case PROJECTOR_TYPE_GEMMA4UA: { // do nothing } break; @@ -4303,6 +4346,7 @@ int clip_n_mmproj_embd(const struct clip_ctx * ctx) { case PROJECTOR_TYPE_GEMMA3NV: return ctx->model.mm_input_proj_w->ne[0]; case PROJECTOR_TYPE_GEMMA4V: + case PROJECTOR_TYPE_GEMMA4UV: return ctx->model.mm_input_proj_w->ne[1]; case PROJECTOR_TYPE_IDEFICS3: return ctx->model.mm_fc_w->ne[1]; @@ -4337,7 +4381,8 @@ int clip_n_mmproj_embd(const struct clip_ctx * ctx) { return ctx->model.mm_fc_w->ne[1]; case PROJECTOR_TYPE_LFM2A: return ctx->model.position_embeddings->ne[0]; case PROJECTOR_TYPE_GEMMA4A: + case PROJECTOR_TYPE_GEMMA4UA: return ctx->model.hparams.projection_dim; case PROJECTOR_TYPE_GRANITE_SPEECH: return ctx->model.qf_proj_linear_w->ne[1]; diff --git a/tools/mtmd/models/gemma4ua.cpp b/tools/mtmd/models/gemma4ua.cpp new file mode 100644 index
000000000000..e24bef2ed7b4 --- /dev/null +++ b/tools/mtmd/models/gemma4ua.cpp @@ -0,0 +1,19 @@ +#include "models.h" +#include + +ggml_cgraph * clip_graph_gemma4ua::build() { + ggml_tensor * inp = build_inp_raw(1); + + auto cur = ggml_cont(ctx0, ggml_permute(ctx0, inp, 1, 0, 2, 3)); + + // Gemma4UnifiedMultimodalEmbedder + { + // embedding_pre_projection_norm + cur = ggml_rms_norm(ctx0, cur, hparams.eps); + cur = build_mm(model.mm_input_proj_w, cur); + cb(cur, "projected", -1); + } + + ggml_build_forward_expand(gf, cur); + return gf; +} diff --git a/tools/mtmd/models/gemma4uv.cpp b/tools/mtmd/models/gemma4uv.cpp new file mode 100644 index 000000000000..96031141b175 --- /dev/null +++ b/tools/mtmd/models/gemma4uv.cpp @@ -0,0 +1,71 @@ +#include "models.h" +#include + +ggml_cgraph * clip_graph_gemma4uv::build() { + ggml_tensor * inp_raw = build_inp_raw(); + + // Gemma4UnifiedVisionEmbedder uses default pytorch LayerNorm, not RMSNorm + float eps = 1e-5f; // default eps for pytorch LayerNorm + + ggml_tensor * inp = nullptr; + { + // note: we cannot use ggml_conv_2d here because we need to apply norm after im2col + auto c = inp_raw->ne[2]; + ggml_tensor * kernel = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, patch_size, patch_size, c); + inp = ggml_im2col(ctx0, kernel, inp_raw, patch_size, patch_size, 0, 0, 1, 1, true, inp_raw->type); + // inp shape: [patch_size * patch_size * c, n_patches_w, n_patches_h] + + inp = ggml_reshape_2d(ctx0, inp, inp->ne[0], inp->ne[1] * inp->ne[2] * inp->ne[3]); + inp = build_norm(inp, model.patch_norm_1_w, model.patch_norm_1_b, NORM_TYPE_NORMAL, eps, -1); + // inp shape: [patch_size * patch_size * c, n_patches] + + inp = ggml_mul_mat(ctx0, model.patch_embeddings_0, inp); + inp = ggml_add(ctx0, inp, model.patch_bias); + // inp shape: [n_embd, n_patches] + + inp = build_norm(inp, model.patch_norm_2_w, model.patch_norm_2_b, NORM_TYPE_NORMAL, eps, -1); + } + + ggml_tensor * pos_x = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_patches); + 
ggml_set_name(pos_x, "pos_x"); + ggml_set_input(pos_x); + + ggml_tensor * pos_y = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_patches); + ggml_set_name(pos_y, "pos_y"); + ggml_set_input(pos_y); + + { + const int64_t pos_size = model.position_embeddings->ne[1]; + const size_t nb1 = ggml_row_size(model.position_embeddings->type, n_embd); + + // positional embeddings are stored as lookup tables (one for x, one for y) + ggml_tensor * tbl_x = ggml_view_2d(ctx0, model.position_embeddings, + n_embd, pos_size, nb1, 0); + ggml_tensor * tbl_y = ggml_view_2d(ctx0, model.position_embeddings, + n_embd, pos_size, nb1, pos_size * nb1); + + // ggml_get_rows: [n_embd, n_patches] + ggml_tensor * emb_x = ggml_get_rows(ctx0, tbl_x, pos_x); + ggml_tensor * emb_y = ggml_get_rows(ctx0, tbl_y, pos_y); + + inp = ggml_add(ctx0, inp, emb_x); + inp = ggml_add(ctx0, inp, emb_y); + cb(inp, "pos_embd", -1); + + // pos_norm + inp = build_norm(inp, model.patch_norm_3_w, model.patch_norm_3_b, NORM_TYPE_NORMAL, eps, -1); + } + + auto cur = inp; + + // Gemma4UnifiedMultimodalEmbedder + { + // embedding_pre_projection_norm + cur = ggml_rms_norm(ctx0, cur, hparams.eps); + cur = build_mm(model.mm_input_proj_w, cur); + cb(cur, "projected", -1); + } + + ggml_build_forward_expand(gf, cur); + return gf; +} diff --git a/tools/mtmd/models/models.h b/tools/mtmd/models/models.h index da37bc650507..b882f800dd77 100644 --- a/tools/mtmd/models/models.h +++ b/tools/mtmd/models/models.h @@ -18,6 +18,11 @@ struct clip_graph_gemma4v : clip_graph { ggml_tensor * build_mm(ggml_tensor * w, ggml_tensor * x) const override; }; +struct clip_graph_gemma4uv : clip_graph { + clip_graph_gemma4uv(clip_ctx * ctx, const clip_image_f32 & img) : clip_graph(ctx, img) {} + ggml_cgraph * build() override; +}; + struct clip_graph_pixtral : clip_graph { clip_graph_pixtral(clip_ctx * ctx, const clip_image_f32 & img) : clip_graph(ctx, img) {} ggml_cgraph * build() override; @@ -142,6 +147,11 @@ struct clip_graph_gemma4a : clip_graph { 
ggml_tensor * build_mm(ggml_tensor * w, ggml_tensor * x) const override; }; +struct clip_graph_gemma4ua : clip_graph { + clip_graph_gemma4ua(clip_ctx * ctx, const clip_image_f32 & img) : clip_graph(ctx, img) {} + ggml_cgraph * build() override; +}; + struct clip_graph_glm4v : clip_graph { clip_graph_glm4v(clip_ctx * ctx, const clip_image_f32 & img) : clip_graph(ctx, img) {} ggml_cgraph * build() override; diff --git a/tools/mtmd/mtmd-audio.cpp b/tools/mtmd/mtmd-audio.cpp index 85352904739f..13f211fd9021 100644 --- a/tools/mtmd/mtmd-audio.cpp +++ b/tools/mtmd/mtmd-audio.cpp @@ -942,6 +942,44 @@ bool mtmd_audio_preprocessor_gemma4a::preprocess(const float * s return true; } +// +// mtmd_audio_preprocessor_gemma4ua +// + +void mtmd_audio_preprocessor_gemma4ua::initialize() { + // no-op: no FFT or filterbank needed +} + +bool mtmd_audio_preprocessor_gemma4ua::preprocess(const float * samples, + size_t n_samples, + std::vector & output) { + if (n_samples == 0) { + return false; + } + + const int frame_size = hparams.n_mel_bins; // 640 samples per token @ 16 kHz = 40 ms + const int n_tokens = ((int)n_samples + frame_size - 1) / frame_size; + + mtmd_audio_mel mel; + mel.n_len = n_tokens; + mel.n_len_org = n_tokens; + mel.n_mel = frame_size; + mel.data.assign((size_t)frame_size * n_tokens, 0.0f); + + // Store mel-major (data[f * n_tokens + t]) so the ggml tensor loads as + // [n_tokens, frame_size] with ne[0]=n_tokens, ne[1]=frame_size. + // The graph builder transposes before RMSNorm so normalization is over frame_size. + for (int t = 0; t < n_tokens; t++) { + for (int f = 0; f < frame_size; f++) { + size_t src = (size_t)t * frame_size + f; + mel.data[(size_t)f * n_tokens + t] = (src < n_samples) ? 
samples[src] : 0.0f; + } + } + + output.push_back(std::move(mel)); + return true; +} + // // mtmd_audio_streaming_istft implementation // diff --git a/tools/mtmd/mtmd-audio.h b/tools/mtmd/mtmd-audio.h index 98ccb6424150..9656e3940f53 100644 --- a/tools/mtmd/mtmd-audio.h +++ b/tools/mtmd/mtmd-audio.h @@ -96,6 +96,12 @@ struct mtmd_audio_preprocessor_gemma4a : mtmd_audio_preprocessor { mtmd_audio_cache cache; }; +struct mtmd_audio_preprocessor_gemma4ua : mtmd_audio_preprocessor { + mtmd_audio_preprocessor_gemma4ua(const clip_ctx * ctx) : mtmd_audio_preprocessor(ctx) {} + void initialize() override; + bool preprocess(const float * samples, size_t n_samples, std::vector & output) override; +}; + struct mtmd_audio_preprocessor_qwen3a : mtmd_audio_preprocessor { mtmd_audio_preprocessor_qwen3a(const clip_ctx * ctx) : mtmd_audio_preprocessor(ctx) {} void initialize() override; diff --git a/tools/mtmd/mtmd.cpp b/tools/mtmd/mtmd.cpp index 94c3be82f0fe..3d4fa27d2798 100644 --- a/tools/mtmd/mtmd.cpp +++ b/tools/mtmd/mtmd.cpp @@ -482,6 +482,7 @@ struct mtmd_context { image_preproc = std::make_unique(ctx_v); } break; case PROJECTOR_TYPE_GEMMA4V: + case PROJECTOR_TYPE_GEMMA4UV: { // <|image> ... (image embeddings) ... 
img_beg = "<|image>"; @@ -576,6 +577,12 @@ struct mtmd_context { aud_end = ""; audio_preproc = std::make_unique(ctx_a); } break; + case PROJECTOR_TYPE_GEMMA4UA: + { + aud_beg = "<|audio>"; + aud_end = ""; + audio_preproc = std::make_unique(ctx_a); + } break; default: throw std::runtime_error(string_format("%s: unexpected audio projector type %d\n", __func__, proj)); } From 5aa6625f7269b86d0c37ce0f6aef0e291b913d9a Mon Sep 17 00:00:00 2001 From: Xuan Son Nguyen Date: Wed, 3 Jun 2026 16:38:28 +0200 Subject: [PATCH 2/2] nits --- conversion/gemma.py | 1 + 1 file changed, 1 insertion(+) diff --git a/conversion/gemma.py b/conversion/gemma.py index 538498b939ca..2025e782b7f1 100644 --- a/conversion/gemma.py +++ b/conversion/gemma.py @@ -860,6 +860,7 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter mapped_name = self.map_tensor_name(name, (".weight", ".bias", ".input_max", ".input_min", ".output_max", ".output_min")) yield (mapped_name, data_torch) + @ModelBase.register("Gemma4UnifiedForConditionalGeneration") class Gemma4UnifiedVisionAudioModel(Gemma4VisionAudioModel): has_audio_encoder = True