From 0024aa7417efd50a7c2a79e049dfbbbceb0beaaf Mon Sep 17 00:00:00 2001 From: Daniel Bevenius Date: Wed, 29 Apr 2026 15:07:50 +0200 Subject: [PATCH 01/11] mtmd : add Nemotron 3 Nano Omni support (parakeet) This commit adds support for the subsampling and encoder parts of the Nemotron 3 Nano Omni model. The Parakeet subsampling/encoder were taken from parakeet.cpp, which is currently a pull request against whisper.cpp. I've tried to copy the code as closely as possible to hopefully enable easy patching between these two projects later. Refs: https://github.com/ggml-org/whisper.cpp/pull/3735 --- convert_hf_to_gguf.py | 77 +++++++- gguf-py/gguf/constants.py | 14 ++ gguf-py/gguf/gguf_writer.py | 3 + gguf-py/gguf/tensor_mapping.py | 40 ++++ tools/mtmd/CMakeLists.txt | 1 + tools/mtmd/clip-impl.h | 10 + tools/mtmd/clip-model.h | 26 ++- tools/mtmd/clip.cpp | 109 +++++++++++ tools/mtmd/clip.h | 4 + tools/mtmd/models/models.h | 8 + tools/mtmd/models/parakeet.cpp | 329 +++++++++++++++++++++++++++++++++ tools/mtmd/mtmd-audio.cpp | 226 ++++++++++++++++++++++ tools/mtmd/mtmd-audio.h | 10 + tools/mtmd/mtmd.cpp | 4 + 14 files changed, 850 insertions(+), 11 deletions(-) create mode 100644 tools/mtmd/models/parakeet.cpp diff --git a/convert_hf_to_gguf.py b/convert_hf_to_gguf.py index 90c2b7094c71..667cc7ab345b 100755 --- a/convert_hf_to_gguf.py +++ b/convert_hf_to_gguf.py @@ -4463,15 +4463,28 @@ def dequant_model(self): return super().dequant_model() + def get_audio_config(self) -> dict[str, Any] | None: + return self.global_config.get("sound_config") + def set_gguf_parameters(self): if "image_mean" not in self.preprocessor_config: self.preprocessor_config["image_mean"] = [0.485, 0.456, 0.406] if "image_std" not in self.preprocessor_config: self.preprocessor_config["image_std"] = [0.229, 0.224, 0.225] + if self.hparams_audio is not None: + self.has_vision_encoder = True + self.has_audio_encoder = True + self.gguf_writer.add_audio_num_mel_bins(self.hparams_audio["num_mel_bins"]) + 
self.gguf_writer.add_audio_attention_layernorm_eps(1e-5) + self.gguf_writer.add_audio_subsampling_factor(self.hparams_audio["subsampling_factor"]) + self.gguf_writer.add_clip_audio_projector_type(gguf.VisionProjectorType.PARAKEET) + self.gguf_writer.add_clip_vision_projector_type(gguf.VisionProjectorType.NEMOTRON_V2_VL) + else: + self.gguf_writer.add_clip_projector_type(gguf.VisionProjectorType.NEMOTRON_V2_VL) + super().set_gguf_parameters() hparams = self.global_config - self.gguf_writer.add_clip_projector_type(gguf.VisionProjectorType.NEMOTRON_V2_VL) self.gguf_writer.add_vision_attention_layernorm_eps(1e-6) self.gguf_writer.add_vision_use_gelu(True) downsample_ratio = hparams.get("downsample_ratio", 0.5) @@ -4480,12 +4493,45 @@ def set_gguf_parameters(self): def tensor_force_quant(self, name, new_name, bid, n_dims): if ".position_embd." in new_name or "pos_embed" in new_name: return gguf.GGMLQuantizationType.F32 + + if "sound_encoder" in name or new_name.startswith("mm.a."): + if "bias" in new_name or "norm" in new_name: + return gguf.GGMLQuantizationType.F32 + if "conv" in new_name and "weight" in new_name: + return gguf.GGMLQuantizationType.F32 + return super().tensor_force_quant(name, new_name, bid, n_dims) + def generate_extra_tensors(self) -> Iterable[tuple[str, Tensor]]: + sound_config = self.global_config.get("sound_config") + if sound_config: + # Generate relative position embeddings. 
+ d_model = sound_config.get("hidden_size") + max_len = 5000 * 2 - 1 + + pe = torch.zeros(max_len, d_model, dtype=torch.float32) + log_10000 = math.log(10000.0) + + for idx in range(max_len): + position = float((max_len // 2) - idx) + + for i in range(0, d_model, 2): + div_term = math.exp(-(float(i) * log_10000 / float(d_model))) + angle = position * div_term + + pe[idx, i] = math.sin(angle) + if i + 1 < d_model: + pe[idx, i + 1] = math.cos(angle) + + yield ("a.position_embd.weight", pe) + def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: if "input_conditioner" in name: return + if "language_model" in name: + return + # mtmd does not support video yet so skip tensors related to video. if "radio_model.model.patch_generator.video_embedder" in name: return @@ -4517,8 +4563,33 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter n_embd = self.hparams["hidden_size"] data_torch = data_torch.reshape(n_embd, 3, patch_size, patch_size) - if name.startswith("vision_model.radio_model.model.") or name.startswith("mlp1."): - yield from super().modify_tensors(data_torch, name, bid) + # num_batches is only use for training not inference. 
+ if "conv.norm" in name and "num_batches" in name: + return + + if "depthwise_conv.weight" in name: + data_torch = data_torch.unsqueeze(-1) + data_torch = data_torch.permute(3, 1, 0, 2).contiguous() + + if "pointwise_conv" in name and name.endswith(".weight"): + if len(data_torch.shape) == 3 and data_torch.shape[2] == 1: + data_torch = data_torch.reshape(data_torch.shape[0], data_torch.shape[1]) + + if "subsampling.layers" in name and name.endswith(".bias"): + if len(data_torch.shape) == 1: + data_torch = data_torch.reshape(1, -1, 1, 1) + + if "pointwise_conv" in name and name.endswith(".bias"): + if len(data_torch.shape) == 1: + data_torch = data_torch.reshape(1, -1, 1, 1) + + if name.startswith(("vision_model.radio_model.model.", "mlp1.", "sound_encoder.", "sound_projection.")): + for mapped_name, tensor in super().modify_tensors(data_torch, name, bid): + if name.startswith("sound_projection.") and mapped_name.startswith("mm.model.mlp."): + mapped_name = mapped_name.replace("mm.model.mlp.", "mm.a.mlp.") + yield (mapped_name, tensor) + else: + yield (name, data_torch) @ModelBase.register("WavTokenizerDec") diff --git a/gguf-py/gguf/constants.py b/gguf-py/gguf/constants.py index 83ae51ce9ce3..e7ecb9816e1d 100644 --- a/gguf-py/gguf/constants.py +++ b/gguf-py/gguf/constants.py @@ -339,6 +339,7 @@ class ClipAudio: FEED_FORWARD_LENGTH = "clip.audio.feed_forward_length" PROJECTION_DIM = "clip.audio.projection_dim" BLOCK_COUNT = "clip.audio.block_count" + SUBSAMPLING_FACTOR = "clip.audio.subsampling_factor" class Attention: HEAD_COUNT = "clip.audio.attention.head_count" @@ -854,6 +855,10 @@ class MODEL_TENSOR(IntEnum): A_ENC_CONV_NORM = auto() # SSM conv A_ENC_CONV_PW1 = auto() A_ENC_CONV_PW2 = auto() + A_ENC_CONV_NORM_MEAN = auto() # parakeet + A_ENC_CONV_NORM_VAR = auto() # parakeet + A_ENC_MEL_FILTERS = auto() # parakeet + A_ENC_WINDOW = auto() # parakeet MODEL_ARCH_NAMES: dict[MODEL_ARCH, str] = { @@ -1333,6 +1338,10 @@ class MODEL_TENSOR(IntEnum): 
MODEL_TENSOR.A_ENC_CONV_NORM: "a.blk.{bid}.conv_norm", MODEL_TENSOR.A_ENC_CONV_PW1: "a.blk.{bid}.conv_pw1", MODEL_TENSOR.A_ENC_CONV_PW2: "a.blk.{bid}.conv_pw2", + MODEL_TENSOR.A_ENC_CONV_NORM_MEAN: "a.blk.{bid}.conv_norm_mean", + MODEL_TENSOR.A_ENC_CONV_NORM_VAR: "a.blk.{bid}.conv_norm_var", + MODEL_TENSOR.A_ENC_MEL_FILTERS: "a.mel_filters", + MODEL_TENSOR.A_ENC_WINDOW: "a.window", # NextN/MTP MODEL_TENSOR.NEXTN_EH_PROJ: "blk.{bid}.nextn.eh_proj", MODEL_TENSOR.NEXTN_EMBED_TOKENS: "blk.{bid}.nextn.embed_tokens", @@ -1474,6 +1483,10 @@ class MODEL_TENSOR(IntEnum): MODEL_TENSOR.A_ENC_CONV_NORM, MODEL_TENSOR.A_ENC_CONV_PW1, MODEL_TENSOR.A_ENC_CONV_PW2, + MODEL_TENSOR.A_ENC_CONV_NORM_MEAN, + MODEL_TENSOR.A_ENC_CONV_NORM_VAR, + MODEL_TENSOR.A_ENC_MEL_FILTERS, + MODEL_TENSOR.A_ENC_WINDOW, MODEL_TENSOR.A_MM_INP_PROJ, MODEL_TENSOR.A_MM_SOFT_EMB_NORM, MODEL_TENSOR.A_MM_EMBEDDING, @@ -4158,6 +4171,7 @@ class VisionProjectorType: NEMOTRON_V2_VL = "nemotron_v2_vl" HUNYUANOCR = "hunyuanocr" HUNYUANVL = "hunyuanvl" + PARAKEET = "parakeet" # Items here are (block size, type size) diff --git a/gguf-py/gguf/gguf_writer.py b/gguf-py/gguf/gguf_writer.py index 6a81ca37d8c4..6a7284ad8e5f 100644 --- a/gguf-py/gguf/gguf_writer.py +++ b/gguf-py/gguf/gguf_writer.py @@ -1260,6 +1260,9 @@ def add_audio_num_mel_bins(self, value: int) -> None: def add_audio_stack_factor(self, value: int) -> None: self.add_uint32(Keys.ClipAudio.Projector.STACK_FACTOR, value) + def add_audio_subsampling_factor(self, value: int) -> None: + self.add_uint32(Keys.ClipAudio.SUBSAMPLING_FACTOR, value) + def add_xielu_alpha_p(self, values: Sequence[float]): self.add_array(Keys.xIELU.ALPHA_P, values) diff --git a/gguf-py/gguf/tensor_mapping.py b/gguf-py/gguf/tensor_mapping.py index 01a9b236000b..78634a671093 100644 --- a/gguf-py/gguf/tensor_mapping.py +++ b/gguf-py/gguf/tensor_mapping.py @@ -1882,6 +1882,7 @@ class TensorNameMap: "conformer.pre_encode.conv.{bid}", # lfm2 
"model.audio_tower.subsample_conv_projection.conv_{bid}.conv", # gemma3n "conformer.subsample_conv_projection.layer{bid}.conv", # gemma4 + "sound_encoder.encoder.subsampling.layers.{bid}", # parakeet ), MODEL_TENSOR.A_ENC_CONV1D_NORM: ( @@ -1912,6 +1913,7 @@ class TensorNameMap: "conformer.layers.{bid}.self_attn.linear_q", # lfm2 "conformer.layers.{bid}.attention.attn.q_proj", # gemma3n "conformer.layers.{bid}.self_attn.q_proj", # gemma4 + "sound_encoder.encoder.layers.{bid}.self_attn.q_proj", # parakeet ), MODEL_TENSOR.A_ENC_ATTN_K: ( @@ -1919,6 +1921,7 @@ class TensorNameMap: "conformer.layers.{bid}.self_attn.linear_k", # lfm2 "conformer.layers.{bid}.attention.attn.k_proj", # gemma3n "conformer.layers.{bid}.self_attn.k_proj", # gemma4 + "sound_encoder.encoder.layers.{bid}.self_attn.k_proj", # parakeet ), MODEL_TENSOR.A_ENC_ATTN_V: ( @@ -1926,6 +1929,7 @@ class TensorNameMap: "conformer.layers.{bid}.self_attn.linear_v", # lfm2 "conformer.layers.{bid}.attention.attn.v_proj", # gemma3n "conformer.layers.{bid}.self_attn.v_proj", # gemma4 + "sound_encoder.encoder.layers.{bid}.self_attn.v_proj", # parakeet ), MODEL_TENSOR.A_ENC_ATTN_K_REL: ( @@ -1953,6 +1957,7 @@ class TensorNameMap: "audio_tower.layers.{bid}.self_attn_layer_norm", # ultravox "conformer.layers.{bid}.norm_self_att", # lfm2 "conformer.layers.{bid}.attention.pre_attn_norm", # gemma3n + "sound_encoder.encoder.layers.{bid}.norm_self_att", # parakeet ), MODEL_TENSOR.A_ENC_OUTPUT: ( @@ -1960,18 +1965,21 @@ class TensorNameMap: "conformer.layers.{bid}.self_attn.linear_out", # lfm2 "conformer.layers.{bid}.attention.post", # gemma3n "conformer.layers.{bid}.self_attn.post", # gemma4 + "sound_encoder.encoder.layers.{bid}.self_attn.o_proj", # parakeet ), MODEL_TENSOR.A_ENC_OUTPUT_NORM: ( "audio_tower.layers.{bid}.final_layer_norm", # ultravox "conformer.layers.{bid}.norm_out", # lfm2 "conformer.layers.{bid}.attention.post_norm", # gemma3n + "sound_encoder.encoder.layers.{bid}.norm_out", # parakeet ), 
MODEL_TENSOR.A_ENC_FFN_NORM: ( "conformer.layers.{bid}.norm_feed_forward1", # lfm2 "conformer.layers.{bid}.ffw_layer_start.pre_layer_norm", # gemma3n "conformer.layers.{bid}.feed_forward1.pre_layer_norm", # gemma4 + "sound_encoder.encoder.layers.{bid}.norm_feed_forward1", # parakeet ), MODEL_TENSOR.A_ENC_FFN_POST_NORM: ( @@ -1988,6 +1996,7 @@ class TensorNameMap: "conformer.layers.{bid}.feed_forward1.linear1", # lfm2 "conformer.layers.{bid}.ffw_layer_start.ffw_layer_1", # gemma3n "conformer.layers.{bid}.feed_forward1.ffw_layer_1", # gemma4 + "sound_encoder.encoder.layers.{bid}.feed_forward1.linear1", # parakeet ), MODEL_TENSOR.A_ENC_FFN_GATE: (), @@ -1997,24 +2006,28 @@ class TensorNameMap: "conformer.layers.{bid}.feed_forward1.linear2", # lfm2 "conformer.layers.{bid}.ffw_layer_start.ffw_layer_2", # gemma3n "conformer.layers.{bid}.feed_forward1.ffw_layer_2", # gemma4 + "sound_encoder.encoder.layers.{bid}.feed_forward1.linear2", # parakeet ), MODEL_TENSOR.A_ENC_FFN_UP_1: ( "conformer.layers.{bid}.feed_forward2.linear1", # lfm2 "conformer.layers.{bid}.ffw_layer_end.ffw_layer_1", # gemma3n "conformer.layers.{bid}.feed_forward2.ffw_layer_1", # gemma4 + "sound_encoder.encoder.layers.{bid}.feed_forward2.linear1", # parakeet ), MODEL_TENSOR.A_ENC_FFN_DOWN_1: ( "conformer.layers.{bid}.feed_forward2.linear2", # lfm2 "conformer.layers.{bid}.ffw_layer_end.ffw_layer_2", # gemma3n "conformer.layers.{bid}.feed_forward2.ffw_layer_2", # gemma4 + "sound_encoder.encoder.layers.{bid}.feed_forward2.linear2", # parakeet ), MODEL_TENSOR.A_ENC_FFN_NORM_1: ( "conformer.layers.{bid}.norm_feed_forward2", # lfm2 "conformer.layers.{bid}.ffw_layer_end.pre_layer_norm", # gemma3n "conformer.layers.{bid}.feed_forward2.pre_layer_norm", # gemma4 + "sound_encoder.encoder.layers.{bid}.norm_feed_forward2", # parakeet ), MODEL_TENSOR.A_ENC_FFN_POST_NORM_1: ( @@ -2029,20 +2042,24 @@ class TensorNameMap: MODEL_TENSOR.A_ENC_LINEAR_POS: ( "conformer.layers.{bid}.self_attn.linear_pos", # lfm2 
"conformer.layers.{bid}.attention.attn.relative_position_embedding.pos_proj", # gemma3n + "sound_encoder.encoder.layers.{bid}.self_attn.relative_k_proj", # parakeet ), MODEL_TENSOR.A_ENC_POS_BIAS_U: ( "conformer.layers.{bid}.self_attn.pos_bias_u", # lfm2 + "sound_encoder.encoder.layers.{bid}.self_attn.bias_u", # parakeet ), MODEL_TENSOR.A_ENC_POS_BIAS_V: ( "conformer.layers.{bid}.self_attn.pos_bias_v", # lfm2 + "sound_encoder.encoder.layers.{bid}.self_attn.bias_v", # parakeet ), MODEL_TENSOR.A_ENC_OUT: ( "conformer.pre_encode.out", # lfm2 "model.audio_tower.subsample_conv_projection.input_proj_linear", # gemma3n (note: it should be A_ENC_INP_PROJ, this is a mistake; it should be corrected in C++ code when it's supported) "conformer.output_proj", # gemma4 + "sound_encoder.encoder.subsampling.linear", # parakeet ), # note: some tensors below has "audio." pseudo-prefix, to prevent conflicts with vision tensors @@ -2052,6 +2069,7 @@ class TensorNameMap: "audio.multi_modal_projector.linear_{bid}", # ultravox, meralion "audio_adapter.model.{bid}", # lfm2 "audio_tower.proj{bid}", # qwen3omni + "sound_projection.linear{bid}", # parakeet (linear1, linear2) ), MODEL_TENSOR.A_MMPROJ_FC: ( @@ -2062,6 +2080,7 @@ class TensorNameMap: MODEL_TENSOR.A_MM_NORM_PRE: ( "audio.multi_modal_projector.ln_pre", # ultravox + "sound_projection.norm", # parakeet ), MODEL_TENSOR.A_MM_NORM_MID: ( @@ -2071,26 +2090,39 @@ class TensorNameMap: MODEL_TENSOR.A_ENC_CONV_DW: ( "conformer.layers.{bid}.conv.depthwise_conv", # lfm2 "conformer.layers.{bid}.lconv1d.depthwise_conv1d", # gemma3n + "sound_encoder.encoder.layers.{bid}.conv.depthwise_conv", # parakeet ), MODEL_TENSOR.A_ENC_CONV_NORM: ( "conformer.layers.{bid}.conv.batch_norm", # lfm2 "conformer.layers.{bid}.lconv1d.pre_layer_norm", # gemma3n + "sound_encoder.encoder.layers.{bid}.conv.norm", # parakeet + ), + + MODEL_TENSOR.A_ENC_CONV_NORM_MEAN: ( + "sound_encoder.encoder.layers.{bid}.conv.norm.running_mean", # parakeet + ), + + 
MODEL_TENSOR.A_ENC_CONV_NORM_VAR: ( + "sound_encoder.encoder.layers.{bid}.conv.norm.running_var", # parakeet ), MODEL_TENSOR.A_ENC_CONV_PW1: ( "conformer.layers.{bid}.conv.pointwise_conv1", # lfm2 "conformer.layers.{bid}.lconv1d.linear_start", # gemma3n + "sound_encoder.encoder.layers.{bid}.conv.pointwise_conv1", # parakeet ), MODEL_TENSOR.A_ENC_CONV_PW2: ( "conformer.layers.{bid}.conv.pointwise_conv2", # lfm2 "conformer.layers.{bid}.lconv1d.linear_end", # gemma3n + "sound_encoder.encoder.layers.{bid}.conv.pointwise_conv2", # parakeet ), MODEL_TENSOR.A_ENC_NORM_CONV: ( "conformer.layers.{bid}.norm_conv", # lfm2 "conformer.layers.{bid}.lconv1d.conv_norm", # gemma3n + "sound_encoder.encoder.layers.{bid}.norm_conv", # parakeet ), MODEL_TENSOR.A_PER_DIM_K_SCALE: ( @@ -2101,6 +2133,14 @@ class TensorNameMap: "conformer.layers.{bid}.attention.attn.per_dim_scale", # gemma4 ), + MODEL_TENSOR.A_ENC_MEL_FILTERS: ( + "sound_encoder.encoder.feature_extractor.featurizer.fb", # parakeet + ), + + MODEL_TENSOR.A_ENC_WINDOW: ( + "sound_encoder.encoder.feature_extractor.featurizer.window", # parakeet + ), + MODEL_TENSOR.A_MM_EMBEDDING: ( "model.embed_audio.embedding", # gemma3n ), diff --git a/tools/mtmd/CMakeLists.txt b/tools/mtmd/CMakeLists.txt index 35d721d5a4c6..0c5f3a41211e 100644 --- a/tools/mtmd/CMakeLists.txt +++ b/tools/mtmd/CMakeLists.txt @@ -41,6 +41,7 @@ add_library(mtmd models/mobilenetv5.cpp models/youtuvl.cpp models/yasa2.cpp + models/parakeet.cpp ) set_target_properties(mtmd PROPERTIES diff --git a/tools/mtmd/clip-impl.h b/tools/mtmd/clip-impl.h index 7d6484eea850..16f763c5b29e 100644 --- a/tools/mtmd/clip-impl.h +++ b/tools/mtmd/clip-impl.h @@ -64,6 +64,8 @@ #define KEY_A_NUM_MEL_BINS "clip.audio.num_mel_bins" #define KEY_A_PROJ_STACK_FACTOR "clip.audio.projector.stack_factor" +#define KEY_AUDIO_SUBSAMPLING_FACTOR "clip.audio.subsampling_factor" + // // tensor name constants @@ -252,6 +254,12 @@ #define TN_YASA_STAGE_DOWN_CONV "v.stage.%d.down.conv.%s" #define 
TN_YASA_STAGE_BLK "v.stage.%d.blk.%d.%s.%s" +// parakeet +#define TN_MEL_FILTERS "a.mel_filters" +#define TN_WINDOW "a.window" +#define TN_CONV_NORM_MEAN "%s.blk.%d.conv_norm_mean" +#define TN_CONV_NORM_VAR "%s.blk.%d.conv_norm_var" + // align x to upper multiple of n #define CLIP_ALIGN(x, n) ((((x) + (n) - 1) / (n)) * (n)) @@ -304,6 +312,7 @@ enum projector_type { PROJECTOR_TYPE_NEMOTRON_V2_VL, PROJECTOR_TYPE_HUNYUANOCR, PROJECTOR_TYPE_HUNYUANVL, + PROJECTOR_TYPE_PARAKEET, PROJECTOR_TYPE_UNKNOWN, }; @@ -351,6 +360,7 @@ static std::map PROJECTOR_TYPE_NAMES = { { PROJECTOR_TYPE_NEMOTRON_V2_VL, "nemotron_v2_vl"}, { PROJECTOR_TYPE_HUNYUANOCR, "hunyuanocr"}, { PROJECTOR_TYPE_HUNYUANVL, "hunyuanvl"}, + { PROJECTOR_TYPE_PARAKEET, "parakeet"}, }; static projector_type clip_projector_type_from_string(const std::string & str) { diff --git a/tools/mtmd/clip-model.h b/tools/mtmd/clip-model.h index bf8031b55b28..62c40be8d45e 100644 --- a/tools/mtmd/clip-model.h +++ b/tools/mtmd/clip-model.h @@ -92,6 +92,7 @@ struct clip_hparams { // audio int32_t n_mel_bins = 0; // whisper preprocessor int32_t proj_stack_factor = 0; // ultravox + int32_t subsampling_factor = 0; // parakeet // audio-to-mel preprocessor params int32_t audio_chunk_len = -1; // in seconds @@ -208,14 +209,18 @@ struct clip_layer { ggml_tensor * norm_conv_b = nullptr; ggml_tensor * linear_pos_w = nullptr; - ggml_tensor * conv_norm_w = nullptr; - ggml_tensor * conv_norm_b = nullptr; - ggml_tensor * conv_dw_w = nullptr; - ggml_tensor * conv_dw_b = nullptr; - ggml_tensor * conv_pw1_w = nullptr; - ggml_tensor * conv_pw1_b = nullptr; - ggml_tensor * conv_pw2_w = nullptr; - ggml_tensor * conv_pw2_b = nullptr; + ggml_tensor * conv_norm_w = nullptr; + ggml_tensor * conv_norm_b = nullptr; + ggml_tensor * conv_norm_mean = nullptr; // parakeet + ggml_tensor * conv_norm_var = nullptr; // parakeet + ggml_tensor * conv_dw_w = nullptr; + ggml_tensor * conv_dw_b = nullptr; + ggml_tensor * conv_pw1_w = nullptr; + ggml_tensor * 
conv_pw1_b = nullptr; + ggml_tensor * conv_pw2_w = nullptr; + ggml_tensor * conv_pw2_b = nullptr; + + struct ggml_tensor * attn_pos_w; // gemma4 audio conformer per-layer ggml_tensor * attn_pre_norm_w = nullptr; @@ -485,6 +490,11 @@ struct clip_model { ggml_tensor * net_2; ggml_tensor * net_3; + // Parakeet + ggml_tensor * mm_norm_w = nullptr; + ggml_tensor * mel_filters = nullptr; + ggml_tensor * window = nullptr; + int32_t n_sam_layers = 12; // used by deepseek-ocr sam encoder std::vector sam_layers; diff --git a/tools/mtmd/clip.cpp b/tools/mtmd/clip.cpp index 45e39898d822..971b69b374e9 100644 --- a/tools/mtmd/clip.cpp +++ b/tools/mtmd/clip.cpp @@ -952,6 +952,10 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32 { builder = std::make_unique(ctx, img); } break; + case PROJECTOR_TYPE_PARAKEET: + { + builder = std::make_unique(ctx, img); + } break; default: GGML_ABORT("missing cgraph builder"); } @@ -1243,6 +1247,15 @@ struct clip_model_loader { { get_u32(KEY_PROJ_SCALE_FACTOR, hparams.n_merge, false); } break; + case PROJECTOR_TYPE_PARAKEET: + { + get_u32(KEY_AUDIO_SUBSAMPLING_FACTOR, hparams.subsampling_factor, false); + hparams.audio_chunk_len = 0; + hparams.audio_sample_rate = 16000; + hparams.audio_n_fft = 512; + hparams.audio_window_len = 400; + hparams.audio_hop_len = 160; + } break; case PROJECTOR_TYPE_IDEFICS3: { // use default llava-uhd preprocessing params @@ -2415,6 +2428,69 @@ struct clip_model_loader { layer.conv_pw2_b = get_tensor(string_format(TN_CONV_PW2, prefix, il, "bias")); } } break; + case PROJECTOR_TYPE_PARAKEET: + { + // Preprocessing tensors + model.mel_filters = get_tensor(TN_MEL_FILTERS); + model.window = get_tensor(TN_WINDOW); + + // Subsampling layers (conv1d) + for (int i : {0, 2, 3, 5, 6}) { + model.pre_encode_conv_X_w[i] = get_tensor(string_format(TN_CONV1D, i, "weight")); + model.pre_encode_conv_X_b[i] = get_tensor(string_format(TN_CONV1D, i, "bias")); + } + model.pre_encode_out_w = 
get_tensor(string_format(TN_PRE_ENCODE_OUT, "weight")); + model.pre_encode_out_b = get_tensor(string_format(TN_PRE_ENCODE_OUT, "bias")); + + // Projection layers + model.mm_norm_pre_w = get_tensor(string_format(TN_MM_NORM_PRE, "weight"), false); + model.mm_0_w = get_tensor(string_format(TN_MM_AUDIO_MLP, 1, "weight"), false); + model.mm_1_w = get_tensor(string_format(TN_MM_AUDIO_MLP, 2, "weight"), false); + + // Encoder layers + for (int il = 0; il < hparams.n_layer; ++il) { + auto & layer = model.layers[il]; + + // Attention (from shared above) + + // Relative position encoding + layer.linear_pos_w = get_tensor(string_format(TN_LINEAR_POS, prefix, il, "weight")); + layer.pos_bias_u = get_tensor(string_format(TN_POS_BIAS_U, prefix, il)); + layer.pos_bias_v = get_tensor(string_format(TN_POS_BIAS_V, prefix, il)); + + // Convolution module + layer.conv_pw1_w = get_tensor(string_format(TN_CONV_PW1, prefix, il, "weight")); + layer.conv_pw1_b = get_tensor(string_format(TN_CONV_PW1, prefix, il, "bias"), false); + layer.conv_dw_w = get_tensor(string_format(TN_CONV_DW, prefix, il, "weight")); + layer.conv_dw_b = get_tensor(string_format(TN_CONV_DW, prefix, il, "bias"), false); + layer.conv_norm_w = get_tensor(string_format(TN_CONV_NORM, prefix, il, "weight")); + layer.conv_norm_b = get_tensor(string_format(TN_CONV_NORM, prefix, il, "bias"), false); + layer.conv_norm_mean = get_tensor(string_format(TN_CONV_NORM_MEAN, prefix, il), false); + layer.conv_norm_var = get_tensor(string_format(TN_CONV_NORM_VAR, prefix, il), false); + layer.conv_pw2_w = get_tensor(string_format(TN_CONV_PW2, prefix, il, "weight")); + layer.conv_pw2_b = get_tensor(string_format(TN_CONV_PW2, prefix, il, "bias"), false); + + // Feed-forward networks + layer.ff_norm_w = get_tensor(string_format(TN_FFN_NORM, prefix, il, "weight")); + layer.ff_norm_b = get_tensor(string_format(TN_FFN_NORM, prefix, il, "bias"), false); + + layer.ff_norm_1_w = get_tensor(string_format(TN_FFN_NORM_1, prefix, il, "weight")); + 
layer.ff_norm_1_b = get_tensor(string_format(TN_FFN_NORM_1, prefix, il, "bias"), false); + layer.ff_up_1_w = get_tensor(string_format(TN_FFN_UP_1, prefix, il, "weight")); + layer.ff_up_1_b = get_tensor(string_format(TN_FFN_UP_1, prefix, il, "bias"), false); + layer.ff_down_1_w = get_tensor(string_format(TN_FFN_DOWN_1, prefix, il, "weight")); + layer.ff_down_1_b = get_tensor(string_format(TN_FFN_DOWN_1, prefix, il, "bias"), false); + + // Layer norms + layer.norm_conv_w = get_tensor(string_format(TN_NORM_CONV, prefix, il, "weight")); + layer.norm_conv_b = get_tensor(string_format(TN_NORM_CONV, prefix, il, "bias"), false); + } + + model.mm_model_mlp_1_w = get_tensor(string_format(TN_MVLM_PROJ_MLP, 0, "weight")); + model.mm_model_mlp_2_w = get_tensor(string_format(TN_MVLM_PROJ_MLP, 1, "weight")); + model.mm_model_mlp_3_w = get_tensor(string_format(TN_MVLM_PROJ_MLP, 3, "weight")); + + } break; default: GGML_ASSERT(false && "unknown projector type"); } @@ -3105,6 +3181,10 @@ int clip_n_output_tokens(const struct clip_ctx * ctx, struct clip_image_f32 * im } n_patches = n; } break; + case PROJECTOR_TYPE_PARAKEET: + { + n_patches = (img->nx + (params.subsampling_factor - 1)) / params.subsampling_factor; + } break; default: GGML_ABORT("unsupported projector type"); } @@ -3225,7 +3305,9 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima const auto & mel_inp = imgs.entries[0]; const int n_step = mel_inp->nx; const int n_mel = mel_inp->ny; + std::vector inp_raw(n_step * n_mel); + std::memcpy(inp_raw.data(), mel_inp->buf.data(), n_step * n_mel * sizeof(float)); set_input_f32("inp_raw", inp_raw); } @@ -3701,6 +3783,23 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima } set_input_f32("pos_emb", pos_emb); } break; + case PROJECTOR_TYPE_PARAKEET: + { + struct ggml_tensor * attn_mask = ggml_graph_get_tensor(gf, "attn_mask"); + const int n_q = attn_mask->ne[1]; + const int n_k = attn_mask->ne[0]; + const int 
n_tokens_real = (imgs.entries[0]->nx + hparams.subsampling_factor-1) / hparams.subsampling_factor; /* real token count derived from the input's mel frame count (nx) instead of a fixed clip length */ + const float mask_value = -1e30f; + + std::vector mask_data(n_q * n_k); + for (int q = 0; q < n_q; ++q) { + for (int k = 0; k < n_k; ++k) { + bool is_padding = (k >= n_tokens_real); + mask_data[q * n_k + k] = (is_padding) ? mask_value : 0.0f; + } + } + ggml_backend_tensor_set(attn_mask, mask_data.data(), 0, mask_data.size() * sizeof(float)); + } break; default: GGML_ABORT("Unknown projector type"); } @@ -3851,6 +3950,8 @@ int clip_n_mmproj_embd(const struct clip_ctx * ctx) { return ctx->model.hparams.projection_dim; case PROJECTOR_TYPE_GLM4V: return ctx->model.mm_ffn_down_w->ne[1]; + case PROJECTOR_TYPE_PARAKEET: + return ctx->model.mm_1_w->ne[1]; default: GGML_ABORT("Unknown projector type"); } @@ -3932,6 +4033,14 @@ const clip_hparams * clip_get_hparams(const struct clip_ctx * ctx) { return &ctx->model.hparams; } +struct ggml_tensor * clip_get_mel_filter_tensor(const struct clip_ctx * ctx) { + return ctx->model.mel_filters; +} + +struct ggml_tensor * clip_get_window_tensor(const struct clip_ctx * ctx) { + return ctx->model.window; +} + // // API for debugging // diff --git a/tools/mtmd/clip.h b/tools/mtmd/clip.h index a859b38658d3..e525a5224be6 100644 --- a/tools/mtmd/clip.h +++ b/tools/mtmd/clip.h @@ -116,3 +116,7 @@ void clip_image_f32_batch_add_mel(struct clip_image_f32_batch * batch, int n_mel bool clip_has_vision_encoder(const struct clip_ctx * ctx); bool clip_has_audio_encoder(const struct clip_ctx * ctx); bool clip_has_whisper_encoder(const struct clip_ctx * ctx); + +struct ggml_tensor * clip_get_mel_filter_tensor(const struct clip_ctx * ctx); + +struct ggml_tensor * clip_get_window_tensor(const struct clip_ctx * ctx); diff --git a/tools/mtmd/models/models.h b/tools/mtmd/models/models.h index c30d79133efe..7dbf0345e7e2 100644 --- a/tools/mtmd/models/models.h +++ b/tools/mtmd/models/models.h @@ -171,3 +171,11 @@ struct clip_graph_kimik25 : clip_graph { ggml_tensor * 
resize_position_embeddings_3d(uint32_t interpolation_mode); }; + +struct clip_graph_parakeet : clip_graph { + clip_graph_parakeet(clip_ctx * ctx, const clip_image_f32 & img) : clip_graph(ctx, img) {} + ggml_cgraph * build() override; + + ggml_tensor * parakeet_build_graph_conv(); + ggml_tensor * parakeet_build_graph_encoder(ggml_tensor * cur); +}; diff --git a/tools/mtmd/models/parakeet.cpp b/tools/mtmd/models/parakeet.cpp new file mode 100644 index 000000000000..3a98c7f81f2f --- /dev/null +++ b/tools/mtmd/models/parakeet.cpp @@ -0,0 +1,329 @@ +#include "models.h" + +ggml_cgraph * clip_graph_parakeet::build() { + // Build convolution graph + ggml_tensor * cur = parakeet_build_graph_conv(); + ggml_build_forward_expand(gf, cur); + + // Build encoder graph + cur = parakeet_build_graph_encoder(cur); + + cur = ggml_rms_norm(ctx0, cur, 1e-6); + cur = ggml_mul(ctx0, cur, model.mm_norm_pre_w); + cb(cur, "sound_projection.norm", -1); + + cur = build_ffn(cur, model.mm_0_w, model.mm_0_b, nullptr, nullptr, model.mm_1_w, model.mm_1_b, FFN_RELU_SQR, -1); + cb(cur, "projected", -1); + + ggml_build_forward_expand(gf, cur); + + return gf; +} + +ggml_tensor * clip_graph_parakeet::parakeet_build_graph_conv() { + ggml_tensor * inp = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, img.ny, img.nx, 1); + ggml_set_name(inp, "inp_raw"); + ggml_set_input(inp); + + // [freq, time, channels, batch] + ggml_tensor * cur = ggml_conv_2d(ctx0, model.pre_encode_conv_X_w[0], inp, 2, 2, 1, 1, 1, 1); + cur = ggml_add(ctx0, cur, model.pre_encode_conv_X_b[0]); + cb(cur, "pre_conv_0", -1); + ggml_set_output(cur); + + cur = ggml_relu(ctx0, cur); + cb(cur, "pre_conv_0_relu", -1); + + // [freq, time, channels, batch] + cur = ggml_conv_2d_dw_direct(ctx0, model.pre_encode_conv_X_w[2], cur, 2, 2, 1, 1, 1, 1); + cur = ggml_add(ctx0, cur, model.pre_encode_conv_X_b[2]); + cb(cur, "pre_conv_2", -1); + + // [freq, time, channels, batch] + cur = ggml_conv_2d(ctx0, model.pre_encode_conv_X_w[3], cur, 1, 1, 0, 0, 1, 1); + cur 
= ggml_add(ctx0, cur, model.pre_encode_conv_X_b[3]); + cb(cur, "pre_conv_3", -1); + + cur = ggml_relu(ctx0, cur); + cb(cur, "pre_conv_3_relu", -1); + + // [freq, time, channels, batch] + cur = ggml_conv_2d_dw_direct(ctx0, model.pre_encode_conv_X_w[5], cur, 2, 2, 1, 1, 1, 1); + cb(cur, "pre_conv_5_direct", -1); + cur = ggml_add(ctx0, cur, model.pre_encode_conv_X_b[5]); + cb(cur, "pre_conv_5", -1); + + // [freq, time, channels, batch] + cur = ggml_conv_2d(ctx0, model.pre_encode_conv_X_w[6], cur, 1, 1, 0, 0, 1, 1); + cur = ggml_add(ctx0, cur, model.pre_encode_conv_X_b[6]); + cb(cur, "pre_conv_6", -1); + + cur = ggml_relu(ctx0, cur); + cb(cur, "pre_conv_6_relu", -1); + + // [freq, time, chan] + cur = ggml_permute(ctx0, cur, 0, 2, 1, 3); + // [freq, chan, time] + cur = ggml_cont(ctx0, cur); + + const int n_freq = cur->ne[0]; + const int n_chan = cur->ne[1]; + const int n_frames = cur->ne[2]; + + // [freq, time, chan, batch] -> [(freq * chan), time] + cur = ggml_reshape_2d(ctx0, cur, n_freq * n_chan, n_frames); + + cur = ggml_mul_mat(ctx0, model.pre_encode_out_w, cur); + cur = ggml_add(ctx0, cur, model.pre_encode_out_b); + + ggml_set_name(cur, "pre_enc_out"); + ggml_set_output(cur); + + return cur; +} + +ggml_tensor * clip_graph_parakeet::parakeet_build_graph_encoder(ggml_tensor * cur) { + const auto & hparams = model.hparams; + const int n_layer = hparams.n_layer; + const int n_state = hparams.n_embd; + const float fc_factor = 0.5f; + + // [time_frames, time_frames, 1, 1]] + struct ggml_tensor * attn_mask = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, cur->ne[1], cur->ne[1]); + ggml_set_name(attn_mask, "attn_mask"); + ggml_set_input(attn_mask); + + for (int il = 0; il < n_layer; ++il) { + const auto & layer = model.layers[il]; + // FFN1 + { + struct ggml_tensor * residual = cur; + //ggml_format_name(cur, "enc_%d_res", il); + + // norm + cur = ggml_norm(ctx0, cur, 1e-5); + cur = ggml_add(ctx0, ggml_mul(ctx0, cur, layer.ff_norm_w), layer.ff_norm_b); + ggml_format_name(cur, 
"enc_%d_ffn_norm_1", il); + + // ffn_1 + cur = ggml_mul_mat(ctx0, layer.ff_up_w, cur); + cur = ggml_silu(ctx0, cur); + ggml_format_name(cur, "enc_%d_silu", il); + + cur = ggml_mul_mat(ctx0, layer.ff_down_w, cur); + ggml_format_name(cur, "enc_%d_ffn_1", il); + + cur = ggml_add(ctx0, residual, ggml_scale(ctx0, cur, fc_factor)); + ggml_format_name(cur, "enc_%d_res_ffn", il); + } + + // self attention block using relative positional encoding from model.position_embedding. + { + // [feat, time_frames, 1, 1] + struct ggml_tensor * residual = cur; + + cur = ggml_norm(ctx0, cur, hparams.eps); + cur = ggml_add(ctx0, ggml_mul(ctx0, cur, layer.ln_1_w), layer.ln_1_b); + ggml_format_name(cur, "enc_%d_attn_norm", il); + + const int n_head = hparams.n_head; + const int d_head = n_state / n_head; + const int n_time = cur->ne[1]; + + // [feat, time_frames, 1, 1] + struct ggml_tensor * Q_cur = ggml_mul_mat(ctx0, layer.q_w, cur); + struct ggml_tensor * K_cur = ggml_mul_mat(ctx0, layer.k_w, cur); + struct ggml_tensor * V_cur = ggml_mul_mat(ctx0, layer.v_w, cur); + + // [d_head, n_heads, time_frames, 1] + Q_cur = ggml_reshape_3d(ctx0, Q_cur, d_head, n_head, n_time); + K_cur = ggml_reshape_3d(ctx0, K_cur, d_head, n_head, n_time); + V_cur = ggml_reshape_3d(ctx0, V_cur, d_head, n_head, n_time); + + const int input_len = cur->ne[1]; + const int center_pos = model.position_embeddings->ne[1] / 2 + 1; + const int start_pos = center_pos - input_len; + const int window_size = 2 * input_len - 1; + + const size_t offset = start_pos * model.position_embeddings->nb[1]; + + // [feat, window_size] + struct ggml_tensor * pos_emb = ggml_view_2d(ctx0, model.position_embeddings, + n_state, window_size, + model.position_embeddings->nb[1], offset); + ggml_format_name(pos_emb, "enc_%d_attn_pos_emb", il); + + struct ggml_tensor * pos = ggml_mul_mat(ctx0, layer.linear_pos_w, pos_emb); + ggml_format_name(pos, "enc_%d_attn_pos", il); + + // Add the content bias to Q. 
+ // [feat, head, time_frames, batch] + struct ggml_tensor * Q_u = ggml_add(ctx0, Q_cur, layer.pos_bias_u); + ggml_format_name(Q_u, "enc_%d_attn_q_u", il); + + // [feat, time_frames, head, 1] + struct ggml_tensor * K_prep = ggml_permute(ctx0, K_cur, 0, 2, 1, 3); + // [feat, time_frames, head, 1] + struct ggml_tensor * Q_prep = ggml_permute(ctx0, Q_u, 0, 2, 1, 3); + // [feat, feat, head, 1] + struct ggml_tensor * content_scores = ggml_mul_mat(ctx0, K_prep, Q_prep); + ggml_format_name(content_scores, "enc_%d_attn_content_scores", il); + + // Add the position bias to Q. + // [feat, head, time_frames, batch] + struct ggml_tensor * Q_v = ggml_add(ctx0, Q_cur, layer.pos_bias_v); + ggml_format_name(Q_v, "enc_%d_attn_q_v", il); + + // [feat, window_size, 1, 1] and we are doing multi-head attention so + // we need to split this into heads. + // [feat, head, window_size, 1] + pos = ggml_reshape_3d(ctx0, pos, d_head, n_head, pos_emb->ne[1]); + + // [feat, window_size, head, 1] + pos = ggml_permute(ctx0, pos, 0, 2, 1, 3); + pos = ggml_cont(ctx0, pos); + ggml_format_name(pos, "enc_%d_attn_pos_perm", il); + // [feat, time, head, 1] + Q_v = ggml_permute(ctx0, Q_v, 0, 2, 1, 3); + Q_v = ggml_cont(ctx0, Q_v); + ggml_format_name(Q_v, "enc_%d_attn_q_v_perm", il); + + // [window_size, time_frames, head, 1] + struct ggml_tensor * rel_pos_scores = ggml_mul_mat(ctx0, pos, Q_v); + ggml_format_name(rel_pos_scores, "enc_%d_attn_rel_pos", il); + + // Relative positional shift + { + + const auto pos_window = rel_pos_scores->ne[0]; + const auto n_frame = rel_pos_scores->ne[1]; + const auto n_head = rel_pos_scores->ne[2]; + + // [feat_padded, window_size, head, 1] + rel_pos_scores = ggml_pad(ctx0, rel_pos_scores, 1, 0, 0, 0); + rel_pos_scores = ggml_roll(ctx0, rel_pos_scores, 1, 0, 0, 0); + + rel_pos_scores = ggml_reshape_3d(ctx0, rel_pos_scores, n_frame, pos_window + 1, n_head); + rel_pos_scores = ggml_cont(ctx0, rel_pos_scores); + ggml_format_name(rel_pos_scores, 
"enc_%d_attn_rel_pos_reshaped", il); + + int center = pos_window / 2; + size_t offset = rel_pos_scores->nb[0] * (center+1); + + rel_pos_scores = ggml_view_3d(ctx0, rel_pos_scores, + n_frame, pos_window, n_head, + (pos_window) * 4, + rel_pos_scores->nb[2], + offset); + + rel_pos_scores = ggml_cont(ctx0, rel_pos_scores); + ggml_format_name(rel_pos_scores, "enc_%d_attn_rel_pos_shifted", il); + + rel_pos_scores = ggml_view_3d(ctx0, rel_pos_scores, + content_scores->ne[0], + content_scores->ne[1], + rel_pos_scores->ne[2], + rel_pos_scores->nb[1], + rel_pos_scores->nb[2], + 0); + rel_pos_scores = ggml_cont(ctx0, rel_pos_scores); + ggml_format_name(rel_pos_scores, "enc_%d_attn_rel_pos_shifted_view", il); + } + + struct ggml_tensor * attn_scores = ggml_add(ctx0, content_scores, rel_pos_scores); + attn_scores = ggml_cont(ctx0, attn_scores); + ggml_format_name(attn_scores, "enc_%d_attn_scores", il); + attn_scores = ggml_scale(ctx0, attn_scores, 1.0f / std::sqrt(d_head)); + attn_scores = ggml_add(ctx0, attn_scores, attn_mask); + ggml_format_name(attn_scores, "enc_%d_attn_scores_scaled", il); + + struct ggml_tensor * probs = ggml_soft_max(ctx0, attn_scores); + ggml_format_name(probs, "enc_%d_attn_probs", il); + + V_cur = ggml_cont(ctx0, ggml_permute(ctx0, V_cur, 1, 2, 0, 3)); + ggml_format_name(V_cur, "enc_%d_attn_v_cur", il); + cur = ggml_mul_mat(ctx0, probs, V_cur); + ggml_format_name(cur, "enc_%d_attn_inp", il); + + cur = ggml_permute(ctx0, cur, 2, 0, 1, 3); + cur = ggml_cont_2d(ctx0, cur, n_state, n_time); + cur = ggml_mul_mat(ctx0, layer.o_w, cur); + ggml_format_name(cur, "enc_%d_attn_out", il); + + cur = ggml_add(ctx0, residual, cur); + ggml_format_name(cur, "enc_%d_attn_res", il); + } + + // Convolution + { + struct ggml_tensor * residual = cur; + ggml_format_name(cur, "enc_%d_residual_conv", il); + + cur = ggml_norm(ctx0, cur, hparams.eps); + cur = ggml_add(ctx0, ggml_mul(ctx0, cur, layer.norm_conv_w), layer.norm_conv_b); + ggml_format_name(cur, "enc_%d_norm_conv", 
il); + + // pointwise 1d convolution: + cur = ggml_mul_mat(ctx0, layer.conv_pw1_w, cur); + ggml_format_name(cur, "enc_%d_conv_pw1", il); + + { + int64_t d = cur->ne[0] / 2; + struct ggml_tensor * signal = ggml_view_2d(ctx0, cur, d, cur->ne[1], cur->nb[1], 0); + struct ggml_tensor * gate = ggml_view_2d(ctx0, cur, d, cur->ne[1], cur->nb[1], d * cur->nb[0]); + + cur = ggml_mul(ctx0, signal, ggml_sigmoid(ctx0, gate)); + ggml_format_name(cur, "enc_%d_conv_glu", il); + } + + cur = ggml_cont(ctx0, ggml_transpose(ctx0, cur)); + + // use ggml_ssm_conv for f32 precision + cur = ggml_pad(ctx0, cur, 4, 0, 0, 0); + cur = ggml_roll(ctx0, cur, 4, 0, 0, 0); + cur = ggml_pad(ctx0, cur, 4, 0, 0, 0); + ggml_format_name(cur, "enc_%d_conv_dw_pad", il); + + cur = ggml_ssm_conv(ctx0, cur, layer.conv_dw_w); + ggml_format_name(cur, "enc_%d_conv_1d_dw", il); + + cur = ggml_sub(ctx0, cur, layer.conv_norm_mean); + struct ggml_tensor * std = ggml_sqrt(ctx0, layer.conv_norm_var); + cur = ggml_div(ctx0, cur, std); + cur = ggml_add(ctx0, ggml_mul(ctx0, cur, layer.conv_norm_w), layer.conv_norm_b); + ggml_format_name(cur, "enc_%d_conv_bn", il); + + cur = ggml_silu(ctx0, cur); + ggml_format_name(cur, "enc_%d_conv_silu", il); + + cur = ggml_mul_mat(ctx0, layer.conv_pw2_w, cur); + ggml_format_name(cur, "enc_%d_conv_pw2", il); + + cur = ggml_add(ctx0, residual, cur); + ggml_format_name(cur, "enc_%d_conv_res", il); + } + + // FFN2 + { + struct ggml_tensor * residual = cur; + cur = ggml_norm(ctx0, cur, hparams.eps); + cur = ggml_add(ctx0, ggml_mul(ctx0, cur, layer.ff_norm_1_w), layer.ff_norm_1_b); + ggml_format_name(cur, "enc_%d_ffn_norm_2", il); + + cur = ggml_mul_mat(ctx0, layer.ff_up_1_w, cur); + cur = ggml_silu(ctx0, cur); + cur = ggml_mul_mat(ctx0, layer.ff_down_1_w, cur); + cur = ggml_add(ctx0, residual, ggml_scale(ctx0, cur, 0.5)); + ggml_format_name(cur, "enc_%d_ffn_res", il); + } + + cur = ggml_norm(ctx0, cur, hparams.eps); + cur = ggml_add(ctx0, ggml_mul(ctx0, cur, layer.ln_2_w), layer.ln_2_b); 
+ } + + cb(cur, "encoder_out", -1); + + ggml_build_forward_expand(gf, cur); + + return cur; +} diff --git a/tools/mtmd/mtmd-audio.cpp b/tools/mtmd/mtmd-audio.cpp index 38a8ce4f4a69..cb1374e8f859 100644 --- a/tools/mtmd/mtmd-audio.cpp +++ b/tools/mtmd/mtmd-audio.cpp @@ -731,6 +731,232 @@ bool mtmd_audio_preprocessor_gemma4a::preprocess(const float * s return true; } +// +// mtmd_audio_preprocessor_parakeet implementation +// + +static void log_mel_spectrogram_parakeet_worker_thread( + int ith, + const float * window_func, + int window_size, + const std::vector & samples, + int n_samples, + int frame_size, + int frame_step, + int n_threads, + const filter_params & params, + const mtmd_audio_cache & cache, + mtmd_audio_mel & mel) { + std::vector fft_in(frame_size * 2, 0.0); + std::vector fft_out(frame_size * 2 * 2 * 2); + + int n_fb = params.n_fft_bins; + int i = ith; + + GGML_ASSERT(n_fb == 1 + (frame_size / 2)); + + const double eps = 5.960464477539063e-08; + + for (; i < std::min(n_samples / frame_step + 1, mel.n_len); i += n_threads) { + const int offset = i * frame_step; + const int window_pad_left = (frame_size - window_size) / 2; + + // Zero-pad left. + std::fill(fft_in.begin(), fft_in.begin() + window_pad_left, 0.0f); + + // Apply windowed samples in the center. + const int n_to_process = std::min({window_size, n_samples - offset}); + for (int j = 0; j < n_to_process; j++) { + fft_in[window_pad_left + j] = window_func[j] * samples[offset + window_pad_left + j]; + } + + // Zero-pad right. + std::fill(fft_in.begin() + window_pad_left + n_to_process, fft_in.begin() + frame_size, 0.0f); + + // FFT. + fft(cache, fft_in.data(), frame_size, fft_out.data()); + + // Calculate modulus^2 of complex numbers. + for (int j = 0; j < n_fb; j++) { + fft_out[j] = (fft_out[2 * j + 0] * fft_out[2 * j + 0] + fft_out[2 * j + 1] * fft_out[2 * j + 1]); + } + + // mel spectrogram. 
+ for (int j = 0; j < mel.n_mel; j++) { + double sum = 0.0; + int k = 0; + for (k = 0; k < n_fb - 3; k += 4) { + sum += + fft_out[k + 0] * cache.filters.data[j * n_fb + k + 0] + + fft_out[k + 1] * cache.filters.data[j * n_fb + k + 1] + + fft_out[k + 2] * cache.filters.data[j * n_fb + k + 2] + + fft_out[k + 3] * cache.filters.data[j * n_fb + k + 3]; + } + for (; k < n_fb; k++) { + sum += fft_out[k] * cache.filters.data[j * n_fb + k]; + } + mel.data[i * mel.n_mel + j] = std::log(sum + eps); + } + } + + // Otherwise fft_out are all zero. + const double empty_sum = std::log(eps); + for (; i < mel.n_len; i += n_threads) { + for (int j = 0; j < mel.n_mel; j++) { + mel.data[i * mel.n_mel + j] = empty_sum; + } + } +} + +void mtmd_audio_preprocessor_parakeet::initialize() { + cache.fill_sin_cos_table(hparams.audio_n_fft); + + // Use models mel filter bank tensor. + ggml_tensor * mel_filters = clip_get_mel_filter_tensor(ctx); + GGML_ASSERT(mel_filters); + + cache.filters.n_mel = mel_filters->ne[1]; + cache.filters.n_fft = mel_filters->ne[0]; + cache.filters.data.resize(ggml_nelements(mel_filters)); + ggml_backend_tensor_get(mel_filters, cache.filters.data.data(), 0, ggml_nbytes(mel_filters)); + + // Use models hann window tensor. 
+ ggml_tensor * window = clip_get_window_tensor(ctx); + GGML_ASSERT(window); + cache.hann_window.resize(ggml_nelements(window)); + ggml_backend_tensor_get(window, cache.hann_window.data(), 0, ggml_nbytes(window)); +} + +bool mtmd_audio_preprocessor_parakeet::preprocess(const float * samples, + size_t n_samples_in, + std::vector & output) { + if (n_samples_in == 0) { + return false; + } + + filter_params params; + params.n_mel = hparams.n_mel_bins; + params.n_fft_bins = 1 + (hparams.audio_n_fft / 2); + params.hann_window_size = hparams.audio_window_len; + params.hop_length = hparams.audio_hop_len; + params.sample_rate = hparams.audio_sample_rate; + + GGML_ASSERT(!cache.sin_vals.empty()); + GGML_ASSERT(!cache.cos_vals.empty()); + GGML_ASSERT(!cache.filters.data.empty()); + + const float * window_func = cache.hann_window.data(); + const int window_size = params.hann_window_size; + const int frame_size = (params.n_fft_bins - 1) * 2; + const int frame_step = params.hop_length; + + // Apply preemphasis filter (high-pass): x[i] = x[i] - 0.97 * x[i-1] + std::vector samples_preprocessed(samples, samples + n_samples_in); + { + const float preemph = 0.97f; + for (int i = n_samples_in - 1; i > 0; i--) { + samples_preprocessed[i] = samples_preprocessed[i] - preemph * samples_preprocessed[i - 1]; + } + } + + // Parakeet uses centered constant padding + const int pad = frame_size / 2; + std::vector samples_padded(n_samples_in + 2 * pad); + std::fill(samples_padded.begin(), samples_padded.begin() + pad, 0.0f); + std::fill(samples_padded.begin() + pad + n_samples_in, samples_padded.end(), 0.0f); + std::copy(samples_preprocessed.begin(), samples_preprocessed.end(), samples_padded.begin() + pad); + + mtmd_audio_mel out_full; + out_full.n_mel = params.n_mel; + out_full.n_len = (samples_padded.size() - frame_size) / frame_step + 1; + out_full.n_len_org = out_full.n_len; + out_full.data.resize(out_full.n_mel * out_full.n_len); + + const int n_threads = 4; + + if (n_threads == 1) { + 
log_mel_spectrogram_parakeet_worker_thread(0, + window_func, + window_size, + samples_padded, + samples_padded.size(), + frame_size, + frame_step, + 1, + params, + cache, + out_full); + } else { + std::vector workers(n_threads - 1); + for (int iw = 0; iw < n_threads - 1; ++iw) { + workers[iw] = std::thread( + log_mel_spectrogram_parakeet_worker_thread, iw + 1, + window_func, + window_size, + std::cref(samples_padded), + samples_padded.size(), + frame_size, + frame_step, + n_threads, + std::cref(params), + std::cref(cache), + std::ref(out_full) + ); + } + + log_mel_spectrogram_parakeet_worker_thread(0, + window_func, + window_size, + samples_padded, + samples_padded.size(), + frame_size, + frame_step, + n_threads, + params, + cache, + out_full); + + for (int iw = 0; iw < n_threads - 1; ++iw) { + workers[iw].join(); + } + } + + // Per-feature normalization (only on valid frames) + { + const double eps = 1e-5; + int valid_frames = n_samples_in / frame_step; + + for (int j = 0; j < out_full.n_mel; j++) { + double sum = 0.0; + double sq_diff_sum = 0.0; + + // Calculate Mean ONLY on valid audio frames + for (int i = 0; i < valid_frames; i++) { + sum += (double)out_full.data[i * out_full.n_mel + j]; + } + double mean = sum / valid_frames; + + // Calculate Variance ONLY on valid audio frames + for (int i = 0; i < valid_frames; i++) { + double diff = (double)out_full.data[i * out_full.n_mel + j] - mean; + sq_diff_sum += diff * diff; + } + + double std_dev = std::sqrt(sq_diff_sum / (valid_frames - 1.0)); + double denominator = std_dev + eps; + + // Apply to ALL frames (including the padded ones) + for (int i = 0; i < out_full.n_len; i++) { + out_full.data[i * out_full.n_mel + j] = (float)((out_full.data[i * out_full.n_mel + j] - mean) / denominator); + } + } + } + + output.push_back(std::move(out_full)); + return true; +} + + // // mtmd_audio_streaming_istft implementation // diff --git a/tools/mtmd/mtmd-audio.h b/tools/mtmd/mtmd-audio.h index efaa14f924fc..a3c9d1fc935c 
100644 --- a/tools/mtmd/mtmd-audio.h +++ b/tools/mtmd/mtmd-audio.h @@ -87,6 +87,16 @@ struct mtmd_audio_preprocessor_gemma4a : mtmd_audio_preprocessor { mtmd_audio_cache cache; }; +struct mtmd_audio_preprocessor_parakeet : mtmd_audio_preprocessor { + mtmd_audio_preprocessor_parakeet(const clip_ctx * ctx) : mtmd_audio_preprocessor(ctx), ctx(ctx) { } + void initialize() override; + bool preprocess(const float * samples, size_t n_samples, std::vector & output) override; + + private: + mtmd_audio_cache cache; + const clip_ctx * ctx; +}; + // // streaming ISTFT - converts spectrogram frames back to audio one frame at a time // diff --git a/tools/mtmd/mtmd.cpp b/tools/mtmd/mtmd.cpp index 59907786786d..a838d1ca910e 100644 --- a/tools/mtmd/mtmd.cpp +++ b/tools/mtmd/mtmd.cpp @@ -538,6 +538,10 @@ struct mtmd_context { aud_end = ""; audio_preproc = std::make_unique(ctx_a); } break; + case PROJECTOR_TYPE_PARAKEET: + { + audio_preproc = std::make_unique(ctx_a); + } break; default: throw std::runtime_error(string_format("%s: unexpected audio projector type %d\n", __func__, proj)); } From e557406b1535466e8a63533e887e3c6b6c8030ca Mon Sep 17 00:00:00 2001 From: Daniel Bevenius Date: Thu, 30 Apr 2026 12:46:56 +0200 Subject: [PATCH 02/11] mtmd : generate rel pos tensor in graph instead of in conversion [no ci] This commit removes the generation of the relative positional tensor in the model conversion script and instead computes it in the encoder graph. This is only done for the window of positions required for the current audio sample. 
--- convert_hf_to_gguf.py | 26 -------------------------- tools/mtmd/clip.cpp | 27 +++++++++++++++++++++++++++ tools/mtmd/models/parakeet.cpp | 34 +++++++++++++++++++++------------- 3 files changed, 48 insertions(+), 39 deletions(-) diff --git a/convert_hf_to_gguf.py b/convert_hf_to_gguf.py index 667cc7ab345b..4b6b303addf7 100755 --- a/convert_hf_to_gguf.py +++ b/convert_hf_to_gguf.py @@ -4491,9 +4491,6 @@ def set_gguf_parameters(self): self.gguf_writer.add_vision_projector_scale_factor(int(1.0 / downsample_ratio)) def tensor_force_quant(self, name, new_name, bid, n_dims): - if ".position_embd." in new_name or "pos_embed" in new_name: - return gguf.GGMLQuantizationType.F32 - if "sound_encoder" in name or new_name.startswith("mm.a."): if "bias" in new_name or "norm" in new_name: return gguf.GGMLQuantizationType.F32 @@ -4502,29 +4499,6 @@ def tensor_force_quant(self, name, new_name, bid, n_dims): return super().tensor_force_quant(name, new_name, bid, n_dims) - def generate_extra_tensors(self) -> Iterable[tuple[str, Tensor]]: - sound_config = self.global_config.get("sound_config") - if sound_config: - # Generate relative position embeddings. 
- d_model = sound_config.get("hidden_size") - max_len = 5000 * 2 - 1 - - pe = torch.zeros(max_len, d_model, dtype=torch.float32) - log_10000 = math.log(10000.0) - - for idx in range(max_len): - position = float((max_len // 2) - idx) - - for i in range(0, d_model, 2): - div_term = math.exp(-(float(i) * log_10000 / float(d_model))) - angle = position * div_term - - pe[idx, i] = math.sin(angle) - if i + 1 < d_model: - pe[idx, i + 1] = math.cos(angle) - - yield ("a.position_embd.weight", pe) - def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: if "input_conditioner" in name: return diff --git a/tools/mtmd/clip.cpp b/tools/mtmd/clip.cpp index 971b69b374e9..f0d9dd22fc0e 100644 --- a/tools/mtmd/clip.cpp +++ b/tools/mtmd/clip.cpp @@ -3799,6 +3799,33 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima } } ggml_backend_tensor_set(attn_mask, mask_data.data(), 0, mask_data.size() * sizeof(float)); + + // Generate rotation frequencies for relative positional encoding. + { + struct ggml_tensor * pos_freqs_t = ggml_graph_get_tensor(gf, "pos_freqs"); + const int d_half = pos_freqs_t->ne[0]; + const int n_state = hparams.n_embd; + const float log_10000 = logf(10000.0f); + std::vector freqs(d_half); + for (int k = 0; k < d_half; ++k) { + freqs[k] = expf(-(float(k * 2) * log_10000 / float(n_state))); + } + ggml_backend_tensor_set(pos_freqs_t, freqs.data(), 0, freqs.size() * sizeof(float)); + } + + // Generate relative positional distance values which scaled by + // the frequency to produce the angles for sin/cos. + { + struct ggml_tensor * rel_pos_t = ggml_graph_get_tensor(gf, "rel_positions"); + const int window_size = rel_pos_t->ne[1]; + const int n_time = (window_size + 1) / 2; + std::vector pos(window_size); + for (int t = 0; t < window_size; ++t) { + // The range of the values is high to low which the original model has. 
+ pos[t] = float(n_time - 1 - t); + } + ggml_backend_tensor_set(rel_pos_t, pos.data(), 0, pos.size() * sizeof(float)); + } } break; default: GGML_ABORT("Unknown projector type"); diff --git a/tools/mtmd/models/parakeet.cpp b/tools/mtmd/models/parakeet.cpp index 3a98c7f81f2f..a77371eb96b2 100644 --- a/tools/mtmd/models/parakeet.cpp +++ b/tools/mtmd/models/parakeet.cpp @@ -93,6 +93,26 @@ ggml_tensor * clip_graph_parakeet::parakeet_build_graph_encoder(ggml_tensor * cu ggml_set_name(attn_mask, "attn_mask"); ggml_set_input(attn_mask); + const int n_time = cur->ne[1]; + const int window_size = 2 * n_time - 1; + const int d_half = n_state / 2; + + struct ggml_tensor * pos_freqs = ggml_new_tensor_1d(ctx0, GGML_TYPE_F32, d_half); + ggml_set_name(pos_freqs, "pos_freqs"); + ggml_set_input(pos_freqs); + + struct ggml_tensor * rel_positions = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, 1, window_size); + ggml_set_name(rel_positions, "rel_positions"); + ggml_set_input(rel_positions); + + struct ggml_tensor * freqs = ggml_repeat_4d(ctx0, pos_freqs, d_half, window_size, 1, 1); + struct ggml_tensor * theta = ggml_mul(ctx0, freqs, rel_positions); + + struct ggml_tensor * sin = ggml_reshape_3d(ctx0, ggml_sin(ctx0, theta), 1, d_half, window_size); + struct ggml_tensor * cos = ggml_reshape_3d(ctx0, ggml_cos(ctx0, theta), 1, d_half, window_size); + struct ggml_tensor * pos_emb = ggml_reshape_2d(ctx0, ggml_cont(ctx0, ggml_concat(ctx0, sin, cos, 0)), n_state, window_size); + ggml_set_name(pos_emb, "pos_emb"); + for (int il = 0; il < n_layer; ++il) { const auto & layer = model.layers[il]; // FFN1 @@ -140,19 +160,7 @@ ggml_tensor * clip_graph_parakeet::parakeet_build_graph_encoder(ggml_tensor * cu K_cur = ggml_reshape_3d(ctx0, K_cur, d_head, n_head, n_time); V_cur = ggml_reshape_3d(ctx0, V_cur, d_head, n_head, n_time); - const int input_len = cur->ne[1]; - const int center_pos = model.position_embeddings->ne[1] / 2 + 1; - const int start_pos = center_pos - input_len; - const int window_size = 2 
* input_len - 1; - - const size_t offset = start_pos * model.position_embeddings->nb[1]; - - // [feat, window_size] - struct ggml_tensor * pos_emb = ggml_view_2d(ctx0, model.position_embeddings, - n_state, window_size, - model.position_embeddings->nb[1], offset); - ggml_format_name(pos_emb, "enc_%d_attn_pos_emb", il); - + // [n_state, window_size] struct ggml_tensor * pos = ggml_mul_mat(ctx0, layer.linear_pos_w, pos_emb); ggml_format_name(pos, "enc_%d_attn_pos", il); From a9929d4258b0b496680cbd35fa74ceb2810c488c Mon Sep 17 00:00:00 2001 From: Daniel Bevenius Date: Thu, 30 Apr 2026 14:47:52 +0200 Subject: [PATCH 03/11] mtmd : add clip_get_model to clip API [no ci] This commit adds a function to get access to the clip_model. It also removes the two functions clip_get_mel_filter_tensor, and clip_get_window_tensor(const struct clip_ctx * ctx) which can now use clip_get_model to access the model tensors that it needs. --- tools/mtmd/clip.cpp | 8 ++------ tools/mtmd/clip.h | 4 +--- tools/mtmd/mtmd-audio.cpp | 5 +++-- 3 files changed, 6 insertions(+), 11 deletions(-) diff --git a/tools/mtmd/clip.cpp b/tools/mtmd/clip.cpp index f0d9dd22fc0e..b9ab66e57ad7 100644 --- a/tools/mtmd/clip.cpp +++ b/tools/mtmd/clip.cpp @@ -4060,12 +4060,8 @@ const clip_hparams * clip_get_hparams(const struct clip_ctx * ctx) { return &ctx->model.hparams; } -struct ggml_tensor * clip_get_mel_filter_tensor(const struct clip_ctx * ctx) { - return ctx->model.mel_filters; -} - -struct ggml_tensor * clip_get_window_tensor(const struct clip_ctx * ctx) { - return ctx->model.window; +const struct clip_model & clip_get_model(const struct clip_ctx * ctx) { + return ctx->model; } // diff --git a/tools/mtmd/clip.h b/tools/mtmd/clip.h index e525a5224be6..34ea60a41788 100644 --- a/tools/mtmd/clip.h +++ b/tools/mtmd/clip.h @@ -117,6 +117,4 @@ bool clip_has_vision_encoder(const struct clip_ctx * ctx); bool clip_has_audio_encoder(const struct clip_ctx * ctx); bool clip_has_whisper_encoder(const struct clip_ctx * 
ctx); -struct ggml_tensor * clip_get_mel_filter_tensor(const struct clip_ctx * ctx); - -struct ggml_tensor * clip_get_window_tensor(const struct clip_ctx * ctx); +const struct clip_model & clip_get_model(const struct clip_ctx * ctx); diff --git a/tools/mtmd/mtmd-audio.cpp b/tools/mtmd/mtmd-audio.cpp index cb1374e8f859..6595e3e921ba 100644 --- a/tools/mtmd/mtmd-audio.cpp +++ b/tools/mtmd/mtmd-audio.cpp @@ -812,7 +812,8 @@ void mtmd_audio_preprocessor_parakeet::initialize() { cache.fill_sin_cos_table(hparams.audio_n_fft); // Use models mel filter bank tensor. - ggml_tensor * mel_filters = clip_get_mel_filter_tensor(ctx); + const clip_model & model = clip_get_model(ctx); + ggml_tensor * mel_filters = model.mel_filters; GGML_ASSERT(mel_filters); cache.filters.n_mel = mel_filters->ne[1]; @@ -821,7 +822,7 @@ void mtmd_audio_preprocessor_parakeet::initialize() { ggml_backend_tensor_get(mel_filters, cache.filters.data.data(), 0, ggml_nbytes(mel_filters)); // Use models hann window tensor. - ggml_tensor * window = clip_get_window_tensor(ctx); + ggml_tensor * window = model.window; GGML_ASSERT(window); cache.hann_window.resize(ggml_nelements(window)); ggml_backend_tensor_get(window, cache.hann_window.data(), 0, ggml_nbytes(window)); From 8e279f49fcbc04325485f48b7cfc36c49fd5c34c Mon Sep 17 00:00:00 2001 From: Daniel Bevenius Date: Thu, 30 Apr 2026 15:35:39 +0200 Subject: [PATCH 04/11] mtmd : read mel_filters and window into hparams --- tools/mtmd/clip-model.h | 6 ++++-- tools/mtmd/clip.cpp | 26 +++++++++++++++++++------- tools/mtmd/clip.h | 2 -- tools/mtmd/mtmd-audio.cpp | 22 +++++++--------------- 4 files changed, 30 insertions(+), 26 deletions(-) diff --git a/tools/mtmd/clip-model.h b/tools/mtmd/clip-model.h index 62c40be8d45e..e142b7f74950 100644 --- a/tools/mtmd/clip-model.h +++ b/tools/mtmd/clip-model.h @@ -101,6 +101,10 @@ struct clip_hparams { int32_t audio_window_len = -1; int32_t audio_hop_len = -1; + // parakeet + std::vector mel_filters; + std::vector window; + // 
legacy bool has_llava_projector = false; int minicpmv_version = 0; @@ -492,8 +496,6 @@ struct clip_model { // Parakeet ggml_tensor * mm_norm_w = nullptr; - ggml_tensor * mel_filters = nullptr; - ggml_tensor * window = nullptr; int32_t n_sam_layers = 12; // used by deepseek-ocr sam encoder diff --git a/tools/mtmd/clip.cpp b/tools/mtmd/clip.cpp index b9ab66e57ad7..37cdae1de00e 100644 --- a/tools/mtmd/clip.cpp +++ b/tools/mtmd/clip.cpp @@ -2430,9 +2430,25 @@ struct clip_model_loader { } break; case PROJECTOR_TYPE_PARAKEET: { - // Preprocessing tensors - model.mel_filters = get_tensor(TN_MEL_FILTERS); - model.window = get_tensor(TN_WINDOW); + auto get_vector = [&](const std::string & name) { + std::vector result; + auto it = tensor_offset.find(name); + if (it == tensor_offset.end()) { + return result; + } + + int idx = gguf_find_tensor(ctx_gguf.get(), name.c_str()); + GGML_ASSERT(idx >= 0); + size_t n_bytes = gguf_get_tensor_size(ctx_gguf.get(), idx); + size_t n_elems = n_bytes / sizeof(float); + result.resize(n_elems); + fin.seekg(it->second, std::ios::beg); + fin.read(reinterpret_cast(result.data()), n_bytes); + return result; + }; + + hparams.mel_filters = get_vector(TN_MEL_FILTERS); + hparams.window = get_vector(TN_WINDOW); // Subsampling layers (conv1d) for (int i : {0, 2, 3, 5, 6}) { @@ -4060,10 +4076,6 @@ const clip_hparams * clip_get_hparams(const struct clip_ctx * ctx) { return &ctx->model.hparams; } -const struct clip_model & clip_get_model(const struct clip_ctx * ctx) { - return ctx->model; -} - // // API for debugging // diff --git a/tools/mtmd/clip.h b/tools/mtmd/clip.h index 34ea60a41788..a859b38658d3 100644 --- a/tools/mtmd/clip.h +++ b/tools/mtmd/clip.h @@ -116,5 +116,3 @@ void clip_image_f32_batch_add_mel(struct clip_image_f32_batch * batch, int n_mel bool clip_has_vision_encoder(const struct clip_ctx * ctx); bool clip_has_audio_encoder(const struct clip_ctx * ctx); bool clip_has_whisper_encoder(const struct clip_ctx * ctx); - -const struct clip_model 
& clip_get_model(const struct clip_ctx * ctx); diff --git a/tools/mtmd/mtmd-audio.cpp b/tools/mtmd/mtmd-audio.cpp index 6595e3e921ba..b4f245da4296 100644 --- a/tools/mtmd/mtmd-audio.cpp +++ b/tools/mtmd/mtmd-audio.cpp @@ -811,21 +811,13 @@ static void log_mel_spectrogram_parakeet_worker_thread( void mtmd_audio_preprocessor_parakeet::initialize() { cache.fill_sin_cos_table(hparams.audio_n_fft); - // Use models mel filter bank tensor. - const clip_model & model = clip_get_model(ctx); - ggml_tensor * mel_filters = model.mel_filters; - GGML_ASSERT(mel_filters); - - cache.filters.n_mel = mel_filters->ne[1]; - cache.filters.n_fft = mel_filters->ne[0]; - cache.filters.data.resize(ggml_nelements(mel_filters)); - ggml_backend_tensor_get(mel_filters, cache.filters.data.data(), 0, ggml_nbytes(mel_filters)); - - // Use models hann window tensor. - ggml_tensor * window = model.window; - GGML_ASSERT(window); - cache.hann_window.resize(ggml_nelements(window)); - ggml_backend_tensor_get(window, cache.hann_window.data(), 0, ggml_nbytes(window)); + GGML_ASSERT(!hparams.mel_filters.empty()); + cache.filters.n_mel = hparams.n_mel_bins; + cache.filters.n_fft = hparams.audio_n_fft / 2 + 1; + cache.filters.data = hparams.mel_filters; + + GGML_ASSERT(!hparams.window.empty()); + cache.hann_window = hparams.window; } bool mtmd_audio_preprocessor_parakeet::preprocess(const float * samples, From ffd1b997cd65a1869ab46a1310a9a05d7ddfd2d4 Mon Sep 17 00:00:00 2001 From: Daniel Bevenius Date: Fri, 1 May 2026 10:16:09 +0200 Subject: [PATCH 05/11] mtmd : use set_input_f32 lambda [no ci] --- tools/mtmd/clip.cpp | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) diff --git a/tools/mtmd/clip.cpp b/tools/mtmd/clip.cpp index 37cdae1de00e..e97924d6a85a 100644 --- a/tools/mtmd/clip.cpp +++ b/tools/mtmd/clip.cpp @@ -3814,33 +3814,34 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima mask_data[q * n_k + k] = (is_padding) ? 
mask_value : 0.0f; } } - ggml_backend_tensor_set(attn_mask, mask_data.data(), 0, mask_data.size() * sizeof(float)); + set_input_f32(attn_mask->name, mask_data); // Generate rotation frequencies for relative positional encoding. { - struct ggml_tensor * pos_freqs_t = ggml_graph_get_tensor(gf, "pos_freqs"); - const int d_half = pos_freqs_t->ne[0]; const int n_state = hparams.n_embd; + const int d_half = n_state / 2; const float log_10000 = logf(10000.0f); std::vector freqs(d_half); for (int k = 0; k < d_half; ++k) { freqs[k] = expf(-(float(k * 2) * log_10000 / float(n_state))); } - ggml_backend_tensor_set(pos_freqs_t, freqs.data(), 0, freqs.size() * sizeof(float)); + set_input_f32("pos_freqs", freqs); } // Generate relative positional distance values which scaled by // the frequency to produce the angles for sin/cos. { - struct ggml_tensor * rel_pos_t = ggml_graph_get_tensor(gf, "rel_positions"); - const int window_size = rel_pos_t->ne[1]; + // window_size is only known after graph construction since it depends on + // n_time from the conv output, so we read it back from the graph tensor. + struct ggml_tensor * rel_pos = ggml_graph_get_tensor(gf, "rel_positions"); + const int window_size = rel_pos->ne[1]; const int n_time = (window_size + 1) / 2; std::vector pos(window_size); for (int t = 0; t < window_size; ++t) { // The range of the values is high to low which the original model has. 
pos[t] = float(n_time - 1 - t); } - ggml_backend_tensor_set(rel_pos_t, pos.data(), 0, pos.size() * sizeof(float)); + set_input_f32(rel_pos->name, pos); } } break; default: From 8af100feead0159a8904ea30399134df11f6f85e Mon Sep 17 00:00:00 2001 From: Daniel Bevenius Date: Tue, 5 May 2026 12:01:12 +0200 Subject: [PATCH 06/11] mtmd : add better asserts for mel_filters and hann window [no ci] --- tools/mtmd/mtmd-audio.cpp | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/tools/mtmd/mtmd-audio.cpp b/tools/mtmd/mtmd-audio.cpp index b4f245da4296..43c57e8eb1e9 100644 --- a/tools/mtmd/mtmd-audio.cpp +++ b/tools/mtmd/mtmd-audio.cpp @@ -811,12 +811,14 @@ static void log_mel_spectrogram_parakeet_worker_thread( void mtmd_audio_preprocessor_parakeet::initialize() { cache.fill_sin_cos_table(hparams.audio_n_fft); - GGML_ASSERT(!hparams.mel_filters.empty()); + const size_t n_fft = hparams.audio_n_fft / 2 + 1; + GGML_ASSERT(hparams.mel_filters.size() == (size_t)hparams.n_mel_bins * n_fft); cache.filters.n_mel = hparams.n_mel_bins; - cache.filters.n_fft = hparams.audio_n_fft / 2 + 1; + cache.filters.n_fft = n_fft; cache.filters.data = hparams.mel_filters; - GGML_ASSERT(!hparams.window.empty()); + GGML_ASSERT(hparams.window.size() == (size_t)hparams.audio_window_len); + GGML_ASSERT(hparams.window.size() <= hparams.audio_n_fft); cache.hann_window = hparams.window; } From 49658ba18274e20e2f454acd2624870b699fa3be Mon Sep 17 00:00:00 2001 From: Daniel Bevenius Date: Fri, 8 May 2026 12:15:42 +0200 Subject: [PATCH 07/11] mtmd : add missing size_t cast --- tools/mtmd/mtmd-audio.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/mtmd/mtmd-audio.cpp b/tools/mtmd/mtmd-audio.cpp index 228943e580ec..cda8ebae0f84 100644 --- a/tools/mtmd/mtmd-audio.cpp +++ b/tools/mtmd/mtmd-audio.cpp @@ -925,7 +925,7 @@ void mtmd_audio_preprocessor_parakeet::initialize() { cache.filters.data = hparams.mel_filters; GGML_ASSERT(hparams.window.size() == 
(size_t)hparams.audio_window_len); - GGML_ASSERT(hparams.window.size() <= hparams.audio_n_fft); + GGML_ASSERT(hparams.window.size() <= (size_t) hparams.audio_n_fft); cache.hann_window = hparams.window; } From 9a8398e377d2171f26bad1da9e8b332e5bc57a26 Mon Sep 17 00:00:00 2001 From: Daniel Bevenius Date: Fri, 8 May 2026 12:59:04 +0200 Subject: [PATCH 08/11] mtmd : change type of pad to size_t --- tools/mtmd/mtmd-audio.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/mtmd/mtmd-audio.cpp b/tools/mtmd/mtmd-audio.cpp index cda8ebae0f84..d107216731a6 100644 --- a/tools/mtmd/mtmd-audio.cpp +++ b/tools/mtmd/mtmd-audio.cpp @@ -962,7 +962,7 @@ bool mtmd_audio_preprocessor_parakeet::preprocess(const float * samples, } // Parakeet uses centered constant padding - const int pad = frame_size / 2; + const size_t pad = (size_t)(frame_size / 2); std::vector samples_padded(n_samples_in + 2 * pad); std::fill(samples_padded.begin(), samples_padded.begin() + pad, 0.0f); std::fill(samples_padded.begin() + pad + n_samples_in, samples_padded.end(), 0.0f); From 6ba52fc59699bad79d599c9d367f5a76962c42dc Mon Sep 17 00:00:00 2001 From: Daniel Bevenius Date: Fri, 8 May 2026 13:28:49 +0200 Subject: [PATCH 09/11] mtmd : zero initialize samples_padded --- tools/mtmd/mtmd-audio.cpp | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/tools/mtmd/mtmd-audio.cpp b/tools/mtmd/mtmd-audio.cpp index d107216731a6..b5614fed9311 100644 --- a/tools/mtmd/mtmd-audio.cpp +++ b/tools/mtmd/mtmd-audio.cpp @@ -963,9 +963,7 @@ bool mtmd_audio_preprocessor_parakeet::preprocess(const float * samples, // Parakeet uses centered constant padding const size_t pad = (size_t)(frame_size / 2); - std::vector samples_padded(n_samples_in + 2 * pad); - std::fill(samples_padded.begin(), samples_padded.begin() + pad, 0.0f); - std::fill(samples_padded.begin() + pad + n_samples_in, samples_padded.end(), 0.0f); + std::vector samples_padded(n_samples_in + 2 * pad, 0.0f); 
std::copy(samples_preprocessed.begin(), samples_preprocessed.end(), samples_padded.begin() + pad); mtmd_audio_mel out_full; From 385b2d401d1181111203a78e0213c351f5bed497 Mon Sep 17 00:00:00 2001 From: Daniel Bevenius Date: Fri, 8 May 2026 14:50:20 +0200 Subject: [PATCH 10/11] mtmd : remove unused ctx member from parakeet preprocessor --- tools/mtmd/mtmd-audio.h | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/tools/mtmd/mtmd-audio.h b/tools/mtmd/mtmd-audio.h index 4b951c9a68bd..6e9657c92b1d 100644 --- a/tools/mtmd/mtmd-audio.h +++ b/tools/mtmd/mtmd-audio.h @@ -97,13 +97,12 @@ struct mtmd_audio_preprocessor_gemma4a : mtmd_audio_preprocessor { }; struct mtmd_audio_preprocessor_parakeet : mtmd_audio_preprocessor { - mtmd_audio_preprocessor_parakeet(const clip_ctx * ctx) : mtmd_audio_preprocessor(ctx), ctx(ctx) { } + mtmd_audio_preprocessor_parakeet(clip_ctx * ctx) : mtmd_audio_preprocessor(ctx) { } void initialize() override; bool preprocess(const float * samples, size_t n_samples, std::vector & output) override; private: mtmd_audio_cache cache; - const clip_ctx * ctx; }; // From cef7ff7ca40db2e86833d59ccf5148a03fa14624 Mon Sep 17 00:00:00 2001 From: Daniel Bevenius Date: Mon, 11 May 2026 08:59:47 +0200 Subject: [PATCH 11/11] mtmd : make log_mel_spectrogram_parakeet_worker_thread private static --- tools/mtmd/mtmd-audio.cpp | 18 +++++++++--------- tools/mtmd/mtmd-audio.h | 6 ++++++ 2 files changed, 15 insertions(+), 9 deletions(-) diff --git a/tools/mtmd/mtmd-audio.cpp b/tools/mtmd/mtmd-audio.cpp index b5614fed9311..1feabe00f574 100644 --- a/tools/mtmd/mtmd-audio.cpp +++ b/tools/mtmd/mtmd-audio.cpp @@ -842,7 +842,7 @@ bool mtmd_audio_preprocessor_gemma4a::preprocess(const float * s // mtmd_audio_preprocessor_parakeet implementation // -static void log_mel_spectrogram_parakeet_worker_thread( +void mtmd_audio_preprocessor_parakeet::worker_thread( int ith, const float * window_func, int window_size, @@ -851,13 +851,13 @@ static void
log_mel_spectrogram_parakeet_worker_thread( int frame_size, int frame_step, int n_threads, - const filter_params & params, + int n_fft_bins, const mtmd_audio_cache & cache, mtmd_audio_mel & mel) { std::vector fft_in(frame_size * 2, 0.0); std::vector fft_out(frame_size * 2 * 2 * 2); - int n_fb = params.n_fft_bins; + int n_fb = n_fft_bins; int i = ith; GGML_ASSERT(n_fb == 1 + (frame_size / 2)); @@ -975,7 +975,7 @@ bool mtmd_audio_preprocessor_parakeet::preprocess(const float * samples, const int n_threads = 4; if (n_threads == 1) { - log_mel_spectrogram_parakeet_worker_thread(0, + worker_thread(0, window_func, window_size, samples_padded, @@ -983,14 +983,14 @@ bool mtmd_audio_preprocessor_parakeet::preprocess(const float * samples, frame_size, frame_step, 1, - params, + params.n_fft_bins, cache, out_full); } else { std::vector workers(n_threads - 1); for (int iw = 0; iw < n_threads - 1; ++iw) { workers[iw] = std::thread( - log_mel_spectrogram_parakeet_worker_thread, iw + 1, + worker_thread, iw + 1, window_func, window_size, std::cref(samples_padded), @@ -998,13 +998,13 @@ bool mtmd_audio_preprocessor_parakeet::preprocess(const float * samples, frame_size, frame_step, n_threads, - std::cref(params), + params.n_fft_bins, std::cref(cache), std::ref(out_full) ); } - log_mel_spectrogram_parakeet_worker_thread(0, + worker_thread(0, window_func, window_size, samples_padded, @@ -1012,7 +1012,7 @@ bool mtmd_audio_preprocessor_parakeet::preprocess(const float * samples, frame_size, frame_step, n_threads, - params, + params.n_fft_bins, cache, out_full); diff --git a/tools/mtmd/mtmd-audio.h b/tools/mtmd/mtmd-audio.h index 6e9657c92b1d..328ededaeff5 100644 --- a/tools/mtmd/mtmd-audio.h +++ b/tools/mtmd/mtmd-audio.h @@ -103,6 +103,12 @@ struct mtmd_audio_preprocessor_parakeet : mtmd_audio_preprocessor { private: mtmd_audio_cache cache; + + static void worker_thread(int ith, const float * window_func, int window_size, + const std::vector & samples, int n_samples, + int frame_size, 
int frame_step, int n_threads, + int n_fft_bins, + const mtmd_audio_cache & cache, mtmd_audio_mel & mel); }; //