Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
19 commits
Select commit Hold shift + click to select a range
0024aa7
mtmd : add Nemotron 3 Nano Omni support (parakeet)
danbev Apr 29, 2026
e557406
mtmd : generate rel pos tensor in graph instead of in conversion [no ci]
danbev Apr 30, 2026
a9929d4
mtmd : add clip_get_model to clip API [no ci]
danbev Apr 30, 2026
8e279f4
mtmd : read mel_filters and window into hparams
danbev Apr 30, 2026
ffd1b99
mtmd : use set_input_f32 lambda [no ci]
danbev May 1, 2026
8af100f
mtmd : add better asserts for mel_filters and hann window [no ci]
danbev May 5, 2026
b5a35e0
Merge remote-tracking branch 'upstream/master' into nemotron-3-omni-m…
danbev May 5, 2026
7ed9294
Merge remote-tracking branch 'upstream/master' into nemotron-3-omni-m…
danbev May 8, 2026
49658ba
mtmd : add missing size_t cast
danbev May 8, 2026
9a8398e
mtmd : change type of pad to size_t
danbev May 8, 2026
6ba52fc
mtmd : zero initialize samples_padded
danbev May 8, 2026
385b2d4
mtmd : remove unused ctx member from parakeet preprocessor
danbev May 8, 2026
cef7ff7
mtmd : make log_mel_spectrogram_parakeet_worker_thread private static
danbev May 11, 2026
681a199
Merge remote-tracking branch 'upstream/master' into nemotron-3-omni-m…
danbev May 12, 2026
44cb51f
Merge remote-tracking branch 'upstream/master' into nemotron-3-omni-m…
danbev May 15, 2026
0cd9e16
Merge remote-tracking branch 'upstream/master' into nemotron-3-omni-m…
danbev May 16, 2026
78e28f4
Merge remote-tracking branch 'upstream/master' into nemotron-3-omni-m…
danbev May 28, 2026
96b1326
Merge remote-tracking branch 'upstream/master' into nemotron-3-omni-m…
danbev Jun 1, 2026
656437b
Merge remote-tracking branch 'upstream/master' into nemotron-3-omni-m…
danbev Jun 4, 2026
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
55 changes: 49 additions & 6 deletions conversion/nemotron.py
Original file line number Diff line number Diff line change
Expand Up @@ -39,23 +39,40 @@ def get_vision_config(self) -> dict[str, Any] | None:
}
return vision_config

def get_audio_config(self) -> dict[str, Any] | None:
return self.global_config.get("sound_config")

def set_gguf_parameters(self):
if "image_mean" not in self.preprocessor_config:
self.preprocessor_config["image_mean"] = [0.485, 0.456, 0.406]
if "image_std" not in self.preprocessor_config:
self.preprocessor_config["image_std"] = [0.229, 0.224, 0.225]

if self.hparams_audio is not None:
self.has_vision_encoder = True
self.has_audio_encoder = True
self.gguf_writer.add_audio_num_mel_bins(self.hparams_audio["num_mel_bins"])
self.gguf_writer.add_audio_attention_layernorm_eps(1e-5)
self.gguf_writer.add_audio_subsampling_factor(self.hparams_audio["subsampling_factor"])
self.gguf_writer.add_clip_audio_projector_type(gguf.VisionProjectorType.PARAKEET)
self.gguf_writer.add_clip_vision_projector_type(gguf.VisionProjectorType.NEMOTRON_V2_VL)
else:
self.gguf_writer.add_clip_projector_type(gguf.VisionProjectorType.NEMOTRON_V2_VL)

super().set_gguf_parameters()
hparams = self.global_config
self.gguf_writer.add_clip_projector_type(gguf.VisionProjectorType.NEMOTRON_V2_VL)
self.gguf_writer.add_vision_attention_layernorm_eps(1e-6)
self.gguf_writer.add_vision_use_gelu(True)
downsample_ratio = hparams.get("downsample_ratio", 0.5)
self.gguf_writer.add_vision_projector_scale_factor(int(1.0 / downsample_ratio))

def tensor_force_quant(self, name, new_name, bid, n_dims):
if ".position_embd." in new_name or "pos_embed" in new_name:
return gguf.GGMLQuantizationType.F32
if "sound_encoder" in name or new_name.startswith("mm.a."):
if "bias" in new_name or "norm" in new_name:
return gguf.GGMLQuantizationType.F32
if "conv" in new_name and "weight" in new_name:
return gguf.GGMLQuantizationType.F32

return super().tensor_force_quant(name, new_name, bid, n_dims)

@classmethod
Expand All @@ -65,18 +82,25 @@ def filter_tensors(cls, item: tuple[str, Callable[[], Tensor]]) -> tuple[str, Ca
if "input_conditioner" in name:
return None

if "language_model" in name:
return None

# mtmd does not support video yet so skip tensors related to video.
if "radio_model.model.patch_generator.video_embedder" in name:
return None

if not name.startswith("vision_model.radio_model.model.") and not name.startswith("mlp1."):
if not name.startswith(("vision_model.radio_model.model.", "mlp1.", "sound_encoder.", "sound_projection.")):
return None

if "patch_generator.pos_embed" in name:
if not name.endswith(".weight"):
name += ".weight"

return super().filter_tensors((name, gen))
# num_batches is only used for training, not inference.
if "conv.norm" in name and "num_batches" in name:
return None

return name, gen

def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
# RADIO's pos_embed doesn't have .weight suffix, but clip.cpp expects it
Expand Down Expand Up @@ -104,7 +128,26 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter
n_embd = self.hparams["hidden_size"]
data_torch = data_torch.reshape(n_embd, 3, patch_size, patch_size)

yield from super().modify_tensors(data_torch, name, bid)
if "depthwise_conv.weight" in name:
data_torch = data_torch.unsqueeze(-1)
data_torch = data_torch.permute(3, 1, 0, 2).contiguous()

if "pointwise_conv" in name and name.endswith(".weight"):
if len(data_torch.shape) == 3 and data_torch.shape[2] == 1:
data_torch = data_torch.reshape(data_torch.shape[0], data_torch.shape[1])

if "subsampling.layers" in name and name.endswith(".bias"):
if len(data_torch.shape) == 1:
data_torch = data_torch.reshape(1, -1, 1, 1)

if "pointwise_conv" in name and name.endswith(".bias"):
if len(data_torch.shape) == 1:
data_torch = data_torch.reshape(1, -1, 1, 1)

for mapped_name, tensor in super().modify_tensors(data_torch, name, bid):
if name.startswith("sound_projection.") and mapped_name.startswith("mm.model.mlp."):
mapped_name = mapped_name.replace("mm.model.mlp.", "mm.a.mlp.")
yield mapped_name, tensor


@ModelBase.register("NemotronForCausalLM")
Expand Down
14 changes: 14 additions & 0 deletions gguf-py/gguf/constants.py
Original file line number Diff line number Diff line change
Expand Up @@ -346,6 +346,7 @@ class ClipAudio:
FEED_FORWARD_LENGTH = "clip.audio.feed_forward_length"
PROJECTION_DIM = "clip.audio.projection_dim"
BLOCK_COUNT = "clip.audio.block_count"
SUBSAMPLING_FACTOR = "clip.audio.subsampling_factor"
CHUNK_SIZE = "clip.audio.chunk_size"
CONV_KERNEL_SIZE = "clip.audio.conv_kernel_size"
MAX_POS_EMB = "clip.audio.max_pos_emb"
Expand Down Expand Up @@ -882,6 +883,10 @@ class MODEL_TENSOR(IntEnum):
A_ENC_CONV_NORM = auto() # SSM conv
A_ENC_CONV_PW1 = auto()
A_ENC_CONV_PW2 = auto()
A_ENC_CONV_NORM_MEAN = auto() # parakeet
A_ENC_CONV_NORM_VAR = auto() # parakeet
A_ENC_MEL_FILTERS = auto() # parakeet
A_ENC_WINDOW = auto() # parakeet
A_CTC_OUT = auto()
A_CTC_OUT_MID = auto()
A_ENC_ATTN_REL_POS_EMB = auto()
Expand Down Expand Up @@ -1396,6 +1401,10 @@ class MODEL_TENSOR(IntEnum):
MODEL_TENSOR.A_ENC_CONV_NORM: "a.blk.{bid}.conv_norm",
MODEL_TENSOR.A_ENC_CONV_PW1: "a.blk.{bid}.conv_pw1",
MODEL_TENSOR.A_ENC_CONV_PW2: "a.blk.{bid}.conv_pw2",
MODEL_TENSOR.A_ENC_CONV_NORM_MEAN: "a.blk.{bid}.conv_norm_mean",
MODEL_TENSOR.A_ENC_CONV_NORM_VAR: "a.blk.{bid}.conv_norm_var",
MODEL_TENSOR.A_ENC_MEL_FILTERS: "a.mel_filters",
MODEL_TENSOR.A_ENC_WINDOW: "a.window",
MODEL_TENSOR.A_CTC_OUT: "a.enc_ctc_out",
MODEL_TENSOR.A_CTC_OUT_MID: "a.enc_ctc_out_mid",
MODEL_TENSOR.A_ENC_ATTN_REL_POS_EMB: "a.blk.{bid}.attn_rel_pos_emb",
Expand Down Expand Up @@ -1569,6 +1578,10 @@ class MODEL_TENSOR(IntEnum):
MODEL_TENSOR.A_ENC_CONV_NORM,
MODEL_TENSOR.A_ENC_CONV_PW1,
MODEL_TENSOR.A_ENC_CONV_PW2,
MODEL_TENSOR.A_ENC_CONV_NORM_MEAN,
MODEL_TENSOR.A_ENC_CONV_NORM_VAR,
MODEL_TENSOR.A_ENC_MEL_FILTERS,
MODEL_TENSOR.A_ENC_WINDOW,
MODEL_TENSOR.A_MM_INP_PROJ,
MODEL_TENSOR.A_MM_SOFT_EMB_NORM,
MODEL_TENSOR.A_MM_EMBEDDING,
Expand Down Expand Up @@ -4385,6 +4398,7 @@ class VisionProjectorType:
YOUTUVL = "youtuvl"
NEMOTRON_V2_VL = "nemotron_v2_vl"
HUNYUANVL = "hunyuanvl"
PARAKEET = "parakeet"
MINICPMV4_6 = "minicpmv4_6"
GRANITE_SPEECH = "granite_speech" # audio
MIMOVL = "mimovl"
Expand Down
3 changes: 3 additions & 0 deletions gguf-py/gguf/gguf_writer.py
Original file line number Diff line number Diff line change
Expand Up @@ -1278,6 +1278,9 @@ def add_audio_num_mel_bins(self, value: int) -> None:
def add_audio_stack_factor(self, value: int) -> None:
self.add_uint32(Keys.ClipAudio.Projector.STACK_FACTOR, value)

def add_audio_subsampling_factor(self, value: int) -> None:
self.add_uint32(Keys.ClipAudio.SUBSAMPLING_FACTOR, value)

def add_audio_chunk_size(self, value: int) -> None:
self.add_uint32(Keys.ClipAudio.CHUNK_SIZE, value)

Expand Down
40 changes: 40 additions & 0 deletions gguf-py/gguf/tensor_mapping.py
Original file line number Diff line number Diff line change
Expand Up @@ -1972,6 +1972,7 @@ class TensorNameMap:
"conformer.pre_encode.conv.{bid}", # lfm2
"model.audio_tower.subsample_conv_projection.conv_{bid}.conv", # gemma3n
"conformer.subsample_conv_projection.layer{bid}.conv", # gemma4
"sound_encoder.encoder.subsampling.layers.{bid}", # parakeet
),

MODEL_TENSOR.A_ENC_CONV1D_NORM: (
Expand Down Expand Up @@ -2003,6 +2004,7 @@ class TensorNameMap:
"conformer.layers.{bid}.self_attn.linear_q", # lfm2
"conformer.layers.{bid}.attention.attn.q_proj", # gemma3n
"conformer.layers.{bid}.self_attn.q_proj", # gemma4
"sound_encoder.encoder.layers.{bid}.self_attn.q_proj", # parakeet
"encoder.layers.{bid}.attn.to_q", # granite_speech
),

Expand All @@ -2011,6 +2013,7 @@ class TensorNameMap:
"conformer.layers.{bid}.self_attn.linear_k", # lfm2
"conformer.layers.{bid}.attention.attn.k_proj", # gemma3n
"conformer.layers.{bid}.self_attn.k_proj", # gemma4
"sound_encoder.encoder.layers.{bid}.self_attn.k_proj", # parakeet
"encoder.layers.{bid}.attn.to_k", # granite_speech (split from to_kv)
),

Expand All @@ -2019,6 +2022,7 @@ class TensorNameMap:
"conformer.layers.{bid}.self_attn.linear_v", # lfm2
"conformer.layers.{bid}.attention.attn.v_proj", # gemma3n
"conformer.layers.{bid}.self_attn.v_proj", # gemma4
"sound_encoder.encoder.layers.{bid}.self_attn.v_proj", # parakeet
"encoder.layers.{bid}.attn.to_v", # granite_speech (split from to_kv)
),

Expand Down Expand Up @@ -2047,6 +2051,7 @@ class TensorNameMap:
"audio_tower.layers.{bid}.self_attn_layer_norm", # ultravox
"conformer.layers.{bid}.norm_self_att", # lfm2
"conformer.layers.{bid}.attention.pre_attn_norm", # gemma3n
"sound_encoder.encoder.layers.{bid}.norm_self_att", # parakeet
"encoder.layers.{bid}.attn.pre_norm", # granite_speech
),

Expand All @@ -2055,20 +2060,23 @@ class TensorNameMap:
"conformer.layers.{bid}.self_attn.linear_out", # lfm2
"conformer.layers.{bid}.attention.post", # gemma3n
"conformer.layers.{bid}.self_attn.post", # gemma4
"sound_encoder.encoder.layers.{bid}.self_attn.o_proj", # parakeet
"encoder.layers.{bid}.attn.to_out", # granite_speech
),

MODEL_TENSOR.A_ENC_OUTPUT_NORM: (
"audio_tower.layers.{bid}.final_layer_norm", # ultravox
"conformer.layers.{bid}.norm_out", # lfm2
"conformer.layers.{bid}.attention.post_norm", # gemma3n
"sound_encoder.encoder.layers.{bid}.norm_out", # parakeet
"encoder.layers.{bid}.post_norm", # granite_speech
),

MODEL_TENSOR.A_ENC_FFN_NORM: (
"conformer.layers.{bid}.norm_feed_forward1", # lfm2
"conformer.layers.{bid}.ffw_layer_start.pre_layer_norm", # gemma3n
"conformer.layers.{bid}.feed_forward1.pre_layer_norm", # gemma4
"sound_encoder.encoder.layers.{bid}.norm_feed_forward1", # parakeet
"encoder.layers.{bid}.ff1.pre_norm", # granite_speech
),

Expand All @@ -2086,6 +2094,7 @@ class TensorNameMap:
"conformer.layers.{bid}.feed_forward1.linear1", # lfm2
"conformer.layers.{bid}.ffw_layer_start.ffw_layer_1", # gemma3n
"conformer.layers.{bid}.feed_forward1.ffw_layer_1", # gemma4
"sound_encoder.encoder.layers.{bid}.feed_forward1.linear1", # parakeet
"encoder.layers.{bid}.ff1.up_proj", # granite_speech
),

Expand All @@ -2096,27 +2105,31 @@ class TensorNameMap:
"conformer.layers.{bid}.feed_forward1.linear2", # lfm2
"conformer.layers.{bid}.ffw_layer_start.ffw_layer_2", # gemma3n
"conformer.layers.{bid}.feed_forward1.ffw_layer_2", # gemma4
"sound_encoder.encoder.layers.{bid}.feed_forward1.linear2", # parakeet
"encoder.layers.{bid}.ff1.down_proj", # granite_speech
),

MODEL_TENSOR.A_ENC_FFN_UP_1: (
"conformer.layers.{bid}.feed_forward2.linear1", # lfm2
"conformer.layers.{bid}.ffw_layer_end.ffw_layer_1", # gemma3n
"conformer.layers.{bid}.feed_forward2.ffw_layer_1", # gemma4
"sound_encoder.encoder.layers.{bid}.feed_forward2.linear1", # parakeet
"encoder.layers.{bid}.ff2.up_proj", # granite_speech
),

MODEL_TENSOR.A_ENC_FFN_DOWN_1: (
"conformer.layers.{bid}.feed_forward2.linear2", # lfm2
"conformer.layers.{bid}.ffw_layer_end.ffw_layer_2", # gemma3n
"conformer.layers.{bid}.feed_forward2.ffw_layer_2", # gemma4
"sound_encoder.encoder.layers.{bid}.feed_forward2.linear2", # parakeet
"encoder.layers.{bid}.ff2.down_proj", # granite_speech
),

MODEL_TENSOR.A_ENC_FFN_NORM_1: (
"conformer.layers.{bid}.norm_feed_forward2", # lfm2
"conformer.layers.{bid}.ffw_layer_end.pre_layer_norm", # gemma3n
"conformer.layers.{bid}.feed_forward2.pre_layer_norm", # gemma4
"sound_encoder.encoder.layers.{bid}.norm_feed_forward2", # parakeet
"encoder.layers.{bid}.ff2.pre_norm", # granite_speech
),

Expand All @@ -2132,20 +2145,24 @@ class TensorNameMap:
MODEL_TENSOR.A_ENC_LINEAR_POS: (
"conformer.layers.{bid}.self_attn.linear_pos", # lfm2
"conformer.layers.{bid}.attention.attn.relative_position_embedding.pos_proj", # gemma3n
"sound_encoder.encoder.layers.{bid}.self_attn.relative_k_proj", # parakeet
),

MODEL_TENSOR.A_ENC_POS_BIAS_U: (
"conformer.layers.{bid}.self_attn.pos_bias_u", # lfm2
"sound_encoder.encoder.layers.{bid}.self_attn.bias_u", # parakeet
),

MODEL_TENSOR.A_ENC_POS_BIAS_V: (
"conformer.layers.{bid}.self_attn.pos_bias_v", # lfm2
"sound_encoder.encoder.layers.{bid}.self_attn.bias_v", # parakeet
),

MODEL_TENSOR.A_ENC_OUT: (
"conformer.pre_encode.out", # lfm2
"model.audio_tower.subsample_conv_projection.input_proj_linear", # gemma3n (note: it should be A_ENC_INP_PROJ, this is a mistake; it should be corrected in C++ code when it's supported)
"conformer.output_proj", # gemma4
"sound_encoder.encoder.subsampling.linear", # parakeet
),

# note: some tensors below have an "audio." pseudo-prefix to prevent conflicts with vision tensors
Expand All @@ -2155,6 +2172,7 @@ class TensorNameMap:
"audio.multi_modal_projector.linear_{bid}", # ultravox, meralion
"audio_adapter.model.{bid}", # lfm2
"audio_tower.proj{bid}", # qwen3omni
"sound_projection.linear{bid}", # parakeet (linear1, linear2)
),

MODEL_TENSOR.A_MMPROJ_FC: (
Expand All @@ -2165,6 +2183,7 @@ class TensorNameMap:

MODEL_TENSOR.A_MM_NORM_PRE: (
"audio.multi_modal_projector.ln_pre", # ultravox
"sound_projection.norm", # parakeet
),

MODEL_TENSOR.A_MM_NORM_MID: (
Expand All @@ -2174,30 +2193,43 @@ class TensorNameMap:
MODEL_TENSOR.A_ENC_CONV_DW: (
"conformer.layers.{bid}.conv.depthwise_conv", # lfm2
"conformer.layers.{bid}.lconv1d.depthwise_conv1d", # gemma3n
"sound_encoder.encoder.layers.{bid}.conv.depthwise_conv", # parakeet
"encoder.layers.{bid}.conv.depth_conv.conv", # granite_speech
),

MODEL_TENSOR.A_ENC_CONV_NORM: (
"conformer.layers.{bid}.conv.batch_norm", # lfm2
"conformer.layers.{bid}.lconv1d.pre_layer_norm", # gemma3n
"sound_encoder.encoder.layers.{bid}.conv.norm", # parakeet
),

MODEL_TENSOR.A_ENC_CONV_NORM_MEAN: (
"sound_encoder.encoder.layers.{bid}.conv.norm.running_mean", # parakeet
),

MODEL_TENSOR.A_ENC_CONV_NORM_VAR: (
"sound_encoder.encoder.layers.{bid}.conv.norm.running_var", # parakeet
"encoder.layers.{bid}.conv.batch_norm", # granite_speech
),

MODEL_TENSOR.A_ENC_CONV_PW1: (
"conformer.layers.{bid}.conv.pointwise_conv1", # lfm2
"conformer.layers.{bid}.lconv1d.linear_start", # gemma3n
"sound_encoder.encoder.layers.{bid}.conv.pointwise_conv1", # parakeet
"encoder.layers.{bid}.conv.up_conv", # granite_speech
),

MODEL_TENSOR.A_ENC_CONV_PW2: (
"conformer.layers.{bid}.conv.pointwise_conv2", # lfm2
"conformer.layers.{bid}.lconv1d.linear_end", # gemma3n
"sound_encoder.encoder.layers.{bid}.conv.pointwise_conv2", # parakeet
"encoder.layers.{bid}.conv.down_conv", # granite_speech
),

MODEL_TENSOR.A_ENC_NORM_CONV: (
"conformer.layers.{bid}.norm_conv", # lfm2
"conformer.layers.{bid}.lconv1d.conv_norm", # gemma3n
"sound_encoder.encoder.layers.{bid}.norm_conv", # parakeet
"encoder.layers.{bid}.conv.norm", # granite_speech
),

Expand All @@ -2209,6 +2241,14 @@ class TensorNameMap:
"conformer.layers.{bid}.attention.attn.per_dim_scale", # gemma4
),

MODEL_TENSOR.A_ENC_MEL_FILTERS: (
"sound_encoder.encoder.feature_extractor.featurizer.fb", # parakeet
),

MODEL_TENSOR.A_ENC_WINDOW: (
"sound_encoder.encoder.feature_extractor.featurizer.window", # parakeet
),

MODEL_TENSOR.A_MM_EMBEDDING: (
"model.embed_audio.embedding", # gemma3n
),
Expand Down
1 change: 1 addition & 0 deletions tools/mtmd/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -47,6 +47,7 @@ add_library(mtmd
models/mobilenetv5.cpp
models/youtuvl.cpp
models/yasa2.cpp
models/parakeet.cpp
)

set_target_properties(mtmd PROPERTIES
Expand Down
Loading
Loading