
Commit 0db5fad

Revert "nvidia nemotron nano v2 (nemotronh) (ggml-org#15507)"
1 parent 6d85171 commit 0db5fad

7 files changed (+11 / -362 lines)

convert_hf_to_gguf.py

Lines changed: 5 additions & 58 deletions
@@ -7546,13 +7546,9 @@ def __init__(self, *args, **kwargs):
         ]
 
         # n_group and d_inner are used during reshape_tensors for mamba2
-        # NOTE: Explicitly include hparam prefix prefix for d_model to
-        #       disambiguate with top-level head_dim
-        # NOTE 2: If needed for future models, this can be isolated in a method
-        #         to separate the prefix setting and teh keys used
-        self.d_model = self.find_hparam([f"{self.hparam_prefixes[0]}_head_dim", "hidden_size", "d_model"])
-        self.n_group = self.find_hparam(["n_groups", "num_groups"])
-        self.d_inner = self.find_hparam(["expand", "num_heads"]) * self.d_model
+        self.d_model = self.find_hparam(["hidden_size", "d_model"])
+        self.n_group = self.find_hparam(["n_groups"])
+        self.d_inner = self.find_hparam(["expand"]) * self.d_model
 
     def get_attn_layers(self):
         # Explicit list of layer type names
@@ -7613,12 +7609,12 @@ def set_gguf_parameters(self):
 
         ## Mamba mixer params ##
         self.gguf_writer.add_ssm_conv_kernel(self.find_hparam(["conv_kernel", "d_conv"]))
-        self.gguf_writer.add_ssm_state_size(self.find_hparam(["state_size", "d_state", "state_dim", "ssm_state_size"]))
+        self.gguf_writer.add_ssm_state_size(self.find_hparam(["state_size", "d_state"]))
         self.gguf_writer.add_ssm_group_count(self.n_group)
         self.gguf_writer.add_ssm_inner_size(self.d_inner)
         # NOTE: The mamba_dt_rank is _not_ the right field for how this is used
         # in llama.cpp
-        self.gguf_writer.add_ssm_time_step_rank(self.find_hparam(["n_heads", "num_heads"]))
+        self.gguf_writer.add_ssm_time_step_rank(self.find_hparam(["n_heads"]))
 
         ## Attention params ##
         head_count_kv = self.find_hparam(["num_key_value_heads", "n_head_kv"])
@@ -7645,55 +7641,6 @@ def set_vocab(self):
         Mamba2Model.set_vocab(self)
 
 
-@ModelBase.register("NemotronHForCausalLM")
-class NemotronHModel(GraniteHybridModel):
-    """Hybrid mamba2/attention model from NVIDIA"""
-    model_arch = gguf.MODEL_ARCH.NEMOTRON_H
-
-    def __init__(self, *args, **kwargs):
-        super().__init__(*args, **kwargs)
-
-        # Save the top-level head_dim for later
-        self.head_dim = self.hparams.get("head_dim", self.hparams.get("attention_head_dim"))
-        assert self.head_dim is not None, "Could not find the attention head dim in config"
-
-        # Don't use expand to calculate d_inner
-        self.d_inner = self.find_hparam(["num_heads"]) * self.d_model
-
-        # Update the ssm / attn / mlp layers
-        # M: Mamba2, *: Attention, -: MLP
-        hybrid_override_pattern = self.hparams["hybrid_override_pattern"]
-        self._ssm_layers = [i for i, val in enumerate(hybrid_override_pattern) if val == "M"]
-        self._mlp_layers = [i for i, val in enumerate(hybrid_override_pattern) if val == "-"]
-
-    def get_attn_layers(self):
-        hybrid_override_pattern = self.hparams["hybrid_override_pattern"]
-        assert len(hybrid_override_pattern) == self.block_count, "Mismatch between hybrid override and num_hidden_layers!"
-        return [i for i, val in enumerate(hybrid_override_pattern) if val == "*"]
-
-    def set_gguf_parameters(self):
-        super().set_gguf_parameters()
-
-        self.gguf_writer.add_key_length(self.head_dim)
-        self.gguf_writer.add_value_length(self.head_dim)
-
-        # Set feed_forward_length
-        # NOTE: This will trigger an override warning. This is preferrable to
-        #       duplicating all the parent logic
-        n_ff = self.find_hparam(["intermediate_size", "n_inner", "hidden_dim"])
-        self.gguf_writer.add_feed_forward_length([
-            n_ff if i in self._mlp_layers else 0 for i in range(self.block_count)
-        ])
-
-    def set_vocab(self):
-        super().set_vocab()
-
-        # The tokenizer _does_ add a BOS token (via post_processor type
-        # TemplateProcessing) but does not set add_bos_token to true in the
-        # config, so we need to explicitly override it here.
-        self.gguf_writer.add_add_bos_token(True)
-
-
 @ModelBase.register("BailingMoeForCausalLM")
 class BailingMoeModel(TextModel):
     model_arch = gguf.MODEL_ARCH.BAILINGMOE
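
The removed converter class reads the model's hybrid_override_pattern string to decide which blocks are Mamba2 ("M"), attention ("*"), or MLP ("-"), and then emits a per-layer feed_forward_length that is non-zero only for the MLP blocks. Below is a minimal, self-contained sketch of that classification step; the helper function name and the toy pattern are illustrative, not taken from the actual converter or any real Nemotron-H config.

# Standalone sketch of the layer classification done by the reverted
# NemotronHModel (hypothetical helper; the real logic lives inside the class).
def classify_layers(hybrid_override_pattern: str) -> dict[str, list[int]]:
    # "M" -> Mamba2 (SSM) block, "*" -> attention block, "-" -> MLP block
    return {
        "ssm":  [i for i, val in enumerate(hybrid_override_pattern) if val == "M"],
        "attn": [i for i, val in enumerate(hybrid_override_pattern) if val == "*"],
        "mlp":  [i for i, val in enumerate(hybrid_override_pattern) if val == "-"],
    }


layers = classify_layers("MM-M*-MM-")  # 9-block toy pattern, not a real config value
print(layers)  # {'ssm': [0, 1, 3, 6, 7], 'attn': [4], 'mlp': [2, 5, 8]}

# Per-layer FFN size in the same spirit as the reverted set_gguf_parameters:
# n_ff for MLP blocks, 0 for SSM/attention blocks.
n_ff = 1024  # placeholder value
print([n_ff if i in layers["mlp"] else 0 for i in range(9)])
# [0, 0, 1024, 0, 0, 1024, 0, 0, 1024]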

gguf-py/gguf/constants.py

Lines changed: 0 additions & 21 deletions
@@ -367,7 +367,6 @@ class MODEL_ARCH(IntEnum):
     T5ENCODER        = auto()
     JAIS             = auto()
     NEMOTRON         = auto()
-    NEMOTRON_H       = auto()
     EXAONE           = auto()
     EXAONE4          = auto()
     GRANITE          = auto()
@@ -701,7 +700,6 @@ class MODEL_TENSOR(IntEnum):
     MODEL_ARCH.T5ENCODER:        "t5encoder",
     MODEL_ARCH.JAIS:             "jais",
     MODEL_ARCH.NEMOTRON:         "nemotron",
-    MODEL_ARCH.NEMOTRON_H:       "nemotron_h",
     MODEL_ARCH.EXAONE:           "exaone",
     MODEL_ARCH.EXAONE4:          "exaone4",
     MODEL_ARCH.GRANITE:          "granite",
@@ -2299,25 +2297,6 @@ class MODEL_TENSOR(IntEnum):
         MODEL_TENSOR.FFN_DOWN,
         MODEL_TENSOR.FFN_UP,
     ],
-    MODEL_ARCH.NEMOTRON_H: [
-        MODEL_TENSOR.TOKEN_EMBD,
-        MODEL_TENSOR.OUTPUT_NORM,
-        MODEL_TENSOR.OUTPUT,
-        MODEL_TENSOR.ATTN_NORM,
-        MODEL_TENSOR.SSM_IN,
-        MODEL_TENSOR.SSM_CONV1D,
-        MODEL_TENSOR.SSM_DT,
-        MODEL_TENSOR.SSM_A,
-        MODEL_TENSOR.SSM_D,
-        MODEL_TENSOR.SSM_NORM,
-        MODEL_TENSOR.SSM_OUT,
-        MODEL_TENSOR.ATTN_Q,
-        MODEL_TENSOR.ATTN_K,
-        MODEL_TENSOR.ATTN_V,
-        MODEL_TENSOR.ATTN_OUT,
-        MODEL_TENSOR.FFN_DOWN,
-        MODEL_TENSOR.FFN_UP,
-    ],
     MODEL_ARCH.EXAONE: [
         MODEL_TENSOR.TOKEN_EMBD,
         MODEL_TENSOR.OUTPUT_NORM,
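
For context on what these three removed hunks represent: an architecture in gguf-py is declared by (1) an enum member in MODEL_ARCH, (2) a name string that ends up in the GGUF metadata, and (3) a per-arch list of allowed MODEL_TENSOR entries. The following is a toy, self-contained mirror of that pattern with only a handful of members; it is not the real gguf-py module, just an illustration of the shape of the data.

# Toy mirror of the registration pattern in gguf-py/gguf/constants.py.
# Only a few members are shown; this is not the real module.
from enum import IntEnum, auto


class MODEL_ARCH(IntEnum):
    NEMOTRON   = auto()
    NEMOTRON_H = auto()  # the member this revert removes


class MODEL_TENSOR(IntEnum):
    TOKEN_EMBD = auto()
    ATTN_Q     = auto()
    SSM_IN     = auto()
    FFN_DOWN   = auto()


# 1) arch name string written into the GGUF header
MODEL_ARCH_NAMES = {
    MODEL_ARCH.NEMOTRON:   "nemotron",
    MODEL_ARCH.NEMOTRON_H: "nemotron_h",
}

# 2) tensors the converter may emit for the arch; a hybrid arch lists
#    attention, SSM, and FFN tensors side by side
MODEL_TENSORS = {
    MODEL_ARCH.NEMOTRON_H: [
        MODEL_TENSOR.TOKEN_EMBD,
        MODEL_TENSOR.ATTN_Q,
        MODEL_TENSOR.SSM_IN,
        MODEL_TENSOR.FFN_DOWN,
    ],
}

print(MODEL_ARCH_NAMES[MODEL_ARCH.NEMOTRON_H])                 # nemotron_h
print([t.name for t in MODEL_TENSORS[MODEL_ARCH.NEMOTRON_H]])  # ['TOKEN_EMBD', 'ATTN_Q', 'SSM_IN', 'FFN_DOWN']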

gguf-py/gguf/tensor_mapping.py

Lines changed: 0 additions & 6 deletions
@@ -191,7 +190,6 @@ class TensorNameMap:
             "model.layers.{bid}.self_attn.q_proj",   # llama4
             "model.transformer.blocks.{bid}.q_proj", # llada
             "layers.{bid}.self_attn.q_proj",         # qwen3-embedding
-            "backbone.layers.{bid}.mixer.q_proj",    # nemotron-h
         ),
 
         # Attention key
@@ -210,7 +209,6 @@
             "model.layers.{bid}.self_attn.k_proj",   # llama4
             "model.transformer.blocks.{bid}.k_proj", # llada
             "layers.{bid}.self_attn.k_proj",         # qwen3-embedding
-            "backbone.layers.{bid}.mixer.k_proj",    # nemotron-h
         ),
 
         # Attention value
@@ -228,7 +226,6 @@
             "model.layers.{bid}.self_attn.v_proj",   # llama4
             "model.transformer.blocks.{bid}.v_proj", # llada
             "layers.{bid}.self_attn.v_proj",         # qwen3-embedding
-            "backbone.layers.{bid}.mixer.v_proj",    # nemotron-h
         ),
 
         # Attention output
@@ -263,7 +260,6 @@
             "transformer_encoder.{bid}.wo",              # neobert
             "model.transformer.blocks.{bid}.attn_out",   # llada
             "layers.{bid}.self_attn.o_proj",             # qwen3-embedding
-            "backbone.layers.{bid}.mixer.o_proj",        # nemotron-h
         ),
 
         # Attention output norm
@@ -391,7 +387,6 @@
             "model.layers.{bid}.block_sparse_moe.up",    # smallthinker
             "model.transformer.blocks.{bid}.up_proj",    # llada
             "layers.{bid}.mlp.up_proj",                  # qwen3-embedding
-            "backbone.layers.{bid}.mixer.up_proj",       # nemotron-h
         ),
 
         MODEL_TENSOR.FFN_UP_EXP: (
@@ -485,7 +480,6 @@
             "model.layers.{bid}.block_sparse_moe.down",  # smallthinker
             "model.transformer.blocks.{bid}.ff_out",     # llada
             "layers.{bid}.mlp.down_proj",                # qwen3-embedding
-            "backbone.layers.{bid}.mixer.down_proj",     # nemotron-h
         ),
 
         MODEL_TENSOR.FFN_DOWN_EXP: (
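
These six hunks all drop the nemotron-h source names from TensorNameMap, which resolves a checkpoint tensor name containing a block index to the corresponding GGUF tensor name. A much-simplified sketch of that kind of lookup is shown below; the two-entry table and the helper function are illustrative only, not the real gguf-py implementation.

# Simplified sketch of a TensorNameMap-style lookup: substitute the block
# index into each known source template and return the GGUF-side name.
# The table is a tiny illustrative subset, not the real mapping.
HF_TO_GGUF = {
    "model.layers.{bid}.self_attn.q_proj": "blk.{bid}.attn_q",
    "backbone.layers.{bid}.mixer.q_proj":  "blk.{bid}.attn_q",  # nemotron-h entry removed by this revert
}


def map_tensor_name(hf_name: str, n_blocks: int) -> str | None:
    for src_template, gguf_template in HF_TO_GGUF.items():
        for bid in range(n_blocks):
            if src_template.format(bid=bid) == hf_name:
                return gguf_template.format(bid=bid)
    return None  # no template matched


print(map_tensor_name("backbone.layers.3.mixer.q_proj", n_blocks=8))  # blk.3.attn_q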

src/llama-arch.cpp

Lines changed: 0 additions & 27 deletions
@@ -69,7 +69,6 @@ static const std::map<llm_arch, const char *> LLM_ARCH_NAMES = {
     { LLM_ARCH_T5ENCODER,        "t5encoder"        },
     { LLM_ARCH_JAIS,             "jais"             },
     { LLM_ARCH_NEMOTRON,         "nemotron"         },
-    { LLM_ARCH_NEMOTRON_H,       "nemotron_h"       },
     { LLM_ARCH_EXAONE,           "exaone"           },
     { LLM_ARCH_EXAONE4,          "exaone4"          },
     { LLM_ARCH_RWKV6,            "rwkv6"            },
@@ -1551,31 +1550,6 @@ static const std::map<llm_arch, std::map<llm_tensor, const char *>> LLM_TENSOR_N
             { LLM_TENSOR_FFN_UP,          "blk.%d.ffn_up" },
         },
     },
-    {
-        LLM_ARCH_NEMOTRON_H,
-        {
-            { LLM_TENSOR_TOKEN_EMBD,      "token_embd" },
-            { LLM_TENSOR_OUTPUT_NORM,     "output_norm" },
-            { LLM_TENSOR_OUTPUT,          "output" },
-            { LLM_TENSOR_ATTN_NORM,       "blk.%d.attn_norm" },
-            // mamba(2) ssm layers
-            { LLM_TENSOR_SSM_IN,          "blk.%d.ssm_in" },
-            { LLM_TENSOR_SSM_CONV1D,      "blk.%d.ssm_conv1d" },
-            { LLM_TENSOR_SSM_DT,          "blk.%d.ssm_dt" },
-            { LLM_TENSOR_SSM_A,           "blk.%d.ssm_a" },
-            { LLM_TENSOR_SSM_D,           "blk.%d.ssm_d" },
-            { LLM_TENSOR_SSM_NORM,        "blk.%d.ssm_norm" },
-            { LLM_TENSOR_SSM_OUT,         "blk.%d.ssm_out" },
-            // attention layers
-            { LLM_TENSOR_ATTN_Q,          "blk.%d.attn_q" },
-            { LLM_TENSOR_ATTN_K,          "blk.%d.attn_k" },
-            { LLM_TENSOR_ATTN_V,          "blk.%d.attn_v" },
-            { LLM_TENSOR_ATTN_OUT,        "blk.%d.attn_output" },
-            // dense FFN
-            { LLM_TENSOR_FFN_DOWN,        "blk.%d.ffn_down" },
-            { LLM_TENSOR_FFN_UP,          "blk.%d.ffn_up" },
-        },
-    },
     {
         LLM_ARCH_EXAONE,
         {
@@ -2381,7 +2355,6 @@ bool llm_arch_is_hybrid(const llm_arch & arch) {
         case LLM_ARCH_PLAMO2:
         case LLM_ARCH_GRANITE_HYBRID:
         case LLM_ARCH_LFM2:
-        case LLM_ARCH_NEMOTRON_H:
            return true;
        default:
            return false;

src/llama-arch.h

Lines changed: 0 additions & 1 deletion
@@ -73,7 +73,6 @@ enum llm_arch {
     LLM_ARCH_T5ENCODER,
     LLM_ARCH_JAIS,
     LLM_ARCH_NEMOTRON,
-    LLM_ARCH_NEMOTRON_H,
     LLM_ARCH_EXAONE,
     LLM_ARCH_EXAONE4,
     LLM_ARCH_RWKV6,

src/llama-model-loader.cpp

Lines changed: 3 additions & 1 deletion
@@ -793,7 +793,9 @@ const struct ggml_tensor * llama_model_loader::check_tensor_dims(const std::stri
 }
 
 struct ggml_tensor * llama_model_loader::create_tensor(struct ggml_context * ctx, const std::string & name, const std::initializer_list<int64_t> & ne, int flags) {
-    // LLAMA_LOG_DEBUG("%s: loading tensor %s\n", __func__, name.c_str());
+
+    // LLAMA_LOG_DEBUG("%s: loading tensor %s\n", __func__, name.c_str());
+
     const struct ggml_tensor * cur = check_tensor_dims(name, ne, !(flags & TENSOR_NOT_REQUIRED));
 
     if (cur == NULL) {
