Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
99 changes: 99 additions & 0 deletions convert_hf_to_gguf.py
Original file line number Diff line number Diff line change
Expand Up @@ -192,7 +192,7 @@
logger.info(f"Using remote model with HuggingFace id: {remote_hf_model_id}")
remote_tensors = gguf.utility.SafetensorRemote.get_list_tensors_hf_model(remote_hf_model_id)
for name, remote_tensor in remote_tensors.items():
tensors[name] = lambda r=remote_tensor: LazyTorchTensor.from_remote_tensor(r)

Check failure on line 195 in convert_hf_to_gguf.py

View workflow job for this annotation

GitHub Actions / pyright type-check

"LazyTorchTensor" is not defined (reportUndefinedVariable)

Check failure on line 195 in convert_hf_to_gguf.py

View workflow job for this annotation

GitHub Actions / pyright type-check

"LazyTorchTensor" is not defined (reportUndefinedVariable)

return tensors

Expand Down Expand Up @@ -239,14 +239,14 @@
if is_safetensors:
data: gguf.utility.LocalTensor = model_part[name]
if self.lazy:
data_gen = lambda data=data: LazyTorchTensor.from_local_tensor(data) # noqa: E731

Check failure on line 242 in convert_hf_to_gguf.py

View workflow job for this annotation

GitHub Actions / pyright type-check

"LazyTorchTensor" is not defined (reportUndefinedVariable)

Check failure on line 242 in convert_hf_to_gguf.py

View workflow job for this annotation

GitHub Actions / pyright type-check

"LazyTorchTensor" is not defined (reportUndefinedVariable)
else:
dtype = LazyTorchTensor._dtype_str_map[data.dtype]

Check failure on line 244 in convert_hf_to_gguf.py

View workflow job for this annotation

GitHub Actions / pyright type-check

"LazyTorchTensor" is not defined (reportUndefinedVariable)

Check failure on line 244 in convert_hf_to_gguf.py

View workflow job for this annotation

GitHub Actions / pyright type-check

"LazyTorchTensor" is not defined (reportUndefinedVariable)
data_gen = lambda data=data, dtype=dtype: torch.from_numpy(data.mmap_bytes()).view(dtype).reshape(data.shape) # noqa: E731
else:
data_torch: Tensor = model_part[name]
if self.lazy:
data_gen = lambda data=data_torch: LazyTorchTensor.from_eager(data) # noqa: E731

Check failure on line 249 in convert_hf_to_gguf.py

View workflow job for this annotation

GitHub Actions / pyright type-check

"LazyTorchTensor" is not defined (reportUndefinedVariable)

Check failure on line 249 in convert_hf_to_gguf.py

View workflow job for this annotation

GitHub Actions / pyright type-check

"LazyTorchTensor" is not defined (reportUndefinedVariable)
else:
data_gen = lambda data=data_torch: data # noqa: E731
tensors[name] = data_gen
Expand Down Expand Up @@ -312,7 +312,7 @@
pack_factor = pack_dtype_bits // bits
wf = torch.tensor(list(range(0, pack_dtype_bits, bits)), dtype=torch.int32).unsqueeze(0)
if self.lazy:
wf = LazyTorchTensor.from_eager(wf)

Check failure on line 315 in convert_hf_to_gguf.py

View workflow job for this annotation

GitHub Actions / pyright type-check

"LazyTorchTensor" is not defined (reportUndefinedVariable)

Check failure on line 315 in convert_hf_to_gguf.py

View workflow job for this annotation

GitHub Actions / pyright type-check

"LazyTorchTensor" is not defined (reportUndefinedVariable)

zeros = torch.bitwise_right_shift(
qzeros.unsqueeze(2).expand(-1, -1, pack_factor),
Expand Down Expand Up @@ -349,7 +349,7 @@

shifts = torch.arange(0, 32 - (num_bits - 1), num_bits, dtype=torch.int32)
if self.lazy:
shifts = LazyTorchTensor.from_eager(shifts)

Check failure on line 352 in convert_hf_to_gguf.py

View workflow job for this annotation

GitHub Actions / pyright type-check

"LazyTorchTensor" is not defined (reportUndefinedVariable)

Check failure on line 352 in convert_hf_to_gguf.py

View workflow job for this annotation

GitHub Actions / pyright type-check

"LazyTorchTensor" is not defined (reportUndefinedVariable)

if zero_point is None:
offset = 1 << (num_bits - 1)
Expand Down Expand Up @@ -765,7 +765,7 @@
def __init__(self, *args, **kwargs):
super().__init__(*args, **kwargs)
if not self.is_mistral_format:
self.hf_arch = get_model_architecture(self.hparams, self.model_type)

Check failure on line 768 in convert_hf_to_gguf.py

View workflow job for this annotation

GitHub Actions / pyright type-check

"get_model_architecture" is not defined (reportUndefinedVariable)

Check failure on line 768 in convert_hf_to_gguf.py

View workflow job for this annotation

GitHub Actions / pyright type-check

"get_model_architecture" is not defined (reportUndefinedVariable)
else:
self.hf_arch = ""

Expand Down Expand Up @@ -1719,7 +1719,7 @@
"Using a Mistral community chat template. These templates can be subject to errors in early days or weeks after a release. "
"Mistral recommends to use `mistral-common` to perform tokenization and detokenization."
)
template = MistralModel.get_community_chat_template(vocab, template_dir, self.is_mistral_format)

Check failure on line 1722 in convert_hf_to_gguf.py

View workflow job for this annotation

GitHub Actions / pyright type-check

"MistralModel" is not defined (reportUndefinedVariable)

Check failure on line 1722 in convert_hf_to_gguf.py

View workflow job for this annotation

GitHub Actions / pyright type-check

"MistralModel" is not defined (reportUndefinedVariable)
else:
logger.info("Not using a Mistral local or community chat template. Ensure to perform the tokenization and detokenization via `mistral-common`.")
template = None
Expand Down Expand Up @@ -5086,9 +5086,23 @@
super().set_gguf_parameters()
self.gguf_writer.add_vocab_size(self.hparams["vocab_size"])

<<<<<<< HEAD

Check failure on line 5089 in convert_hf_to_gguf.py

View workflow job for this annotation

GitHub Actions / pyright type-check

Expected expression

Check failure on line 5089 in convert_hf_to_gguf.py

View workflow job for this annotation

GitHub Actions / pyright type-check

Expected expression
if (score_func := self.find_hparam(["moe_router_activation_func"], optional=True)) is not None:

Check failure on line 5090 in convert_hf_to_gguf.py

View workflow job for this annotation

GitHub Actions / pyright type-check

Unexpected indentation

Check failure on line 5090 in convert_hf_to_gguf.py

View workflow job for this annotation

GitHub Actions / pyright type-check

Unexpected indentation
if score_func == "sigmoid":
self.gguf_writer.add_expert_gating_func(gguf.ExpertGatingFuncType.SIGMOID)
elif score_func == "softmax":
self.gguf_writer.add_expert_gating_func(gguf.ExpertGatingFuncType.SOFTMAX)
else:
raise ValueError(f"Unsupported expert score gating function value: {score_func}")

# KDA & MLA params
# Get ssm_d_conv from linear_attn_config.short_conv_kernel_size or ssm_d_conv
linear_attn_config = self.hparams.get("linear_attn_config", {})
=======
# KDA & MLA params
# Get ssm_d_conv from linear_attn_config.short_conv_kernel_size or ssm_d_conv
linear_attn_config = self.hparams["linear_attn_config"]
>>>>>>> 3688c4f504f8e336663157bcc6e0af78d617420c
# n_head == 0 for KDA layers, n_head > 0 for MLA layers
# full_attention_layers list will be used to distinguish layer type
_num_kv_heads = list()
Expand All @@ -5108,15 +5122,34 @@

# MLA params - use add_* methods that handle arch substitution
# Support both HuggingFace naming (q_lora_rank, kv_lora_rank) and internal naming (n_lora_q, n_lora_kv)
<<<<<<< HEAD
if (q_lora_rank := self.find_hparam(["q_lora_rank", "n_lora_q"], optional=False)) is not None:
self.gguf_writer.add_q_lora_rank(q_lora_rank)
if (kv_lora_rank := self.find_hparam(["kv_lora_rank", "n_lora_kv"], optional=False)) is not None:
self.gguf_writer.add_kv_lora_rank(kv_lora_rank)
=======
if (q_lora_rank := self.find_hparam(["q_lora_rank", "n_lora_q"], optional=True)) is not None:
self.gguf_writer.add_q_lora_rank(q_lora_rank)
# To enable MLA KV cache, MLA needs to be converted into MQA with larger heads, then decompressed to MHA
kv_lora_rank = self.find_hparam(["kv_lora_rank", "n_lora_kv"], optional=False)
self.gguf_writer.add_kv_lora_rank(kv_lora_rank)
>>>>>>> 3688c4f504f8e336663157bcc6e0af78d617420c

# MLA head dimensions
# Support HuggingFace naming: qk_nope_head_dim, qk_rope_head_dim, v_head_dim
qk_nope_head_dim = self.hparams.get("qk_nope_head_dim")
<<<<<<< HEAD
qk_rope_head_dim = self.hparams.get("qk_rope_head_dim")
v_head_dim = self.hparams.get("v_head_dim")
# To enable MLA KV cache, MLA needs to be converted into MQA with larger heads, then decompressed to MHA
self.gguf_writer.add_key_length(self.hparams["kv_lora_rank"] + self.hparams["qk_rope_head_dim"])
self.gguf_writer.add_value_length(self.hparams["kv_lora_rank"])

# Calculate n_embd_head_k_mla = qk_nope_head_dim + qk_rope_head_dim
if "n_embd_head_k_mla" in self.hparams:
self.gguf_writer.add_key_length_mla(self.hparams["n_embd_head_k_mla"])
elif qk_nope_head_dim is not None and qk_rope_head_dim is not None:
=======
# Rotation - use qk_rope_head_dim for Kimi
qk_rope_head_dim = self.find_hparam(["qk_rope_head_dim", "n_rot"], optional=False)
self.gguf_writer.add_rope_dimension_count(qk_rope_head_dim)
Expand All @@ -5127,10 +5160,46 @@
if (n_embd_head_k_mla := self.find_hparam(["n_embd_head_k_mla"], optional=True)) is not None:
self.gguf_writer.add_key_length_mla(n_embd_head_k_mla)
elif qk_nope_head_dim is not None:
>>>>>>> 3688c4f504f8e336663157bcc6e0af78d617420c
n_embd_head_k_mla = qk_nope_head_dim + qk_rope_head_dim
self.gguf_writer.add_key_length_mla(n_embd_head_k_mla)

# n_embd_head_v_mla = v_head_dim
<<<<<<< HEAD
if "n_embd_head_v_mla" in self.hparams:
self.gguf_writer.add_value_length_mla(self.hparams["n_embd_head_v_mla"])
elif v_head_dim is not None:
self.gguf_writer.add_value_length_mla(v_head_dim)

# Rotation - use qk_rope_head_dim for Kimi
if (rope_dim := self.find_hparam(["qk_rope_head_dim", "n_rot"], optional=True)) is not None:
self.gguf_writer.add_rope_dimension_count(rope_dim)
else:
# Default to head_dim
head_dim = self.hparams["hidden_size"] // self.hparams["num_attention_heads"]
self.gguf_writer.add_rope_dimension_count(head_dim)

if (n_experts := self.find_hparam(["num_experts"], optional=False)) is not None:
self.gguf_writer.add_expert_count(n_experts)
if (n_experts_used := self.find_hparam(["num_experts_per_token"], optional=False)) is not None:
self.gguf_writer.add_expert_used_count(n_experts_used)

# moe_intermediate_size (1024 for Kimi)
if (moe_intermediate_size := self.find_hparam(["moe_intermediate_size"], optional=False)) is not None:
self.gguf_writer.add_expert_feed_forward_length(moe_intermediate_size)

# num_shared_experts (1 for Kimi)
if (num_shared_experts := self.find_hparam(["num_shared_experts"], optional=False)) is not None:
self.gguf_writer.add_expert_shared_count(num_shared_experts)

# first_k_dense_replace (1 for Kimi - first layer uses dense MLP)
if (first_k_dense_replace := self.find_hparam(["first_k_dense_replace"])) is not None:
self.gguf_writer.add_leading_dense_block_count(first_k_dense_replace)

# Routed scaling factor (expert_weights_scale = 2.446 for Kimi)
if (routed_scaling_factor := self.find_hparam(["routed_scaling_factor"], optional=False)) is not None:
self.gguf_writer.add_expert_weights_scale(routed_scaling_factor)
=======
if (n_embd_head_v_mla := self.hparams.get("n_embd_head_v_mla")) is not None:
self.gguf_writer.add_value_length_mla(n_embd_head_v_mla)
elif v_head_dim is not None:
Expand All @@ -5144,6 +5213,7 @@
self.gguf_writer.add_leading_dense_block_count(self.hparams["first_k_dense_replace"])
# Routed scaling factor (expert_weights_scale = 2.446 for Kimi)
self.gguf_writer.add_expert_weights_scale(self.hparams["routed_scaling_factor"])
>>>>>>> 3688c4f504f8e336663157bcc6e0af78d617420c

def prepare_tensors(self):
super().prepare_tensors()
Expand Down Expand Up @@ -5189,7 +5259,11 @@

# process the experts separately
if name.find("block_sparse_moe.experts") != -1:
<<<<<<< HEAD
n_experts = self.hparams.get("num_local_experts", self.hparams.get("num_experts"))
=======
n_experts = self.find_hparam(["num_local_experts", "num_experts"], optional=False)
>>>>>>> 3688c4f504f8e336663157bcc6e0af78d617420c
assert bid is not None

if self._experts is None:
Expand All @@ -5199,6 +5273,10 @@

if len(self._experts[bid]) >= n_experts * 3:
# merge the experts into a single 3d tensor
<<<<<<< HEAD
tensors = []
=======
>>>>>>> 3688c4f504f8e336663157bcc6e0af78d617420c
# w1: gate, w2: down, w3: up
for wid, tname in [("w1", gguf.MODEL_TENSOR.FFN_GATE_EXP),
("w2", gguf.MODEL_TENSOR.FFN_DOWN_EXP),
Expand All @@ -5208,28 +5286,49 @@
ename = f"model.layers.{bid}.block_sparse_moe.experts.{xid}.{wid}.weight"
datas.append(self._experts[bid][ename])
del self._experts[bid][ename]
<<<<<<< HEAD

data_torch = torch.stack(datas, dim=0)
new_name = self.format_tensor_name(tname, bid)
tensors.append((new_name, data_torch))
return tensors
return []
=======
data_torch = torch.stack(datas, dim=0)
new_name = self.format_tensor_name(tname, bid)
yield from super().modify_tensors(data_torch, new_name, bid)
return
>>>>>>> 3688c4f504f8e336663157bcc6e0af78d617420c

# note: MLA with the absorption optimization, needs these two split and k_b_proj transposed
if name.endswith("kv_b_proj.weight"):
name_kb = name.replace("kv_b_proj", "k_b_proj")
name_vb = name.replace("kv_b_proj", "v_b_proj")
n_head_kv = self.hparams["num_key_value_heads"]
<<<<<<< HEAD
v_head_dim = self.hparams["v_head_dim"]
=======
v_head_dim = self.find_hparam(["n_embd_head_v_mla", "v_head_dim"], optional=False)
>>>>>>> 3688c4f504f8e336663157bcc6e0af78d617420c
qk_nope_head_dim = self.hparams["qk_nope_head_dim"]
logger.info("Split kv_b n_head_kv %d\n" % n_head_kv)
assert data_torch.shape[0] == n_head_kv * (v_head_dim + qk_nope_head_dim)
kv_b = data_torch.view(n_head_kv, v_head_dim + qk_nope_head_dim, data_torch.shape[-1])
k_b, v_b = torch.split(kv_b, [qk_nope_head_dim, v_head_dim], dim=1)
k_b = k_b.transpose(1, 2)
<<<<<<< HEAD
return [(self.map_tensor_name(name_kb), k_b), (self.map_tensor_name(name_vb), v_b)]

mapped_name = self.map_tensor_name(name)
logger.info(f"Returning {mapped_name}: shape after = {tuple(data_torch.shape)}")
return [(mapped_name, data_torch)]
=======
yield from super().modify_tensors(k_b, name_kb, bid)
yield from super().modify_tensors(v_b, name_vb, bid)
return

yield from super().modify_tensors(data_torch, name, bid)
>>>>>>> 3688c4f504f8e336663157bcc6e0af78d617420c


@ModelBase.register("InternLM2ForCausalLM")
Expand Down
1 change: 1 addition & 0 deletions src/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -141,6 +141,7 @@ add_library(llama
models/xverse.cpp
models/mistral3.cpp
models/graph-context-mamba.cpp
models/graph-context-delta.cpp
)

set_target_properties(llama PROPERTIES
Expand Down
Loading
Loading