Commit abd7d51

Author: Siyuan Feng
Support Qwen2-MoE Architecture (#2089)
1 parent c1628dd

File tree

8 files changed (+595 -11 lines)

python/mlc_llm/model/model.py

Lines changed: 15 additions & 0 deletions
@@ -27,6 +27,7 @@
 from .phi3 import phi3_loader, phi3_model, phi3_quantization
 from .qwen import qwen_loader, qwen_model, qwen_quantization
 from .qwen2 import qwen2_loader, qwen2_model, qwen2_quantization
+from .qwen2_moe import qwen2_moe_loader, qwen2_moe_model, qwen2_moe_quantization
 from .rwkv5 import rwkv5_loader, rwkv5_model, rwkv5_quantization
 from .rwkv6 import rwkv6_loader, rwkv6_model, rwkv6_quantization
 from .stable_lm import stablelm_loader, stablelm_model, stablelm_quantization
@@ -246,6 +247,20 @@ class Model:
             "ft-quant": qwen2_quantization.ft_quant,
         },
     ),
+    "qwen2_moe": Model(
+        name="qwen2_moe",
+        model=qwen2_moe_model.Qwen2MoeForCausalLM,
+        config=qwen2_moe_model.Qwen2MoeConfig,
+        source={
+            "huggingface-torch": qwen2_moe_loader.huggingface,
+            "huggingface-safetensor": qwen2_moe_loader.huggingface,
+        },
+        quantize={
+            "no-quant": qwen2_moe_quantization.no_quant,
+            "group-quant": qwen2_moe_quantization.group_quant,
+            "ft-quant": qwen2_moe_quantization.ft_quant,
+        },
+    ),
     "stablelm": Model(
         name="stablelm",
         model=stablelm_model.StableLmForCausalLM,
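
With this entry in place, "qwen2_moe" resolves through the same registry as every other architecture. Below is a minimal lookup sketch; it assumes the registry dict in model.py is named MODELS, which is outside this hunk's context:

# Hypothetical lookup sketch; assumes the registry dict in
# python/mlc_llm/model/model.py is named MODELS (not visible in the hunk above).
from mlc_llm.model.model import MODELS

qwen2_moe = MODELS["qwen2_moe"]
print(qwen2_moe.name)              # "qwen2_moe"
print(sorted(qwen2_moe.quantize))  # ["ft-quant", "group-quant", "no-quant"]
print(sorted(qwen2_moe.source))    # ["huggingface-safetensor", "huggingface-torch"]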

python/mlc_llm/model/model_preset.py

Lines changed: 33 additions & 0 deletions
@@ -449,6 +449,39 @@
         "use_sliding_window": False,
         "vocab_size": 151936,
     },
+    "qwen2moe": {
+        "architectures": ["Qwen2MoeForCausalLM"],
+        "attention_dropout": 0.0,
+        "bos_token_id": 151643,
+        "eos_token_id": 151645,
+        "hidden_act": "silu",
+        "hidden_size": 2048,
+        "initializer_range": 0.02,
+        "intermediate_size": 5632,
+        "max_position_embeddings": 32768,
+        "max_window_layers": 21,
+        "model_type": "qwen2_moe",
+        "num_attention_heads": 16,
+        "num_hidden_layers": 24,
+        "num_key_value_heads": 16,
+        "rms_norm_eps": 1e-06,
+        "rope_theta": 1000000.0,
+        "sliding_window": 32768,
+        "tie_word_embeddings": False,
+        "torch_dtype": "bfloat16",
+        "transformers_version": "4.39.0.dev0",
+        "use_cache": True,
+        "use_sliding_window": False,
+        "vocab_size": 151936,
+        "decoder_sparse_step": 1,
+        "moe_intermediate_size": 1408,
+        "shared_expert_intermediate_size": 5632,
+        "num_experts_per_tok": 4,
+        "num_experts": 60,
+        "norm_topk_prob": False,
+        "output_router_logits": False,
+        "router_aux_loss_coef": 0.001,
+    },
     "stablelm": {
         "architectures": ["StableLmForCausalLM"],
         "bos_token_id": 0,

python/mlc_llm/model/qwen2_moe/__init__.py

Whitespace-only changes.
python/mlc_llm/model/qwen2_moe/qwen2_moe_loader.py

Lines changed: 130 additions & 0 deletions
"""
This file specifies how MLC's Qwen2-MoE parameters map from other formats, for example
HuggingFace PyTorch, HuggingFace safetensors.
"""

import functools

import numpy as np

from mlc_llm.loader import ExternMapping
from mlc_llm.quantization import Quantization

from .qwen2_moe_model import Qwen2MoeConfig, Qwen2MoeForCausalLM


def huggingface(model_config: Qwen2MoeConfig, quantization: Quantization) -> ExternMapping:
    """Returns a parameter mapping that maps from the names of MLC LLM parameters to
    the names of HuggingFace PyTorch parameters.

    Parameters
    ----------
    model_config : Qwen2MoeConfig
        The configuration of the Qwen2-MoE model.

    quantization : Quantization
        The quantization configuration.

    Returns
    -------
    param_map : ExternMapping
        The parameter mapping from MLC to HuggingFace PyTorch.
    """
    model = Qwen2MoeForCausalLM(model_config)
    if quantization is not None:
        model.to(quantization.model_dtype)
    _, _named_params, _ = model.export_tvm(  # type: ignore[misc]
        spec=model.get_default_spec(),
        allow_extern=True,
    )
    named_parameters = dict(_named_params)

    mapping = ExternMapping()

    for i in range(model_config.num_hidden_layers):
        # map attention weight
        attn = f"model.layers.{i}.self_attn"
        for weight_type in ["weight", "bias"]:
            mlc_name = f"{attn}.c_attn.{weight_type}"
            mlc_param = named_parameters[mlc_name]
            mapping.add_mapping(
                mlc_name,
                [
                    f"{attn}.q_proj.{weight_type}",
                    f"{attn}.k_proj.{weight_type}",
                    f"{attn}.v_proj.{weight_type}",
                ],
                functools.partial(
                    lambda q, k, v, dtype: np.concatenate([q, k, v], axis=0).astype(dtype),
                    dtype=mlc_param.dtype,
                ),
            )
        # map mlp shared expert weight
        mlp = f"model.layers.{i}.mlp"
        shared_expert = f"{mlp}.shared_expert"
        mlc_name = f"{shared_expert}.gate_up_proj.weight"
        mlc_param = named_parameters[mlc_name]
        mapping.add_mapping(
            mlc_name,
            [
                f"{shared_expert}.gate_proj.weight",
                f"{shared_expert}.up_proj.weight",
            ],
            functools.partial(
                lambda gate, up, dtype: np.concatenate([gate, up], axis=0).astype(dtype),
                dtype=mlc_param.dtype,
            ),
        )
        # map mlp moe gate and up weight
        mlc_name = f"{mlp}.moe_gate_up_proj.weight"
        mlc_param = named_parameters[mlc_name]

        def combine_expert_gate_up(*hf_params, dtype):
            stack = []
            for i in range(0, len(hf_params), 2):
                stack.append(np.concatenate([hf_params[i], hf_params[i + 1]], axis=0))
            return np.stack(stack, axis=0).astype(dtype)

        mapping.add_mapping(
            mlc_name,
            functools.reduce(
                lambda a, b: a + b,
                [
                    [
                        f"{mlp}.experts.{expert_id}.gate_proj.weight",
                        f"{mlp}.experts.{expert_id}.up_proj.weight",
                    ]
                    for expert_id in range(model_config.num_experts)
                ],
            ),
            functools.partial(
                combine_expert_gate_up,
                dtype=mlc_param.dtype,
            ),
        )

        # map mlp moe down weight
        mlc_name = f"{mlp}.moe_down_proj.weight"
        mlc_param = named_parameters[mlc_name]
        mapping.add_mapping(
            mlc_name,
            [
                f"{mlp}.experts.{expert_id}.down_proj.weight"
                for expert_id in range(model_config.num_experts)
            ],
            functools.partial(
                lambda *hf_params, dtype: np.stack(hf_params, axis=0).astype(dtype),
                dtype=mlc_param.dtype,
            ),
        )

    for mlc_name, mlc_param in named_parameters.items():
        if mlc_name not in mapping.param_map:
            mapping.add_mapping(
                mlc_name,
                [mlc_name],
                functools.partial(
                    lambda x, dtype: x.astype(dtype),
                    dtype=mlc_param.dtype,
                ),
            )
    return mapping
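
To make the expert-weight layout concrete, here is a small self-contained numpy sketch of what combine_expert_gate_up produces. The dimensions are toy values chosen for illustration, not the real preset's 2048/1408/60:

import numpy as np

# Toy dimensions for illustration only.
hidden_size, moe_inter, num_experts = 8, 4, 3

# Per-expert HuggingFace weights: gate_proj/up_proj each [moe_inter, hidden_size].
gate = [np.random.rand(moe_inter, hidden_size).astype("float32") for _ in range(num_experts)]
up = [np.random.rand(moe_inter, hidden_size).astype("float32") for _ in range(num_experts)]

# Interleave [gate_0, up_0, gate_1, up_1, ...] as the source list in the loader
# does, then apply the same combine logic as combine_expert_gate_up.
hf_params = [w for pair in zip(gate, up) for w in pair]
stacked = np.stack(
    [
        np.concatenate([hf_params[i], hf_params[i + 1]], axis=0)
        for i in range(0, len(hf_params), 2)
    ],
    axis=0,
)
print(stacked.shape)  # (3, 8, 8) -> [num_experts, 2 * moe_inter, hidden_size]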
