Commit abd7d51

Author: Siyuan Feng
Support Qwen2-MoE Architecture (#2089)
1 parent c1628dd

File tree

8 files changed (+595 -11 lines)

python/mlc_llm/model/model.py

Lines changed: 15 additions & 0 deletions
@@ -27,6 +27,7 @@
 from .phi3 import phi3_loader, phi3_model, phi3_quantization
 from .qwen import qwen_loader, qwen_model, qwen_quantization
 from .qwen2 import qwen2_loader, qwen2_model, qwen2_quantization
+from .qwen2_moe import qwen2_moe_loader, qwen2_moe_model, qwen2_moe_quantization
 from .rwkv5 import rwkv5_loader, rwkv5_model, rwkv5_quantization
 from .rwkv6 import rwkv6_loader, rwkv6_model, rwkv6_quantization
 from .stable_lm import stablelm_loader, stablelm_model, stablelm_quantization
@@ -246,6 +247,20 @@ class Model:
             "ft-quant": qwen2_quantization.ft_quant,
         },
     ),
+    "qwen2_moe": Model(
+        name="qwen2_moe",
+        model=qwen2_moe_model.Qwen2MoeForCausalLM,
+        config=qwen2_moe_model.Qwen2MoeConfig,
+        source={
+            "huggingface-torch": qwen2_moe_loader.huggingface,
+            "huggingface-safetensor": qwen2_moe_loader.huggingface,
+        },
+        quantize={
+            "no-quant": qwen2_moe_quantization.no_quant,
+            "group-quant": qwen2_moe_quantization.group_quant,
+            "ft-quant": qwen2_moe_quantization.ft_quant,
+        },
+    ),
     "stablelm": Model(
         name="stablelm",
         model=stablelm_model.StableLmForCausalLM,
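
With this entry in place, "qwen2_moe" resolves through the same registry as every other architecture. Below is a minimal lookup sketch; it assumes the registry dict in model.py is named MODELS, which is outside this hunk's context:

# Hypothetical lookup sketch; assumes the registry dict in
# python/mlc_llm/model/model.py is named MODELS (not visible in the hunk above).
from mlc_llm.model.model import MODELS

qwen2_moe = MODELS["qwen2_moe"]
print(qwen2_moe.name)              # "qwen2_moe"
print(sorted(qwen2_moe.quantize))  # ["ft-quant", "group-quant", "no-quant"]
print(sorted(qwen2_moe.source))    # ["huggingface-safetensor", "huggingface-torch"]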

python/mlc_llm/model/model_preset.py

Lines changed: 33 additions & 0 deletions
@@ -449,6 +449,39 @@
         "use_sliding_window": False,
         "vocab_size": 151936,
     },
+    "qwen2moe": {
+        "architectures": ["Qwen2MoeForCausalLM"],
+        "attention_dropout": 0.0,
+        "bos_token_id": 151643,
+        "eos_token_id": 151645,
+        "hidden_act": "silu",
+        "hidden_size": 2048,
+        "initializer_range": 0.02,
+        "intermediate_size": 5632,
+        "max_position_embeddings": 32768,
+        "max_window_layers": 21,
+        "model_type": "qwen2_moe",
+        "num_attention_heads": 16,
+        "num_hidden_layers": 24,
+        "num_key_value_heads": 16,
+        "rms_norm_eps": 1e-06,
+        "rope_theta": 1000000.0,
+        "sliding_window": 32768,
+        "tie_word_embeddings": False,
+        "torch_dtype": "bfloat16",
+        "transformers_version": "4.39.0.dev0",
+        "use_cache": True,
+        "use_sliding_window": False,
+        "vocab_size": 151936,
+        "decoder_sparse_step": 1,
+        "moe_intermediate_size": 1408,
+        "shared_expert_intermediate_size": 5632,
+        "num_experts_per_tok": 4,
+        "num_experts": 60,
+        "norm_topk_prob": False,
+        "output_router_logits": False,
+        "router_aux_loss_coef": 0.001,
+    },
     "stablelm": {
         "architectures": ["StableLmForCausalLM"],
         "bos_token_id": 0,

python/mlc_llm/model/qwen2_moe/__init__.py

Whitespace-only changes.
python/mlc_llm/model/qwen2_moe/qwen2_moe_loader.py

Lines changed: 130 additions & 0 deletions
"""
This file specifies how MLC's Qwen2-MoE parameters map from other formats, for example
HuggingFace PyTorch, HuggingFace safetensors.
"""

import functools

import numpy as np

from mlc_llm.loader import ExternMapping
from mlc_llm.quantization import Quantization

from .qwen2_moe_model import Qwen2MoeConfig, Qwen2MoeForCausalLM


def huggingface(model_config: Qwen2MoeConfig, quantization: Quantization) -> ExternMapping:
    """Returns a parameter mapping that maps from the names of MLC LLM parameters to
    the names of HuggingFace PyTorch parameters.

    Parameters
    ----------
    model_config : Qwen2MoeConfig
        The configuration of the Qwen2-MoE model.

    quantization : Quantization
        The quantization configuration.

    Returns
    -------
    param_map : ExternMapping
        The parameter mapping from MLC to HuggingFace PyTorch.
    """
    model = Qwen2MoeForCausalLM(model_config)
    if quantization is not None:
        model.to(quantization.model_dtype)
    _, _named_params, _ = model.export_tvm(  # type: ignore[misc]
        spec=model.get_default_spec(),
        allow_extern=True,
    )
    named_parameters = dict(_named_params)

    mapping = ExternMapping()

    for i in range(model_config.num_hidden_layers):
        # map attention weight
        attn = f"model.layers.{i}.self_attn"
        for weight_type in ["weight", "bias"]:
            mlc_name = f"{attn}.c_attn.{weight_type}"
            mlc_param = named_parameters[mlc_name]
            mapping.add_mapping(
                mlc_name,
                [
                    f"{attn}.q_proj.{weight_type}",
                    f"{attn}.k_proj.{weight_type}",
                    f"{attn}.v_proj.{weight_type}",
                ],
                functools.partial(
                    lambda q, k, v, dtype: np.concatenate([q, k, v], axis=0).astype(dtype),
                    dtype=mlc_param.dtype,
                ),
            )
        # map mlp shared expert weight
        mlp = f"model.layers.{i}.mlp"
        shared_expert = f"{mlp}.shared_expert"
        mlc_name = f"{shared_expert}.gate_up_proj.weight"
        mlc_param = named_parameters[mlc_name]
        mapping.add_mapping(
            mlc_name,
            [
                f"{shared_expert}.gate_proj.weight",
                f"{shared_expert}.up_proj.weight",
            ],
            functools.partial(
                lambda gate, up, dtype: np.concatenate([gate, up], axis=0).astype(dtype),
                dtype=mlc_param.dtype,
            ),
        )
        # map mlp moe gate and up weight
        mlc_name = f"{mlp}.moe_gate_up_proj.weight"
        mlc_param = named_parameters[mlc_name]

        def combine_expert_gate_up(*hf_params, dtype):
            stack = []
            for i in range(0, len(hf_params), 2):
                stack.append(np.concatenate([hf_params[i], hf_params[i + 1]], axis=0))
            return np.stack(stack, axis=0).astype(dtype)

        mapping.add_mapping(
            mlc_name,
            functools.reduce(
                lambda a, b: a + b,
                [
                    [
                        f"{mlp}.experts.{expert_id}.gate_proj.weight",
                        f"{mlp}.experts.{expert_id}.up_proj.weight",
                    ]
                    for expert_id in range(model_config.num_experts)
                ],
            ),
            functools.partial(
                combine_expert_gate_up,
                dtype=mlc_param.dtype,
            ),
        )

        # map mlp moe down weight
        mlc_name = f"{mlp}.moe_down_proj.weight"
        mlc_param = named_parameters[mlc_name]
        mapping.add_mapping(
            mlc_name,
            [
                f"{mlp}.experts.{expert_id}.down_proj.weight"
                for expert_id in range(model_config.num_experts)
            ],
            functools.partial(
                lambda *hf_params, dtype: np.stack(hf_params, axis=0).astype(dtype),
                dtype=mlc_param.dtype,
            ),
        )

    for mlc_name, mlc_param in named_parameters.items():
        if mlc_name not in mapping.param_map:
            mapping.add_mapping(
                mlc_name,
                [mlc_name],
                functools.partial(
                    lambda x, dtype: x.astype(dtype),
                    dtype=mlc_param.dtype,
                ),
            )
    return mapping
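
To make the expert-weight layout concrete, here is a small self-contained numpy sketch of what combine_expert_gate_up produces. The dimensions are toy values chosen for illustration, not the real preset's 2048/1408/60:

import numpy as np

# Toy dimensions for illustration only.
hidden_size, moe_inter, num_experts = 8, 4, 3

# Per-expert HuggingFace weights: gate_proj/up_proj each [moe_inter, hidden_size].
gate = [np.random.rand(moe_inter, hidden_size).astype("float32") for _ in range(num_experts)]
up = [np.random.rand(moe_inter, hidden_size).astype("float32") for _ in range(num_experts)]

# Interleave [gate_0, up_0, gate_1, up_1, ...] as the source list in the loader
# does, then apply the same combine logic as combine_expert_gate_up.
hf_params = [w for pair in zip(gate, up) for w in pair]
stacked = np.stack(
    [
        np.concatenate([hf_params[i], hf_params[i + 1]], axis=0)
        for i in range(0, len(hf_params), 2)
    ],
    axis=0,
)
print(stacked.shape)  # (3, 8, 8) -> [num_experts, 2 * moe_inter, hidden_size]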
