[Metax] support default_v1 loader based #4988

StareAtYou · StareAtYou · commit ff3b7b27f93b · 2025-11-13T07:27:42.000Z
diff --git a/fastdeploy/model_executor/layers/backends/metax/moe/fused_moe_cutlass_metax_backend.py b/fastdeploy/model_executor/layers/backends/metax/moe/fused_moe_cutlass_metax_backend.py
@@ -23,14 +23,21 @@
 from fastdeploy.distributed.communication import tensor_model_parallel_all_reduce
 from fastdeploy.model_executor.layers.moe.fused_moe_backend_base import MoEMethodBase
 from fastdeploy.model_executor.layers.moe.moe import get_moe_scores
+from fastdeploy.model_executor.layers.quantization.weight_only import WeightOnlyConfig
 from fastdeploy.model_executor.layers.utils import get_tensor
 from fastdeploy.model_executor.ops.gpu import (
     fused_expert_moe,
     moe_expert_dispatch,
     moe_expert_ffn,
     moe_expert_reduce,
 )
-from fastdeploy.model_executor.utils import TensorTracker, free_tensor, set_weight_attrs
+from fastdeploy.model_executor.utils import (
+    TensorTracker,
+    free_tensor,
+    process_weight_transpose,
+    set_weight_attrs,
+    weight_fully_copied,
+)
 
 
 class MetaxCutlassMoEMethod(MoEMethodBase):
@@ -142,18 +149,11 @@ def apply_tp(
                 1.0,
             )
         else:
-            added_weight_attrs0 = getattr(layer, self.added_weight_attrs[0])
-            added_weight_attrs1 = getattr(layer, self.added_weight_attrs[1])
-
-            if self.quant_config.is_checkpoint_bf16 and layer.fd_config.load_config.load_choices == "default_v1":
-                added_weight_attrs0 = paddle.transpose(added_weight_attrs0, perm=[0, 2, 1])
-                added_weight_attrs1 = paddle.transpose(added_weight_attrs1, perm=[0, 2, 1])
-
             fused_moe_out = fused_expert_moe(
                 x,
                 gate.weight,
-                added_weight_attrs0,
-                added_weight_attrs1,
+                getattr(layer, self.added_weight_attrs[0]),
+                getattr(layer, self.added_weight_attrs[1]),
                 None,
                 (layer.up_gate_proj_weight_scale if hasattr(layer, "up_gate_proj_weight_scale") else None),
                 None,
@@ -177,7 +177,10 @@ class MetaxCutlassWeightOnlyMoEMethod(MetaxCutlassMoEMethod):
 
     def __init__(self, quant_config):
         super().__init__(quant_config)
-        self.quant_config = quant_config
+        if quant_config is None:
+            self.quant_config = WeightOnlyConfig(algo="weight_only_int8", is_checkpoint_bf16=True)
+        else:
+            self.quant_config = quant_config
         self.moe_quant_type = self.quant_config.algo
         self.pack_num = 1
         self.weight_only_linear_arch = os.getenv("FLAGS_weight_only_linear_arch")
@@ -252,33 +255,61 @@ def create_weights(self, layer: nn.Layer, **extra_weight_attrs):
             ]
         self.up_gate_proj_scale_shape = [layer.num_local_experts, layer.moe_intermediate_size * 2]
         self.down_proj_scale_shape = [layer.num_local_experts, layer.hidden_size]
+        self.model_format = extra_weight_attrs.get("model_format")
         # TODO(bukejiyu): remove v1 loader check when v0 loader is removed
         if self.quant_config.is_checkpoint_bf16 and layer.fd_config.load_config.load_choices == "default_v1":
+            if self.model_format != "torch":
+                up_gate_proj_weight_shape = [
+                    layer.num_local_experts,
+                    layer.hidden_size,
+                    layer.moe_intermediate_size * 2,
+                ]
+                down_proj_weight_shape = [layer.num_local_experts, layer.moe_intermediate_size, layer.hidden_size]
+                up_gate_proj_attrs = {
+                    **extra_weight_attrs,
+                    "tensor_track": TensorTracker(shape=up_gate_proj_weight_shape, output_dim=True),
+                }
+                down_proj_attrs = {
+                    **extra_weight_attrs,
+                    "tensor_track": TensorTracker(shape=down_proj_weight_shape, output_dim=False),
+                }
+            else:
+                up_gate_proj_weight_shape = [
+                    layer.num_local_experts,
+                    layer.moe_intermediate_size * 2,
+                    layer.hidden_size,
+                ]
+                down_proj_weight_shape = [layer.num_local_experts, layer.hidden_size, layer.moe_intermediate_size]
+                up_gate_proj_attrs = {
+                    **extra_weight_attrs,
+                    "tensor_track": TensorTracker(shape=up_gate_proj_weight_shape, output_dim=False),
+                    "SHARD_ID_TO_SHARDED_DIM": {"gate": 0, "down": 1, "up": 0},
+                }
+                down_proj_attrs = {
+                    **extra_weight_attrs,
+                    "tensor_track": TensorTracker(shape=down_proj_weight_shape, output_dim=True),
+                    "SHARD_ID_TO_SHARDED_DIM": {"gate": 0, "down": 1, "up": 0},
+                }
+
             layer.up_gate_proj_weight = layer.create_parameter(
-                shape=[layer.num_local_experts, layer.hidden_size, layer.moe_intermediate_size * 2],
+                shape=up_gate_proj_weight_shape,
                 dtype=layer.weight_dtype,
                 default_initializer=paddle.nn.initializer.Constant(0),
             )
 
             layer.down_proj_weight = layer.create_parameter(
-                shape=[layer.num_local_experts, layer.moe_intermediate_size, layer.hidden_size],
+                shape=down_proj_weight_shape,
                 dtype=layer.weight_dtype,
                 default_initializer=paddle.nn.initializer.Constant(0),
             )
-            extra_weight_attrs["weight_need_transpose"] = extra_weight_attrs.get("model_format") == "torch"
+            # extra_weight_attrs["weight_need_transpose"] = extra_weight_attrs.get("model_format") == "torch"
             set_weight_attrs(
                 layer.up_gate_proj_weight,
-                {
-                    **extra_weight_attrs,
-                    "tensor_track": TensorTracker(shape=layer.up_gate_proj_weight.shape, output_dim=True),
-                },
+                up_gate_proj_attrs,
             )
             set_weight_attrs(
                 layer.down_proj_weight,
-                {
-                    **extra_weight_attrs,
-                    "tensor_track": TensorTracker(shape=layer.down_proj_weight.shape, output_dim=False),
-                },
+                down_proj_attrs,
             )
         else:
             self.weight_dtype = "int8"
@@ -325,7 +356,7 @@ def create_weights(self, layer: nn.Layer, **extra_weight_attrs):
                     default_initializer=paddle.nn.initializer.Constant(0),
                 ),
             )
-            extra_weight_attrs["weight_need_transpose"] = not extra_weight_attrs.get("model_format") == "torch"
+            # extra_weight_attrs["weight_need_transpose"] = not extra_weight_attrs.get("model_format") == "torch"
             moe_extra_weight_attrs = {**extra_weight_attrs, "SHARD_ID_TO_SHARDED_DIM": {"gate": 0, "down": 1, "up": 0}}
             set_weight_attrs(layer.up_gate_proj_weight, moe_extra_weight_attrs)
             set_weight_attrs(layer.down_proj_weight, moe_extra_weight_attrs)
@@ -337,69 +368,71 @@ def create_weights(self, layer: nn.Layer, **extra_weight_attrs):
             set_weight_attrs(layer.down_proj_weight_scale, scale_extra_weight_attrs)
 
     def process_weights_after_loading(self, layer):
-        """ """
-        if not self.quant_config.is_checkpoint_bf16:
-            return
-        weight_id_map = {"gate_up": 0, "down": 1}
-        if (
-            hasattr(layer.up_gate_proj_weight, "tensor_track")
-            and layer.up_gate_proj_weight.tensor_track is not None
-            and layer.up_gate_proj_weight.tensor_track.is_fully_copied()
-        ):
-            weight_type = "gate_up"
-        else:
-            weight_type = "down"
-
-        # 1.init shape and type
-        # weight
-        weight_name = self.added_weight_attrs[weight_id_map[weight_type]]
-        unquantized_weight_name = weight_name.replace("quant_weight", "weight")
-        weight_shape = self.up_gate_proj_weight_shape if weight_type == "gate_up" else self.down_proj_weight_shape
-        weight_shape[1], weight_shape[2] = weight_shape[2], weight_shape[1]
-        weight_dtype = "int8"
-        # scale
-        scale_name = self.added_scale_attrs[weight_id_map[weight_type]]
-        scale_shape = self.up_gate_proj_scale_shape if weight_type == "gate_up" else self.down_proj_scale_shape
-        scale_dtype = self.default_dtype
-
-        # 2.crate tmp tensor
-
-        weight = paddle.empty(weight_shape, dtype=weight_dtype)
-        scale = paddle.empty(scale_shape, dtype=scale_dtype)
-
-        # 3.quantize weight
-
-        for expert_id in range(layer.num_local_experts):
-            weight[expert_id], scale[expert_id] = weight_quantize(
-                getattr(layer, unquantized_weight_name)[expert_id],
-                algo=self.moe_quant_type,
-                arch=self.weight_only_linear_arch,
-            )
+        def _process_quantize(weight_idx):
+            # 1.init shape and type
+            weight_name = self.added_weight_attrs[weight_idx]
+            unquantized_weight_name = weight_name.replace("quant_weight", "weight")
+            weight_shape = self.up_gate_proj_weight_shape if weight_type == "gate_up" else self.down_proj_weight_shape
+            transposed_weight_shape = [weight_shape[0], weight_shape[2], weight_shape[1]]
+            weight_dtype = "int8"
+            # scale
+            scale_name = self.added_scale_attrs[weight_idx]
+            scale_shape = self.up_gate_proj_scale_shape if weight_type == "gate_up" else self.down_proj_scale_shape
+            scale_dtype = self.default_dtype
+
+            # 2.create tmp tensor
+
+            weight = paddle.empty(transposed_weight_shape, dtype=weight_dtype)
+            scale = paddle.empty(scale_shape, dtype=scale_dtype)
+
+            # 3.quantize weight
+
+            for expert_id in range(layer.num_local_experts):
+                weight[expert_id], scale[expert_id] = weight_quantize(
+                    getattr(layer, unquantized_weight_name)[expert_id],
+                    algo=self.moe_quant_type,
+                    arch=self.weight_only_linear_arch,
+                )
 
-        free_tensor(getattr(layer, unquantized_weight_name))
+            free_tensor(getattr(layer, unquantized_weight_name))
 
-        # create weight
-        setattr(
-            layer,
-            weight_name,
-            layer.create_parameter(
-                shape=weight_shape,
-                dtype=weight_dtype,
-                default_initializer=paddle.nn.initializer.Constant(0),
-            ),
-        )
-        # create scale
-        setattr(
-            layer,
-            scale_name,
-            layer.create_parameter(
-                shape=scale_shape,
-                dtype=scale_dtype,
-                default_initializer=paddle.nn.initializer.Constant(0),
-            ),
-        )
-        getattr(layer, weight_name).copy_(weight, False)
-        getattr(layer, scale_name).copy_(scale, False)
+            setattr(
+                layer,
+                weight_name,
+                layer.create_parameter(
+                    shape=weight_shape,
+                    dtype=weight_dtype,
+                    default_initializer=paddle.nn.initializer.Constant(0),
+                ),
+            )
+            # create scale
+            setattr(
+                layer,
+                scale_name,
+                layer.create_parameter(
+                    shape=scale_shape,
+                    dtype=scale_dtype,
+                    default_initializer=paddle.nn.initializer.Constant(0),
+                ),
+            )
+            getattr(layer, weight_name).copy_(weight.transpose([0, 2, 1]), False)
+            getattr(layer, scale_name).copy_(scale, False)
+
+        if self.quant_config.is_checkpoint_bf16:
+            weight_id_map = {"gate_up": 0, "down": 1}
+            if weight_fully_copied(layer.up_gate_proj_weight):
+                weight_type = "gate_up"
+            else:
+                weight_type = "down"
+
+            if self.model_format == "torch":
+                unquantized_weight_name = self.added_weight_attrs[weight_id_map[weight_type]].replace(
+                    "quant_weight", "weight"
+                )
+                process_weight_transpose(layer, unquantized_weight_name)
+            _process_quantize(weight_id_map[weight_type])
+        else:
+            return
 
     def process_loaded_weights(self, layer: nn.Layer, state_dict):
         """
diff --git a/fastdeploy/model_executor/layers/moe/moe.py b/fastdeploy/model_executor/layers/moe/moe.py
@@ -62,7 +62,7 @@ def get_moe_method():
             MetaxCutlassWeightOnlyMoEMethod,
         )
 
-        return MetaxCutlassWeightOnlyMoEMethod(None)
+        return MetaxCutlassWeightOnlyMoEMethod
     raise NotImplementedError
 
 
@@ -227,7 +227,7 @@ def weight_loader(
             return
         if hasattr(param, "SHARD_ID_TO_SHARDED_DIM"):
             SHARD_ID_TO_SHARDED_DIM = param.SHARD_ID_TO_SHARDED_DIM
-        elif current_platform.is_cuda() or current_platform.is_iluvatar():
+        elif current_platform.is_cuda() or current_platform.is_iluvatar() or current_platform.is_maca():
             SHARD_ID_TO_SHARDED_DIM = {"gate": 1, "down": 0, "up": 1}
         else:
             SHARD_ID_TO_SHARDED_DIM = {"gate": 0, "down": 1, "up": 0}