@@ -63,14 +63,11 @@ def __init__(
         *,
         return_bias: bool = True,
     ):
-        self.comm_group = None
-        if prefix.find("gate_up_proj") != -1 and mlp_tp_enable():
-            self.comm_group = get_mlp_tp_group()
-        else:
+        # If `tp_size` is already set, this layer has been customized by a subclass.
+        if not hasattr(self, "tp_size"):
             self.comm_group = get_tp_group()
-
-        self.tp_size = self.comm_group.world_size
-        self.tp_rank = self.comm_group.rank_in_group
+            self.tp_size = self.comm_group.world_size
+            self.tp_rank = self.comm_group.rank_in_group
 
         self.input_size_per_partition = input_size
         self.output_size_per_partition = divide(output_size, self.tp_size)
@@ -81,6 +78,8 @@ def __init__(
             divide(output_size, self.tp_size)
             for output_size in self.output_sizes
         ]
+        # Skip ColumnParallelLinear.__init__, as it would create the weight_loader with the default tp group;
+        # the weight_loader is created with the customized comm group instead.
         AscendLinearBase.__init__(self,
                                   input_size,
                                   output_size,
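To make the guard in this hunk concrete, here is a minimal, self-contained sketch of the `hasattr(self, "tp_size")` pattern: a subclass installs its own comm group before delegating, and the base constructor only falls back to the default TP group when nothing was pre-set. `CommGroup`, `Base`, and `CustomChild` are illustrative stand-ins, not the real vllm-ascend classes.

```python
class CommGroup:
    """Illustrative stand-in for a communication group."""

    def __init__(self, world_size: int, rank_in_group: int):
        self.world_size = world_size
        self.rank_in_group = rank_in_group


DEFAULT_TP_GROUP = CommGroup(world_size=8, rank_in_group=0)


class Base:
    def __init__(self):
        # Fall back to the default TP group only when a subclass has not
        # already installed its own comm group / tp_size / tp_rank.
        if not hasattr(self, "tp_size"):
            self.comm_group = DEFAULT_TP_GROUP
            self.tp_size = self.comm_group.world_size
            self.tp_rank = self.comm_group.rank_in_group


class CustomChild(Base):
    def __init__(self, comm_group: CommGroup):
        # Install the custom group *before* delegating, so the guard in
        # Base.__init__ leaves it untouched.
        self.comm_group = comm_group
        self.tp_size = comm_group.world_size
        self.tp_rank = comm_group.rank_in_group
        super().__init__()


child = CustomChild(CommGroup(world_size=4, rank_in_group=1))
assert child.tp_size == 4 and child.tp_rank == 1
```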
@@ -138,32 +137,35 @@ def __init__(
         *,
         return_bias: bool = True,
     ):
-        if prefix.find("down_proj") != -1 and mlp_tp_enable():
-            comm_group = get_mlp_tp_group()
-            self.forward_type = "mlp_tp"
-        elif prefix.find("o_proj") != -1 and oproj_tp_enable():
-            comm_group = get_otp_group()
-            self.forward_type = "oproj_tp"
-        elif matmul_allreduce_enable():
-            comm_group = get_tp_group()
-            self.forward_type = "matmul_allreduce"
-            self.hcomm_info = self.get_hcomm_info(comm_group.device_group)
-        elif dense_optim_enable():
-            comm_group = get_tp_group()
-            self.forward_type = "dense_optim"
-        else:
-            comm_group = get_tp_group()
-            self.forward_type = "normal"
-        self.comm_group = comm_group
-
-        self.tp_size = self.comm_group.world_size
-        self.tp_rank = self.comm_group.rank_in_group
+        # If `tp_size` is already set, this layer has been customized by a subclass.
+        if not hasattr(self, "tp_size"):
+            if prefix.find("down_proj") != -1 and mlp_tp_enable():
+                comm_group = get_mlp_tp_group()
+                self.forward_type = "mlp_tp"
+            elif prefix.find("o_proj") != -1 and oproj_tp_enable():
+                comm_group = get_otp_group()
+                self.forward_type = "oproj_tp"
+            elif matmul_allreduce_enable():
+                comm_group = get_tp_group()
+                self.forward_type = "matmul_allreduce"
+                self.hcomm_info = self.get_hcomm_info(comm_group.device_group)
+            elif dense_optim_enable():
+                comm_group = get_tp_group()
+                self.forward_type = "dense_optim"
+            else:
+                comm_group = get_tp_group()
+                self.forward_type = "normal"
+            self.comm_group = comm_group
+
+            self.tp_size = self.comm_group.world_size
+            self.tp_rank = self.comm_group.rank_in_group
 
         # Divide the weight matrix along the first dimension.
         self.input_size_per_partition = divide(input_size, self.tp_size)
         self.output_size_per_partition = output_size
         self.output_partition_sizes = [output_size]
 
+        # Skip RowParallelLinear.__init__, as it would create the weight_loader with the default tp group.
         AscendLinearBase.__init__(self,
                                   input_size,
                                   output_size,
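The row-parallel branch keeps the same prefix-driven dispatch, now nested under the `hasattr` guard. Below is a hedged sketch of that selection logic, with the `*_enable()` switches reduced to plain booleans and the helper name `select_forward_type` invented purely for illustration:

```python
def select_forward_type(prefix: str,
                        mlp_tp: bool = False,
                        oproj_tp: bool = False,
                        matmul_allreduce: bool = False,
                        dense_optim: bool = False) -> str:
    # Mirrors the if/elif chain above: the layer's weight-name prefix and the
    # enabled optimizations decide which communication path forward() takes.
    if "down_proj" in prefix and mlp_tp:
        return "mlp_tp"
    if "o_proj" in prefix and oproj_tp:
        return "oproj_tp"
    if matmul_allreduce:
        return "matmul_allreduce"
    if dense_optim:
        return "dense_optim"
    return "normal"


assert select_forward_type("model.layers.0.mlp.down_proj", mlp_tp=True) == "mlp_tp"
assert select_forward_type("model.layers.0.self_attn.o_proj", oproj_tp=True) == "oproj_tp"
assert select_forward_type("model.layers.0.self_attn.o_proj") == "normal"
```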
@@ -368,7 +370,6 @@ def _forward_dense_optim(
             return output
         return output, output_bias
 
-
 class AscendMergedColumnParallelLinear(MergedColumnParallelLinear):
     """Packed linear layers with column parallelism.
 
@@ -526,7 +527,7 @@ def __init__(
         self.output_sizes = [
             self.num_heads * self.head_size * tp_size,  # q_proj
             self.num_kv_heads * self.head_size * tp_size,  # k_proj
-            self.num_kv_heads * self.head_size * tp_size,  # v_proj
+            self.num_kv_heads * self.head_size * tp_size,  # v_proj
         ]
         AscendColumnParallelLinear.__init__(self,
                                             input_size=input_size,
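For context on the `output_sizes` list touched above (a whitespace-only change), here is a quick worked example of the arithmetic with made-up per-rank shapes; the numbers are illustrative only, not taken from any particular model:

```python
# Assumed per-rank counts: num_heads=32, num_kv_heads=8, head_size=128, tp_size=4.
num_heads, num_kv_heads, head_size, tp_size = 32, 8, 128, 4
output_sizes = [
    num_heads * head_size * tp_size,     # q_proj -> 16384
    num_kv_heads * head_size * tp_size,  # k_proj -> 4096
    num_kv_heads * head_size * tp_size,  # v_proj -> 4096
]
assert output_sizes == [16384, 4096, 4096]
```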
@@ -593,22 +594,15 @@ def __init__(
         return_bias: bool = True,
         disable_tp: bool = False,
     ):
-        nn.Module.__init__(self)
-
-        # Keep input parameters
-        self.input_size = input_size
-        self.output_size = output_size
-        self.skip_bias_add = skip_bias_add
-        if params_dtype is None:
-            params_dtype = torch.get_default_dtype()
-        self.params_dtype = params_dtype
-        self.quant_config = quant_config
-        self.prefix = prefix
-        if quant_config is None:
-            self.quant_method: Optional[
-                QuantizeMethodBase] = UnquantizedLinearMethod()
+        if hasattr(self, "tp_rank") and hasattr(self, "tp_size"):
+            tp_rank = self.tp_rank
+            tp_size = self.tp_size
+            super().__init__(input_size, output_size, skip_bias_add,
+                             params_dtype, quant_config, prefix,
+                             return_bias, disable_tp)
+            self.tp_rank = tp_rank
+            self.tp_size = tp_size
         else:
-            self.quant_method = quant_config.get_quant_method(self,
-                                                              prefix=prefix)
-        self.return_bias = return_bias
-        self.disable_tp = disable_tp
+            super().__init__(input_size, output_size, skip_bias_add,
+                             params_dtype, quant_config, prefix,
+                             return_bias, disable_tp)
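This final hunk replaces the hand-copied base-class body with a call to `super().__init__`, stashing any subclass-customized `tp_rank`/`tp_size` around it because the parent constructor would otherwise overwrite them with the default TP layout. A self-contained sketch of that save/restore pattern, with `Parent`, `Child`, and `CustomTP` as illustrative stand-ins rather than the real classes:

```python
class Parent:
    def __init__(self):
        # The upstream base class always installs the default TP layout.
        self.tp_size = 8
        self.tp_rank = 0


class Child(Parent):
    def __init__(self):
        if hasattr(self, "tp_rank") and hasattr(self, "tp_size"):
            # A subclass already customized the layout: keep it across the
            # parent constructor, which would otherwise clobber it.
            tp_rank, tp_size = self.tp_rank, self.tp_size
            super().__init__()
            self.tp_rank, self.tp_size = tp_rank, tp_size
        else:
            super().__init__()


class CustomTP(Child):
    def __init__(self):
        self.tp_size, self.tp_rank = 4, 1  # custom comm-group layout
        super().__init__()


assert CustomTP().tp_size == 4
assert Child().tp_size == 8
```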