 from fastdeploy.model_executor.layers.quantization.quant_base import QuantMethodBase
 from fastdeploy.model_executor.utils import (
     default_weight_loader,
+    process_weight_transpose,
     set_weight_attrs,
     slice_fn,
 )
@@ -43,24 +44,36 @@ def create_weights(self, layer: nn.Layer, **extra_weight_attrs):
         - output_dim: determines whether the split is applied along the output dimension (rows) or input dimension (columns)
         - weight_loader: a callable or method responsible for loading the weight data
         """
+        self.model_format = extra_weight_attrs.get("model_format")
+        self.weight_shape = (
+            layer.weight_shape[::-1] if extra_weight_attrs.get("model_format") == "torch" else layer.weight_shape
+        )
+
         layer.weight = layer.create_parameter(
-            shape=layer.weight_shape,
+            shape=self.weight_shape,
             dtype=layer.weight_dtype,
             is_bias=False,
             default_initializer=paddle.nn.initializer.Constant(0),
         )
         split_axis = extra_weight_attrs.get("split_axis")
         if hasattr(layer, "nranks") and layer.nranks > 0:
             _set_var_distributed(layer.weight, split_axis=split_axis)
+
+        if self.model_format == "torch" and "output_dim" in extra_weight_attrs:
+            extra_weight_attrs["output_dim"] = not extra_weight_attrs["output_dim"]
+
         set_weight_attrs(
             layer.weight,
             {
                 **extra_weight_attrs,
                 "weight_loader": extra_weight_attrs.get("weight_loader", default_weight_loader(layer.fd_config)),
-                "weight_need_transpose": extra_weight_attrs.get("model_format") == "torch",
             },
         )
 
+    def process_weights_after_loading(self, layer):
+        if self.model_format == "torch":
+            process_weight_transpose(layer, "weight")
+
     def process_loaded_weights(self, layer, weights) -> None:
         # mlp.gate.weight is precision-sensitive, so we cast it to float32 for computation
         if layer.weight.dtype != weights.dtype:
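Note on the torch-format handling above: torch checkpoints store Linear weights as [out_features, in_features], while the Paddle-side layout used here is [in_features, out_features]. The parameter is therefore declared in the reversed (checkpoint) shape, the output_dim attribute is flipped so sharding slices the correct axis of that layout, and the actual transpose is deferred to process_weights_after_loading instead of tagging the weight with weight_need_transpose. A minimal standalone sketch of that flow, using NumPy stand-ins rather than the real FastDeploy layer objects:

# Standalone sketch (NumPy stand-ins, not the real FastDeploy layer objects).
import numpy as np

def declared_shape(paddle_shape, model_format):
    # torch checkpoints store Linear weights as [out_features, in_features], so for
    # model_format == "torch" the parameter is created in the reversed shape and the
    # checkpoint tensor can be copied in without an immediate transpose.
    return paddle_shape[::-1] if model_format == "torch" else paddle_shape

def transpose_after_loading(weight, model_format):
    # Mirrors the role of process_weights_after_loading: once loading is finished,
    # flip the torch-layout weight back into the [in_features, out_features] layout.
    return weight.T if model_format == "torch" else weight

paddle_shape = [4096, 11008]                              # [in_features, out_features]
ckpt = np.zeros(declared_shape(paddle_shape, "torch"))    # shape (11008, 4096)
assert transpose_after_loading(ckpt, "torch").shape == tuple(paddle_shape)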
@@ -165,7 +178,7 @@ def __init__(
         if self.with_bias:
             self.bias = self.create_parameter(
                 shape=[self.output_size],
-                dtype=self._dtype,
+                dtype=self.weight_dtype,
                 is_bias=True,
             )
             setattr(
@@ -262,6 +275,7 @@ def __init__(
         skip_quant: bool = False,
         weight_dtype: str = "",
         weight_key: str = "",
+        model_format: Optional[str] = None,
     ):
         """
         Initializes a replicated linear layer.
@@ -296,7 +310,7 @@ def __init__(
             weight_loader=(
                 self.weight_loader if hasattr(self, "weight_loader") else default_weight_loader(self.fd_config)
             ),
-            model_format=fd_config.model_config.model_format,
+            model_format=fd_config.model_config.model_format if model_format is None else model_format,
         )
 
 
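The new model_format argument gives ReplicatedLinear a per-layer override of the checkpoint format; leaving it as None preserves the previous behaviour of inheriting fd_config.model_config.model_format. A tiny sketch of the fallback rule (the helper name is illustrative, not part of FastDeploy):

from typing import Optional

def resolve_model_format(config_format: str, model_format: Optional[str] = None) -> str:
    # A per-layer override wins; None falls back to the globally configured format.
    return config_format if model_format is None else model_format

assert resolve_model_format("torch") == "torch"
assert resolve_model_format("torch", model_format="paddle") == "paddle"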
@@ -344,7 +358,6 @@ def __init__(
 
     def weight_loader(self, param, loaded_weight, loaded_shard_id: Optional[str] = None):
         weight_need_transpose = getattr(param, "weight_need_transpose", False)
-        loaded_weight = get_tensor(loaded_weight)
 
         if weight_need_transpose:
             loaded_weight = loaded_weight.transpose([1, 0])
@@ -393,7 +406,7 @@ def __init__(
         with_bias: bool = False,
         add_bias: bool = False,
         skip_quant: bool = False,
-        weight_dtype="",
+        weight_dtype: str = "",
     ):
         """
         Initializes a linear layer and provides additional parameters required for inference and quantization.
@@ -500,7 +513,6 @@ def weight_loader(self, param, loaded_weight, loaded_shard_id: Optional[str] = N
         output_size = param.shape[shard_dim]
         if loaded_shard_id is None:
             if weight_need_transpose:
-                loaded_weight = get_tensor(loaded_weight)
                 loaded_weight = loaded_weight.transpose([1, 0])
                 # Avoid redundant transpose of fused weights when weight_loader is called iteratively
                 param.weight_need_transpose = False
@@ -519,7 +531,6 @@ def weight_loader(self, param, loaded_weight, loaded_shard_id: Optional[str] = N
             # split gate up
             assert loaded_shard_id in ["gate", "up"]
             if weight_need_transpose:
-                loaded_weight = get_tensor(loaded_weight)
                 loaded_weight = loaded_weight.transpose([1, 0])
             # Tensor parallelism splits the weight along the output_dim
             if self.nranks != 1:
@@ -532,7 +543,6 @@ def weight_loader(self, param, loaded_weight, loaded_shard_id: Optional[str] = N
                 shard_offset = self.local_rank * block_size
                 shard_size = (self.local_rank + 1) * block_size
                 loaded_weight = slice_fn(loaded_weight, output_dim, start=shard_offset, end=shard_size)
-            loaded_weight = get_tensor(loaded_weight)
             if not param._is_initialized():
                 param.initialize()
             param_shard_size = output_size // 2
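For the gate/up branch above, each rank slices its own block out of the full gate or up projection before copying it into its half of the fused local parameter. A worked example of the offset arithmetic; block_size is computed just above the shown hunk, so treating it as the full projection width divided by nranks is an assumption:

# Pure-Python worked example; sizes are illustrative.
nranks, local_rank = 4, 1
full_gate_out = 11008                       # output width of the unsharded gate projection
block_size = full_gate_out // nranks        # assumed formula; computed above the shown hunk
shard_offset = local_rank * block_size      # 2752, slice start
shard_size = (local_rank + 1) * block_size  # 5504, used directly as the slice end
# slice_fn(loaded_weight, output_dim, start=shard_offset, end=shard_size) would take
# columns [2752, 5504) of the full gate weight for this rank; the fused local parameter
# then holds gate in its first half and up in its second (param_shard_size = output_size // 2).
assert shard_size - shard_offset == block_size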
@@ -589,7 +599,19 @@ class QKVParallelLinear(ColumnParallelLinear):
     QKVParallelLinear Layer.
     """
 
-    def __init__(self, fd_config, prefix, with_bias=False, add_bias=True):
+    def __init__(
+        self,
+        fd_config,
+        prefix,
+        with_bias=False,
+        add_bias=True,
+        num_heads: Optional[int] = None,
+        kv_num_heads: Optional[int] = None,
+        hidden_size: Optional[int] = None,
+        head_dim: Optional[int] = None,
+        skip_quant: bool = False,
+        weight_dtype: str = "",
+    ):
         """
         Initialize the QKV Linear layer with given parameters.
 
@@ -599,11 +621,15 @@ def __init__(self, fd_config, prefix, with_bias=False, add_bias=True):
                 Can be arbitrarily named.
             with_bias (bool): Whether to include bias or not. Defaults to False.
             add_bias (bool): Whether to add bias in the current layer or in the pre/post layer. Defaults to True.
+            num_heads (Optional[int]): Number of attention heads in the model.
+            kv_num_heads (Optional[int]): Number of key/value heads, used for multi-query or grouped-query attention.
+            hidden_size (Optional[int]): Total hidden layer dimension, typically the embedding size.
+            head_dim (Optional[int]): Size of each attention head, usually computed as hidden_size divided by num_heads.
         """
-        self.num_heads = fd_config.model_config.num_attention_heads
-        self.kv_num_heads = fd_config.model_config.num_key_value_heads
-        self.hidden_size = fd_config.model_config.hidden_size
-        self.head_dim = fd_config.model_config.head_dim
+        self.num_heads = fd_config.model_config.num_attention_heads if num_heads is None else num_heads
+        self.kv_num_heads = fd_config.model_config.num_key_value_heads if kv_num_heads is None else kv_num_heads
+        self.hidden_size = fd_config.model_config.hidden_size if hidden_size is None else hidden_size
+        self.head_dim = fd_config.model_config.head_dim if head_dim is None else head_dim
         self.nranks = fd_config.parallel_config.tensor_parallel_size
         self.local_rank = fd_config.parallel_config.tensor_parallel_rank
         self.num_heads_per_rank = divide(self.num_heads, self.nranks)
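The new optional arguments let a caller build a QKVParallelLinear whose head geometry differs from the top-level model config (leaving them as None keeps the values from fd_config.model_config), while the per-rank division stays the same. A small worked example of the resulting sizes; the fused q/k/v output-width formula is the conventional layout and an assumption about code outside this hunk:

# Worked example of the per-rank head split; divide() is a stand-in for the helper
# used above, which requires exact divisibility.
def divide(numerator: int, denominator: int) -> int:
    assert numerator % denominator == 0
    return numerator // denominator

num_heads, kv_num_heads, head_dim, nranks = 32, 8, 128, 4
num_heads_per_rank = divide(num_heads, nranks)         # 8 query heads per rank
kv_num_heads_per_rank = divide(kv_num_heads, nranks)   # 2 key/value heads per rank
# Assumed fused-QKV width per rank: q, k and v concatenated along the output dim.
per_rank_output_size = (num_heads_per_rank + 2 * kv_num_heads_per_rank) * head_dim
assert per_rank_output_size == (8 + 2 * 2) * 128  # 1536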
@@ -623,6 +649,8 @@ def __init__(self, fd_config, prefix, with_bias=False, add_bias=True):
             output_size=output_size,
             with_bias=with_bias,
             add_bias=add_bias,
+            skip_quant=skip_quant,
+            weight_dtype=weight_dtype,
         )
 
     def _get_shard_size_mapping(self, loaded_shard_id: str, head_dim: int):
@@ -641,7 +669,6 @@ def weight_loader(self, param, loaded_weight, loaded_shard_id: Optional[str] = N
         weight_need_transpose = getattr(param, "weight_need_transpose", False)
         if loaded_shard_id is None:
             if weight_need_transpose:
-                loaded_weight = get_tensor(loaded_weight)
                 loaded_weight = loaded_weight.transpose([1, 0])
                 # Avoid redundant transpose of fused weights when weight_loader is called iteratively
                 param.weight_need_transpose = False
@@ -661,7 +688,6 @@ def weight_loader(self, param, loaded_weight, loaded_shard_id: Optional[str] = N
             # split q k v
             assert loaded_shard_id in ["q", "k", "v"]
             if weight_need_transpose:
-                loaded_weight = get_tensor(loaded_weight)
                 loaded_weight = loaded_weight.transpose([1, 0])
             # Tensor parallelism splits the weight along the output_dim
             if self.nranks != 1:
@@ -671,8 +697,6 @@ def weight_loader(self, param, loaded_weight, loaded_shard_id: Optional[str] = N
                 shard_size = block_size
                 loaded_weight = slice_fn(loaded_weight, output_dim, start=shard_offset, end=shard_offset + shard_size)
 
-            loaded_weight = get_tensor(loaded_weight)
-
             if not param._is_initialized():
                 param.initialize()
 
@@ -798,7 +822,7 @@ def __init__(
         add_bias: bool = False,
         reduce_results: bool = True,
         skip_quant: bool = False,
-        weight_dtype="",
+        weight_dtype: str = "",
     ):
         """
         Initialize a linear layer with additional parameters for inference and quantization.
@@ -847,10 +871,6 @@ def __init__(
             ),
             model_format=fd_config.model_config.model_format,
         )
-        if self.nranks > 0:
-            if self.with_bias:
-                # col parallel
-                _set_var_distributed(self.bias, split_axis=0)
 
         self.reduce_results = reduce_results
 