|
22 | 22 | from fastdeploy.model_executor.layers.moe.moe import get_moe_scores |
23 | 23 | from fastdeploy.model_executor.layers.quantization.quant_base import QuantMethodBase |
24 | 24 | from fastdeploy.model_executor.ops.gpu import tritonmoe_preprocess |
25 | | -from fastdeploy.model_executor.utils import TensorTracker, set_weight_attrs |
| 25 | +from fastdeploy.model_executor.utils import ( |
| 26 | + TensorTracker, |
| 27 | + free_tensor, |
| 28 | + process_weight_transpose, |
| 29 | + set_weight_attrs, |
| 30 | + weight_fully_copied, |
| 31 | +) |
26 | 32 | from fastdeploy.utils import ceil_div |
27 | 33 |
|
28 | 34 | from .triton_moe_kernels import fused_moe_kernel_paddle |
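For context, a minimal sketch of what the newly imported helpers are assumed to do, inferred from the inline code replaced in the hunks below; the actual implementations in `fastdeploy.model_executor.utils` may differ.

```python
# Hypothetical sketches only, inferred from the inline checks this diff removes;
# the real helpers live in fastdeploy.model_executor.utils.


def weight_fully_copied(param) -> bool:
    # Mirrors the old inline check: a weight is ready once its TensorTracker
    # reports that every shard has been written into the parameter.
    tracker = getattr(param, "tensor_track", None)
    return tracker is not None and tracker.is_fully_copied()


def free_tensor(tensor) -> None:
    # Mirrors the old `weight.value().get_tensor()._clear()` call: release the
    # bf16 storage before the parameter is re-created in int8.
    tensor.value().get_tensor()._clear()
```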
@@ -69,32 +75,50 @@ def create_weights(self, layer: nn.Layer, **extra_weight_attrs): |
69 | 75 | ] |
70 | 76 | # TODO(bukejiyu): remove v1 loader check when v0 loader is removed |
71 | 77 | if self.quant_config.is_checkpoint_bf16 and layer.fd_config.load_config.load_choices == "default_v1": |
| 78 | + if self.model_format != "torch": |
| 79 | + up_gate_proj_weight_shape = self.up_gate_proj_weight_shape |
| 80 | + down_proj_weight_shape = self.down_proj_weight_shape |
| 81 | + up_gate_proj_attrs = { |
| 82 | + **extra_weight_attrs, |
| 83 | + "tensor_track": TensorTracker(shape=up_gate_proj_weight_shape, output_dim=True), |
| 84 | + } |
| 85 | + down_proj_attrs = { |
| 86 | + **extra_weight_attrs, |
| 87 | + "tensor_track": TensorTracker(shape=down_proj_weight_shape, output_dim=False), |
| 88 | + } |
| 89 | + else: |
| 90 | + up_gate_proj_weight_shape = self.up_gate_proj_weight_shape[::-1] |
| 91 | + down_proj_weight_shape = self.down_proj_weight_shape[::-1] |
| 92 | + up_gate_proj_attrs = { |
| 93 | + **extra_weight_attrs, |
| 94 | + "tensor_track": TensorTracker(shape=up_gate_proj_weight_shape, output_dim=False), |
| 95 | + "SHARD_ID_TO_SHARDED_DIM": {"gate": 0, "down": 1, "up": 0}, |
| 96 | + } |
| 97 | + down_proj_attrs = { |
| 98 | + **extra_weight_attrs, |
| 99 | + "tensor_track": TensorTracker(shape=down_proj_weight_shape, output_dim=True), |
| 100 | + "SHARD_ID_TO_SHARDED_DIM": {"gate": 0, "down": 1, "up": 0}, |
| 101 | + } |
| 102 | + |
72 | 103 | layer.up_gate_proj_weight = layer.create_parameter( |
73 | | - shape=self.up_gate_proj_weight_shape, |
| 104 | + shape=up_gate_proj_weight_shape, |
74 | 105 | dtype=layer.weight_dtype, |
75 | 106 | default_initializer=paddle.nn.initializer.Constant(0), |
76 | 107 | ) |
77 | 108 |
|
78 | 109 | layer.down_proj_weight = layer.create_parameter( |
79 | | - shape=self.down_proj_weight_shape, |
| 110 | + shape=down_proj_weight_shape, |
80 | 111 | dtype=layer.weight_dtype, |
81 | 112 | default_initializer=paddle.nn.initializer.Constant(0), |
82 | 113 | ) |
83 | | - extra_weight_attrs["weight_need_transpose"] = extra_weight_attrs.get("model_format") == "torch" |
84 | 114 |
|
85 | 115 | set_weight_attrs( |
86 | 116 | layer.up_gate_proj_weight, |
87 | | - { |
88 | | - **extra_weight_attrs, |
89 | | - "tensor_track": TensorTracker(shape=layer.up_gate_proj_weight.shape, output_dim=True), |
90 | | - }, |
| 117 | + up_gate_proj_attrs, |
91 | 118 | ) |
92 | 119 | set_weight_attrs( |
93 | 120 | layer.down_proj_weight, |
94 | | - { |
95 | | - **extra_weight_attrs, |
96 | | - "tensor_track": TensorTracker(shape=layer.down_proj_weight.shape, output_dim=False), |
97 | | - }, |
| 121 | + down_proj_attrs, |
98 | 122 | ) |
99 | 123 | else: |
100 | 124 | setattr( |
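A self-contained illustration of the branch above, with hypothetical sizes: torch-format checkpoints store the expert weights transposed relative to the paddle layout, so the parameter is created with the reversed shape and the `output_dim` flag inverted; the transpose back to the native layout is deferred to `process_weights_after_loading`.

```python
# Toy illustration of the layout flip; the sizes are made up, not taken from a model.
num_experts, hidden_size, moe_intermediate_size = 8, 1024, 2048
up_gate_proj_weight_shape = [num_experts, hidden_size, moe_intermediate_size * 2]

for model_format in ("paddle", "torch"):
    if model_format != "torch":
        shape, output_dim = up_gate_proj_weight_shape, True
    else:
        # torch checkpoints hold the transposed layout, so allocate the
        # reversed shape and mark the output dim accordingly.
        shape, output_dim = up_gate_proj_weight_shape[::-1], False
    print(model_format, shape, output_dim)
# paddle [8, 1024, 4096] True
# torch [4096, 1024, 8] False
```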
@@ -181,59 +205,64 @@ def process_loaded_weights(self, layer: nn.Layer, state_dict): |
181 | 205 | @paddle.no_grad() |
182 | 206 | def process_weights_after_loading(self, layer): |
183 | 207 | """ """ |
184 | | - if not self.quant_config.is_checkpoint_bf16: |
185 | | - return |
| 208 | + |
| 209 | + def _process_quantize(weight_idx): |
| 210 | + max_bound = 127 |
| 211 | + # name of the int8 weight parameter to (re)create
| 212 | + weight_name = self.added_weight_attrs[weight_idx]
| 213 | + # name of the matching per-channel scale parameter
| 214 | + scale_name = self.added_scale_attrs[weight_idx]
| 215 | + |
| 216 | + weight_tensor = getattr(layer, weight_name) |
| 217 | + quanted_weight_scale = weight_tensor.abs().max(axis=1) |
| 218 | + quanted_weight = weight_tensor / quanted_weight_scale[:, None, :] * max_bound |
| 219 | + quanted_weight = paddle.round(quanted_weight).astype("int8") |
| 220 | + quanted_weight_scale = quanted_weight_scale / max_bound |
| 221 | + |
| 222 | + free_tensor(getattr(layer, weight_name)) |
| 223 | + |
| 224 | + # create weight |
| 225 | + setattr( |
| 226 | + layer, |
| 227 | + weight_name, |
| 228 | + layer.create_parameter( |
| 229 | + shape=weight_tensor.shape, |
| 230 | + dtype=quanted_weight.dtype, |
| 231 | + default_initializer=paddle.nn.initializer.Constant(0), |
| 232 | + ), |
| 233 | + ) |
| 234 | + # create scale |
| 235 | + setattr( |
| 236 | + layer, |
| 237 | + scale_name, |
| 238 | + layer.create_parameter( |
| 239 | + shape=quanted_weight_scale.shape, |
| 240 | + dtype=quanted_weight_scale.dtype, |
| 241 | + default_initializer=paddle.nn.initializer.Constant(0), |
| 242 | + ), |
| 243 | + ) |
| 244 | + getattr(layer, weight_name).copy_(quanted_weight, False) |
| 245 | + getattr(layer, scale_name).copy_(quanted_weight_scale, False) |
186 | 246 |
|
187 | 247 | algo = layer.quant_method.quant_config.name() |
188 | 248 | assert algo == "wint8" |
189 | | - max_bound = 127 |
190 | | - weight_id_map = {"gate_up": 0, "down": 1} |
191 | | - if ( |
192 | | - hasattr(layer.up_gate_proj_weight, "tensor_track") |
193 | | - and layer.up_gate_proj_weight.tensor_track is not None |
194 | | - and layer.up_gate_proj_weight.tensor_track.is_fully_copied() |
195 | | - ): |
196 | | - weight_type = "gate_up" |
197 | | - layer.up_gate_proj_weight.tensor_track = None |
| 249 | + if self.quant_config.is_checkpoint_bf16: |
| 250 | + |
| 251 | + weight_id_map = {"gate_up": 0, "down": 1} |
| 252 | + if weight_fully_copied(layer.up_gate_proj_weight): |
| 253 | + weight_type = "gate_up" |
| 254 | + else: |
| 255 | + weight_type = "down" |
| 256 | + |
| 257 | + if self.model_format == "torch": |
| 258 | + # torch ("pt") checkpoints store transposed weights; restore the native layout before quantizing
| 259 | + unquantized_weight_name = self.added_weight_attrs[weight_id_map[weight_type]].replace( |
| 260 | + "quant_weight", "weight" |
| 261 | + ) |
| 262 | + process_weight_transpose(layer, unquantized_weight_name) |
| 263 | + _process_quantize(weight_id_map[weight_type]) |
198 | 264 | else: |
199 | | - weight_type = "down" |
200 | | - layer.down_proj_weight.tensor_track = None |
201 | | - |
202 | | - # weight |
203 | | - weight_name = self.added_weight_attrs[weight_id_map[weight_type]] |
204 | | - # scale |
205 | | - scale_name = self.added_scale_attrs[weight_id_map[weight_type]] |
206 | | - |
207 | | - weight_tensor = getattr(layer, weight_name) |
208 | | - quanted_weight_scale = weight_tensor.abs().max(axis=1) |
209 | | - quanted_weight = weight_tensor / quanted_weight_scale[:, None, :] * max_bound |
210 | | - quanted_weight = paddle.round(quanted_weight).astype("int8") |
211 | | - quanted_weight_scale = quanted_weight_scale / max_bound |
212 | | - |
213 | | - getattr(layer, weight_name).value().get_tensor()._clear() |
214 | | - |
215 | | - # create weight |
216 | | - setattr( |
217 | | - layer, |
218 | | - weight_name, |
219 | | - layer.create_parameter( |
220 | | - shape=weight_tensor.shape, |
221 | | - dtype=quanted_weight.dtype, |
222 | | - default_initializer=paddle.nn.initializer.Constant(0), |
223 | | - ), |
224 | | - ) |
225 | | - # create scale |
226 | | - setattr( |
227 | | - layer, |
228 | | - scale_name, |
229 | | - layer.create_parameter( |
230 | | - shape=quanted_weight_scale.shape, |
231 | | - dtype=quanted_weight_scale.dtype, |
232 | | - default_initializer=paddle.nn.initializer.Constant(0), |
233 | | - ), |
234 | | - ) |
235 | | - getattr(layer, weight_name).copy_(quanted_weight, False) |
236 | | - getattr(layer, scale_name).copy_(quanted_weight_scale, False) |
| 265 | + return |
237 | 266 |
|
238 | 267 | @paddle.no_grad() |
239 | 268 | def apply( |
|
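For clarity, a standalone sketch of the per-channel wint8 quantization performed by `_process_quantize`, on toy shapes rather than the real per-expert weights; it mirrors the math in the hunk above, including storing the dequantization scale already divided by 127.

```python
import paddle

max_bound = 127
# Toy tensor standing in for one stack of expert weights: [experts, in_dim, out_dim].
weight = paddle.rand([2, 16, 8], dtype="float32") - 0.5

# One scale per expert and output channel, taken over the reduction dimension (axis=1).
scale = weight.abs().max(axis=1)
quant = paddle.round(weight / scale[:, None, :] * max_bound).astype("int8")
scale = scale / max_bound  # the stored scale already includes the 1/127 factor

# Round trip: dequantized values should be close to the original weights.
recovered = quant.astype("float32") * scale[:, None, :]
print(float((recovered - weight).abs().max()))  # small quantization error
```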