
Commit 4859f40

[Feature] GLM-45-AIR Support Mix Quantization(Dense wfp8afp8 and wint8 triton_moe_backend) (#4051)

1 parent 2056a42 commit 4859f40
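
This change threads a structured quantization config through the stack: the --quantization option now accepts a JSON object as well as a plain method name, so dense layers and MoE layers can be quantized differently. As a hedged sketch only (the exact key names are not shown in this diff and are illustrative assumptions), a mixed setup might be expressed like this:

    import json

    # Hypothetical mixed-quantization spec; the key names are assumptions,
    # not taken from this commit.
    quant_cfg = {
        "dense_quant_type": "wfp8afp8",  # FP8 weights/activations for dense layers
        "moe_quant_type": "wint8",       # INT8 weights for the MoE (triton backend)
    }
    # Passed to the worker the same way engine.py now does it:
    # json.dumps wrapped in single quotes.
    print(f"--quantization '{json.dumps(quant_cfg)}'")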

File tree

15 files changed (+302 -238 lines)


fastdeploy/config.py

Lines changed: 1 addition & 1 deletion
@@ -398,7 +398,7 @@ def __init__(
         # model for mtp/eagle/draft_model
         self.model: Optional[str] = None
         # quantization of model
-        self.quantization: Optional[str] = None
+        self.quantization: Optional[Dict[str, Any]] = None
         # allocate more blocks to prevent mtp from finishing the block earlier than the main model
         # Fixed now
         self.num_gpu_block_expand_ratio: Optional[float] = 1

fastdeploy/engine/args_utils.py

Lines changed: 3 additions & 2 deletions
@@ -40,6 +40,7 @@
     DeprecatedOptionWarning,
     FlexibleArgumentParser,
     is_port_available,
+    parse_quantization,
 )
 
 
@@ -137,7 +138,7 @@ class EngineArgs:
     """
     dynamic load weight strategy
     """
-    quantization: str = None
+    quantization: Optional[Dict[str, Any]] = None
     guided_decoding_backend: str = "off"
     """
     Guided decoding backend.
@@ -538,7 +539,7 @@ def add_cli_args(parser: FlexibleArgumentParser) -> FlexibleArgumentParser:
         )
         model_group.add_argument(
             "--quantization",
-            type=str,
+            type=parse_quantization,
             default=EngineArgs.quantization,
             help="Quantization name for the model, currently support "
             "'wint8', 'wint4',"
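
The body of parse_quantization is not part of this diff; a minimal sketch of what such a parser could look like, assuming it accepts either a bare method name ('wint8') or a JSON object, is:

    import json
    from typing import Any, Dict, Optional

    def parse_quantization(value: str) -> Optional[Dict[str, Any]]:
        # Illustrative sketch only; the real implementation is not shown in this diff.
        if value is None or value == "None":
            return None
        try:
            # Structured form, e.g. '{"dense_quant_type": "wfp8afp8", "moe_quant_type": "wint8"}'
            return json.loads(value)
        except json.JSONDecodeError:
            # Bare method name, e.g. 'wint8', normalized into dict form.
            return {"quantization": value}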

fastdeploy/engine/engine.py

Lines changed: 2 additions & 1 deletion
@@ -16,6 +16,7 @@
 
 from __future__ import annotations
 
+import json
 import multiprocessing
 import os
 import re
@@ -484,7 +485,7 @@ def _start_worker_service(self):
             f" --kv_cache_ratio {self.cfg.cache_config.kv_cache_ratio}"
             f" --expert_parallel_size {self.cfg.parallel_config.expert_parallel_size}"
             f" --data_parallel_size {self.cfg.parallel_config.data_parallel_size}"
-            f" --quantization {self.cfg.model_config.quantization}"
+            f" --quantization '{json.dumps(self.cfg.model_config.quantization)}'"
             f" --ori_vocab_size {ori_vocab_size}"
             f" --speculative_config '{self.cfg.speculative_config.to_json_string()}'"
             f" --graph_optimization_config '{self.cfg.graph_opt_config.to_json_string()}'"

fastdeploy/model_executor/layers/moe/ep.py

Lines changed: 1 addition & 30 deletions
@@ -28,38 +28,9 @@
 
 import fastdeploy
 from fastdeploy.config import MoEPhase
+from fastdeploy.model_executor.layers.moe.moe import get_moe_scores
 from fastdeploy.utils import singleton
 
-try:
-    from fastdeploy.model_executor.ops.gpu import noaux_tc
-except:
-    logger.warning("import noaux_tc Failed!")
-
-
-def get_moe_scores(
-    gating_output: paddle.Tensor,
-    n_group,
-    topk_group,
-    top_k,
-    routed_scaling_factor,
-    e_score_correction_bias,
-) -> paddle.Tensor:
-    """
-    compute moe scores using e_score_correction_bias.
-    """
-    scores = paddle.nn.functional.sigmoid(gating_output)
-    assert e_score_correction_bias is not None, "e_score_correction_bias is none!"
-    scores_with_bias = scores + e_score_correction_bias
-    scores, topk_values, topk_idx = noaux_tc(
-        scores,
-        scores_with_bias,
-        n_group if n_group > 0 else 1,
-        topk_group if topk_group > 0 else 1,
-        top_k,
-        routed_scaling_factor,
-    )
-    return scores, topk_values, topk_idx
-
 
 @singleton
 class DeepEPEngine:
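
This commit deduplicates three near-identical copies of get_moe_scores into fastdeploy/model_executor/layers/moe/moe.py. The new file itself is not shown in this diff; judging from the copy deleted here (the most defensive of the three), the shared helper presumably looks close to this:

    import paddle
    from fastdeploy.model_executor.ops.gpu import noaux_tc

    def get_moe_scores(
        gating_output: paddle.Tensor,
        n_group,
        topk_group,
        top_k,
        routed_scaling_factor,
        e_score_correction_bias,
    ) -> paddle.Tensor:
        """Compute MoE routing scores using e_score_correction_bias."""
        scores = paddle.nn.functional.sigmoid(gating_output)
        assert e_score_correction_bias is not None, "e_score_correction_bias is none!"
        scores_with_bias = scores + e_score_correction_bias
        scores, topk_values, topk_idx = noaux_tc(
            scores,
            scores_with_bias,
            n_group if n_group > 0 else 1,      # this copy guarded against
            topk_group if topk_group > 0 else 1,  # non-positive group counts
            top_k,
            routed_scaling_factor,
        )
        return scores, topk_values, topk_idx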

fastdeploy/model_executor/layers/moe/fused_moe_cutlass_backend.py

Lines changed: 2 additions & 30 deletions
@@ -27,11 +27,7 @@
 from .fused_moe_backend_base import UnquantizedFusedMoEMethod
 
 if current_platform.is_cuda():
-    from fastdeploy.model_executor.ops.gpu import (
-        moe_expert_dispatch,
-        moe_expert_reduce,
-        noaux_tc,
-    )
+    from fastdeploy.model_executor.ops.gpu import moe_expert_dispatch, moe_expert_reduce
 
 try:
     from fastdeploy.model_executor.ops.gpu import w4afp8_gemm_scale_permute
@@ -43,34 +39,10 @@
         moe_expert_reduce,
     )
 
+from fastdeploy.model_executor.layers.moe.moe import get_moe_scores
 from fastdeploy.model_executor.utils import TensorTracker, free_tensor, set_weight_attrs
 
 
-# used for deepseek_v3
-def get_moe_scores(
-    gating_output: paddle.Tensor,
-    n_group,
-    topk_group,
-    top_k,
-    routed_scaling_factor,
-    e_score_correction_bias,
-) -> paddle.Tensor:
-    """
-    compute moe scores using e_score_correction_bias.
-    """
-    scores = paddle.nn.functional.sigmoid(gating_output)
-    scores_with_bias = scores + e_score_correction_bias
-    scores, topk_values, topk_idx = noaux_tc(
-        scores,
-        scores_with_bias,
-        n_group,
-        topk_group,
-        top_k,
-        routed_scaling_factor,
-    )
-    return scores, topk_values, topk_idx
-
-
 class CutlassMoEMethod(UnquantizedFusedMoEMethod):
     """
     Use Cutlass Group Gemm to compute Fused MoE.

fastdeploy/model_executor/layers/moe/fused_moe_deepgemm_backend.py

Lines changed: 1 addition & 1 deletion
@@ -481,7 +481,7 @@ def apply_tp(
         gate_out = gate(x.cast("float32"))
 
         if layer.topk_method == "noaux_tc":
-            from .ep import get_moe_scores
+            from fastdeploy.model_executor.layers.moe.moe import get_moe_scores
 
             _, topk_weights, topk_ids = get_moe_scores(
                 gate_out,

fastdeploy/model_executor/layers/moe/fused_moe_marlin_backend.py

Lines changed: 1 addition & 25 deletions
@@ -19,39 +19,15 @@
 
 import fastdeploy
 from fastdeploy.distributed.communication import tensor_model_parallel_all_reduce
+from fastdeploy.model_executor.layers.moe.moe import get_moe_scores
 from fastdeploy.model_executor.ops.gpu import (
     MoeWna16MarlinGemmApi,
-    noaux_tc,
     tritonmoe_preprocess_func,
 )
 
 from ..quantization.quant_base import QuantMethodBase
 
 
-def get_moe_scores(
-    gating_output: paddle.Tensor,
-    n_group,
-    topk_group,
-    top_k,
-    routed_scaling_factor,
-    e_score_correction_bias,
-) -> paddle.Tensor:
-    """
-    compute moe scores using e_score_correction_bias.
-    """
-    scores = paddle.nn.functional.sigmoid(gating_output)
-    scores_with_bias = scores + e_score_correction_bias.unsqueeze(0)
-    scores, topk_values, topk_idx = noaux_tc(
-        scores,
-        scores_with_bias,
-        n_group,
-        topk_group,
-        top_k,
-        routed_scaling_factor,
-    )
-    return scores, topk_values, topk_idx
-
-
 def gptq_marlin_moe_repack(
     b_q_weight: paddle.Tensor,
     perm: paddle.Tensor,
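
Of the three deleted copies, only this one added the bias via e_score_correction_bias.unsqueeze(0). Assuming a per-expert bias of shape [num_experts] and scores of shape [num_tokens, num_experts], both spellings broadcast to the same result, so consolidating on one helper loses nothing; a quick check (the shapes are assumptions):

    import paddle

    scores = paddle.rand([4, 8])   # [num_tokens, num_experts], assumed shapes
    bias = paddle.rand([8])        # per-expert correction bias
    implicit = scores + bias               # broadcasts over the token axis
    explicit = scores + bias.unsqueeze(0)  # explicit leading axis, same result
    assert bool(paddle.allclose(implicit, explicit))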
