
Commit 4859f40

[Feature] GLM-45-AIR Support Mix Quantization(Dense wfp8afp8 and wint8 triton_moe_backend) (#4051)

1 parent 2056a42 commit 4859f40
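
This change threads a structured quantization config through the stack: the --quantization option now accepts a JSON object as well as a plain method name, so dense layers and MoE layers can be quantized differently. As a hedged sketch only (the exact key names are not shown in this diff and are illustrative assumptions), a mixed setup might be expressed like this:

    import json

    # Hypothetical mixed-quantization spec; the key names are assumptions,
    # not taken from this commit.
    quant_cfg = {
        "dense_quant_type": "wfp8afp8",  # FP8 weights/activations for dense layers
        "moe_quant_type": "wint8",       # INT8 weights for the MoE (triton backend)
    }
    # Passed to the worker the same way engine.py now does it:
    # json.dumps wrapped in single quotes.
    print(f"--quantization '{json.dumps(quant_cfg)}'")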

File tree

15 files changed (+302 -238 lines)


fastdeploy/config.py

Lines changed: 1 addition & 1 deletion
@@ -398,7 +398,7 @@ def __init__(
         # model for mtp/eagle/draft_model
         self.model: Optional[str] = None
         # quantization of model
-        self.quantization: Optional[str] = None
+        self.quantization: Optional[Dict[str, Any]] = None
         # allocate more blocks to prevent mtp from finishing the block earlier than the main model
         # Fixed now
         self.num_gpu_block_expand_ratio: Optional[float] = 1

fastdeploy/engine/args_utils.py

Lines changed: 3 additions & 2 deletions
@@ -40,6 +40,7 @@
     DeprecatedOptionWarning,
     FlexibleArgumentParser,
     is_port_available,
+    parse_quantization,
 )
 
 
@@ -137,7 +138,7 @@ class EngineArgs:
     """
     dynamic load weight strategy
     """
-    quantization: str = None
+    quantization: Optional[Dict[str, Any]] = None
     guided_decoding_backend: str = "off"
     """
     Guided decoding backend.
@@ -538,7 +539,7 @@ def add_cli_args(parser: FlexibleArgumentParser) -> FlexibleArgumentParser:
         )
         model_group.add_argument(
             "--quantization",
-            type=str,
+            type=parse_quantization,
             default=EngineArgs.quantization,
             help="Quantization name for the model, currently support "
             "'wint8', 'wint4',"
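
The body of parse_quantization is not part of this diff; a minimal sketch of what such a parser could look like, assuming it accepts either a bare method name ('wint8') or a JSON object, is:

    import json
    from typing import Any, Dict, Optional

    def parse_quantization(value: str) -> Optional[Dict[str, Any]]:
        # Illustrative sketch only; the real implementation is not shown in this diff.
        if value is None or value == "None":
            return None
        try:
            # Structured form, e.g. '{"dense_quant_type": "wfp8afp8", "moe_quant_type": "wint8"}'
            return json.loads(value)
        except json.JSONDecodeError:
            # Bare method name, e.g. 'wint8', normalized into dict form.
            return {"quantization": value}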

fastdeploy/engine/engine.py

Lines changed: 2 additions & 1 deletion
@@ -16,6 +16,7 @@
 
 from __future__ import annotations
 
+import json
 import multiprocessing
 import os
 import re
@@ -484,7 +485,7 @@ def _start_worker_service(self):
             f" --kv_cache_ratio {self.cfg.cache_config.kv_cache_ratio}"
             f" --expert_parallel_size {self.cfg.parallel_config.expert_parallel_size}"
             f" --data_parallel_size {self.cfg.parallel_config.data_parallel_size}"
-            f" --quantization {self.cfg.model_config.quantization}"
+            f" --quantization '{json.dumps(self.cfg.model_config.quantization)}'"
             f" --ori_vocab_size {ori_vocab_size}"
             f" --speculative_config '{self.cfg.speculative_config.to_json_string()}'"
             f" --graph_optimization_config '{self.cfg.graph_opt_config.to_json_string()}'"

fastdeploy/model_executor/layers/moe/ep.py

Lines changed: 1 addition & 30 deletions
@@ -28,38 +28,9 @@
 
 import fastdeploy
 from fastdeploy.config import MoEPhase
+from fastdeploy.model_executor.layers.moe.moe import get_moe_scores
 from fastdeploy.utils import singleton
 
-try:
-    from fastdeploy.model_executor.ops.gpu import noaux_tc
-except:
-    logger.warning("import noaux_tc Failed!")
-
-
-def get_moe_scores(
-    gating_output: paddle.Tensor,
-    n_group,
-    topk_group,
-    top_k,
-    routed_scaling_factor,
-    e_score_correction_bias,
-) -> paddle.Tensor:
-    """
-    compute moe scores using e_score_correction_bias.
-    """
-    scores = paddle.nn.functional.sigmoid(gating_output)
-    assert e_score_correction_bias is not None, "e_score_correction_bias is none!"
-    scores_with_bias = scores + e_score_correction_bias
-    scores, topk_values, topk_idx = noaux_tc(
-        scores,
-        scores_with_bias,
-        n_group if n_group > 0 else 1,
-        topk_group if topk_group > 0 else 1,
-        top_k,
-        routed_scaling_factor,
-    )
-    return scores, topk_values, topk_idx
-
 
 @singleton
 class DeepEPEngine:
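
This commit deduplicates three near-identical copies of get_moe_scores into fastdeploy/model_executor/layers/moe/moe.py. The new file itself is not shown in this diff; judging from the copy deleted here (the most defensive of the three), the shared helper presumably looks close to this:

    import paddle
    from fastdeploy.model_executor.ops.gpu import noaux_tc

    def get_moe_scores(
        gating_output: paddle.Tensor,
        n_group,
        topk_group,
        top_k,
        routed_scaling_factor,
        e_score_correction_bias,
    ) -> paddle.Tensor:
        """Compute MoE routing scores using e_score_correction_bias."""
        scores = paddle.nn.functional.sigmoid(gating_output)
        assert e_score_correction_bias is not None, "e_score_correction_bias is none!"
        scores_with_bias = scores + e_score_correction_bias
        scores, topk_values, topk_idx = noaux_tc(
            scores,
            scores_with_bias,
            n_group if n_group > 0 else 1,      # this copy guarded against
            topk_group if topk_group > 0 else 1,  # non-positive group counts
            top_k,
            routed_scaling_factor,
        )
        return scores, topk_values, topk_idx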

fastdeploy/model_executor/layers/moe/fused_moe_cutlass_backend.py

Lines changed: 2 additions & 30 deletions
@@ -27,11 +27,7 @@
 from .fused_moe_backend_base import UnquantizedFusedMoEMethod
 
 if current_platform.is_cuda():
-    from fastdeploy.model_executor.ops.gpu import (
-        moe_expert_dispatch,
-        moe_expert_reduce,
-        noaux_tc,
-    )
+    from fastdeploy.model_executor.ops.gpu import moe_expert_dispatch, moe_expert_reduce
 
 try:
     from fastdeploy.model_executor.ops.gpu import w4afp8_gemm_scale_permute
@@ -43,34 +39,10 @@
         moe_expert_reduce,
     )
 
+from fastdeploy.model_executor.layers.moe.moe import get_moe_scores
 from fastdeploy.model_executor.utils import TensorTracker, free_tensor, set_weight_attrs
 
 
-# used for deepseek_v3
-def get_moe_scores(
-    gating_output: paddle.Tensor,
-    n_group,
-    topk_group,
-    top_k,
-    routed_scaling_factor,
-    e_score_correction_bias,
-) -> paddle.Tensor:
-    """
-    compute moe scores using e_score_correction_bias.
-    """
-    scores = paddle.nn.functional.sigmoid(gating_output)
-    scores_with_bias = scores + e_score_correction_bias
-    scores, topk_values, topk_idx = noaux_tc(
-        scores,
-        scores_with_bias,
-        n_group,
-        topk_group,
-        top_k,
-        routed_scaling_factor,
-    )
-    return scores, topk_values, topk_idx
-
-
 class CutlassMoEMethod(UnquantizedFusedMoEMethod):
     """
     Use Cutlass Group Gemm to compute Fused MoE.

fastdeploy/model_executor/layers/moe/fused_moe_deepgemm_backend.py

Lines changed: 1 addition & 1 deletion
@@ -481,7 +481,7 @@ def apply_tp(
         gate_out = gate(x.cast("float32"))
 
         if layer.topk_method == "noaux_tc":
-            from .ep import get_moe_scores
+            from fastdeploy.model_executor.layers.moe.moe import get_moe_scores
 
             _, topk_weights, topk_ids = get_moe_scores(
                 gate_out,

fastdeploy/model_executor/layers/moe/fused_moe_marlin_backend.py

Lines changed: 1 addition & 25 deletions
@@ -19,39 +19,15 @@
 
 import fastdeploy
 from fastdeploy.distributed.communication import tensor_model_parallel_all_reduce
+from fastdeploy.model_executor.layers.moe.moe import get_moe_scores
 from fastdeploy.model_executor.ops.gpu import (
     MoeWna16MarlinGemmApi,
-    noaux_tc,
     tritonmoe_preprocess_func,
 )
 
 from ..quantization.quant_base import QuantMethodBase
 
 
-def get_moe_scores(
-    gating_output: paddle.Tensor,
-    n_group,
-    topk_group,
-    top_k,
-    routed_scaling_factor,
-    e_score_correction_bias,
-) -> paddle.Tensor:
-    """
-    compute moe scores using e_score_correction_bias.
-    """
-    scores = paddle.nn.functional.sigmoid(gating_output)
-    scores_with_bias = scores + e_score_correction_bias.unsqueeze(0)
-    scores, topk_values, topk_idx = noaux_tc(
-        scores,
-        scores_with_bias,
-        n_group,
-        topk_group,
-        top_k,
-        routed_scaling_factor,
-    )
-    return scores, topk_values, topk_idx
-
-
 def gptq_marlin_moe_repack(
     b_q_weight: paddle.Tensor,
     perm: paddle.Tensor,
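
Of the three deleted copies, only this one added the bias via e_score_correction_bias.unsqueeze(0). Assuming a per-expert bias of shape [num_experts] and scores of shape [num_tokens, num_experts], both spellings broadcast to the same result, so consolidating on one helper loses nothing; a quick check (the shapes are assumptions):

    import paddle

    scores = paddle.rand([4, 8])   # [num_tokens, num_experts], assumed shapes
    bias = paddle.rand([8])        # per-expert correction bias
    implicit = scores + bias               # broadcasts over the token axis
    explicit = scores + bias.unsqueeze(0)  # explicit leading axis, same result
    assert bool(paddle.allclose(implicit, explicit))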
