@@ -394,8 +394,7 @@ def split_kv_b_proj(kv_b_proj: torch.Tensor,
                 p.data.copy_(module_weights[n][:])
 
         if self.model_config.quant_config.layer_quant_mode.has_fp8_block_scales(
-        ) and is_sm_100f() and hasattr(
-                module, "weight_scale"):
+        ) and is_sm_100f() and hasattr(module, "weight_scale"):
             weight, weight_scale = resmooth_to_fp8_e8m0(
                 module.weight, module.weight_scale)
             transfromed_scale = transform_sf_into_required_layout(
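Note: resmooth_to_fp8_e8m0 rewrites FP8 block scales into E8M0, an
exponent-only scale format in which only power-of-two scales are
representable, as expected by SM100-class block-scaled kernels. Below is a
minimal sketch of the idea, assuming per-row scales for simplicity;
resmooth_sketch is a hypothetical stand-in, not the library's actual
implementation:

    import torch

    def resmooth_sketch(weight: torch.Tensor,
                        scale: torch.Tensor) -> tuple[torch.Tensor, torch.Tensor]:
        # E8M0 stores only an exponent, so snap each scale up to the
        # nearest power of two (rounding up avoids overflowing the
        # rescaled weights).
        new_scale = torch.exp2(torch.ceil(torch.log2(scale)))
        # Fold the residual ratio into the weights so that
        # weight * scale ~= new_weight * new_scale.
        new_weight = weight.float() * (scale / new_scale)
        return new_weight.to(weight.dtype), new_scale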
@@ -787,8 +786,9 @@ def __init__(self,
             for key in [EventType.Main, EventType.MoeShared]
         }
 
-    def _compute_shared_expert_tp_size(self, intermediate_size: int,
-                                       block_size: int) -> int:
+    def _compute_shared_expert_tp_size(
+            self, intermediate_size: int,
+            block_size: int) -> tuple[int, float | None]:
         """
         In the case of Deepseek-R1, the TP size of MLP is capped by intermediate_size // block_size.
         For example, when the intermediate_size is 2048 and block scaling size is 128,
@@ -800,7 +800,9 @@ def _compute_shared_expert_tp_size(self, intermediate_size: int,
         it's 128. For NVFP4, it's 16.
 
         Returns:
-            int: The computed tp_size.
+            tuple[int, float | None]: A tuple containing (shared_tp_size, shared_output_scale).
+                - shared_tp_size: The computed TP size.
+                - shared_output_scale: The output scale factor, or None if not needed.
         """
 
         assert intermediate_size % block_size == 0, "intermediate_size must be divisible by block_size."
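Note: a minimal sketch of how the returned pair could be computed under the
capping rule the docstring describes. The gcd-based capping and the tp_size
parameter are assumptions for illustration, not necessarily the PR's exact
implementation:

    import math

    def compute_shared_expert_tp_size(intermediate_size: int, block_size: int,
                                      tp_size: int) -> tuple[int, float | None]:
        # The TP degree cannot exceed the number of quantization blocks
        # per row, so cap it while keeping it a divisor of tp_size.
        shared_tp_size = math.gcd(intermediate_size // block_size, tp_size)
        # With a smaller TP degree, each shard's output is replicated
        # across tp_size // shared_tp_size ranks; rescale so the later
        # allreduce still sums to the correct value.
        shared_output_scale = None
        if shared_tp_size != tp_size:
            shared_output_scale = shared_tp_size / tp_size
        return shared_tp_size, shared_output_scale

    # Docstring example: intermediate_size=2048, block_size=128 caps the
    # shared-expert TP at 16; with tp_size=32 the output scale is 0.5.
    assert compute_shared_expert_tp_size(2048, 128, 32) == (16, 0.5)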