From 44e54b3d6115e442ca897bd096180ebaf41f8aed Mon Sep 17 00:00:00 2001 From: Shu Wang Date: Tue, 5 Aug 2025 12:47:25 +0000 Subject: [PATCH 1/4] Fix num_experts in modelopt_quant --- python/sglang/srt/layers/quantization/modelopt_quant.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/python/sglang/srt/layers/quantization/modelopt_quant.py b/python/sglang/srt/layers/quantization/modelopt_quant.py index fca0ee38b9c0..d6ca0f380bd1 100755 --- a/python/sglang/srt/layers/quantization/modelopt_quant.py +++ b/python/sglang/srt/layers/quantization/modelopt_quant.py @@ -8,6 +8,7 @@ import torch from torch.nn.parameter import Parameter +from sglang.srt.distributed import get_moe_expert_parallel_world_size from sglang.srt.layers.moe.cutlass_moe_params import CutlassMoEParams, CutlassMoEType from sglang.srt.layers.moe.utils import should_use_flashinfer_trtllm_moe from sglang.srt.layers.parameter import ModelWeightParameter, PerTensorScaleParameter @@ -764,7 +765,7 @@ def create_weights( ) # TODO(ch-wan): check if this is needed - layer.num_experts = num_experts + layer.num_experts = num_experts * get_moe_expert_parallel_world_size() layer.num_local_experts = num_experts layer.intermediate_size_per_partition = intermediate_size_per_partition layer.params_dtype = params_dtype From c5a1942fe46d4b64e493b65ea5e33e9c48263953 Mon Sep 17 00:00:00 2001 From: "Shu Wang." 
Date: Tue, 5 Aug 2025 19:11:50 +0000 Subject: [PATCH 2/4] Remove verbose print --- python/sglang/srt/layers/quantization/modelopt_quant.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/python/sglang/srt/layers/quantization/modelopt_quant.py b/python/sglang/srt/layers/quantization/modelopt_quant.py index d6ca0f380bd1..57f470fc8033 100755 --- a/python/sglang/srt/layers/quantization/modelopt_quant.py +++ b/python/sglang/srt/layers/quantization/modelopt_quant.py @@ -1107,7 +1107,7 @@ def process_weights_after_loading(self, layer: torch.nn.Module) -> None: layer.w13_weight_scale, ) - print("Applied flashinfer weight processing for both w13 and w2") + logger.info_once("Applied flashinfer weight processing for both w13 and w2") else: # CUTLASS processing - handle w13 and w2 separately @@ -1127,7 +1127,7 @@ def process_weights_after_loading(self, layer: torch.nn.Module) -> None: layer.w2_weight = Parameter(layer.w2_weight.data, requires_grad=False) # Both flashinfer cutlass and regular cutlass use same processing for w2 - print("Applied weight processing for both w13 and w2") + logger.info_once("Applied weight processing for both w13 and w2") # Set up CUTLASS MoE parameters device = layer.w13_weight.device From bfa1bfd3ecb3d909e22c45a5686ba2b235a91ba3 Mon Sep 17 00:00:00 2001 From: "Shu Wang." 
Date: Tue, 5 Aug 2025 20:13:55 +0000 Subject: [PATCH 3/4] Avoid modification of num_experts in create_weights --- python/sglang/srt/layers/quantization/modelopt_quant.py | 3 --- 1 file changed, 3 deletions(-) diff --git a/python/sglang/srt/layers/quantization/modelopt_quant.py b/python/sglang/srt/layers/quantization/modelopt_quant.py index 57f470fc8033..4e2b3a53e916 100755 --- a/python/sglang/srt/layers/quantization/modelopt_quant.py +++ b/python/sglang/srt/layers/quantization/modelopt_quant.py @@ -8,7 +8,6 @@ import torch from torch.nn.parameter import Parameter -from sglang.srt.distributed import get_moe_expert_parallel_world_size from sglang.srt.layers.moe.cutlass_moe_params import CutlassMoEParams, CutlassMoEType from sglang.srt.layers.moe.utils import should_use_flashinfer_trtllm_moe from sglang.srt.layers.parameter import ModelWeightParameter, PerTensorScaleParameter @@ -765,8 +764,6 @@ def create_weights( ) # TODO(ch-wan): check if this is needed - layer.num_experts = num_experts * get_moe_expert_parallel_world_size() - layer.num_local_experts = num_experts layer.intermediate_size_per_partition = intermediate_size_per_partition layer.params_dtype = params_dtype layer.quant_config = self.quant_config From 85e68e2da863bdda9ddda96f05be8afb1a7c7318 Mon Sep 17 00:00:00 2001 From: Shu Wang Date: Wed, 6 Aug 2025 21:24:58 +0000 Subject: [PATCH 4/4] Fix trtllm_fp4_block_scale_moe API change --- python/sglang/srt/layers/moe/fused_moe_triton/layer.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/python/sglang/srt/layers/moe/fused_moe_triton/layer.py b/python/sglang/srt/layers/moe/fused_moe_triton/layer.py index 56ffe371b5c5..82ec2ea979c3 100644 --- a/python/sglang/srt/layers/moe/fused_moe_triton/layer.py +++ b/python/sglang/srt/layers/moe/fused_moe_triton/layer.py @@ -1011,10 +1011,15 @@ def forward(self, hidden_states: torch.Tensor, topk_output): gemm1_weights_scale=self.gemm1_scales_fp4_shuffled.data.view( torch.float8_e4m3fn ), + gemm1_bias=None, 
gemm1_alpha=None, + gemm1_beta=None, + gemm1_clamp_limit=None, gemm2_weights=self.gemm2_weights_fp4_shuffled.data, gemm2_weights_scale=self.gemm2_scales_fp4_shuffled.data.view( torch.float8_e4m3fn ), + gemm2_bias=None, output1_scale_scalar=self.g1_scale_c.data, output1_scale_gate_scalar=self.g1_alphas.data, output2_scale_scalar=self.g2_alphas.data,