From 711d21741cd3a8085e80f68c72e38301a6ac2009 Mon Sep 17 00:00:00 2001 From: Edward Shogulin Date: Thu, 20 Nov 2025 11:14:17 +0000 Subject: [PATCH 01/36] Qwen3 MOE quick fix --- python/sglang/srt/models/qwen3_moe.py | 17 ++++++++++------- 1 file changed, 10 insertions(+), 7 deletions(-) diff --git a/python/sglang/srt/models/qwen3_moe.py b/python/sglang/srt/models/qwen3_moe.py index 9388b974a33b..be655062f6c3 100644 --- a/python/sglang/srt/models/qwen3_moe.py +++ b/python/sglang/srt/models/qwen3_moe.py @@ -68,6 +68,7 @@ ) from sglang.srt.server_args import get_global_server_args from sglang.srt.utils import ( + LazyValue, add_prefix, is_cuda, is_flashinfer_available, @@ -1128,14 +1129,16 @@ def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): else: logger.warning(f"Parameter {name} not found in params_dict") - # TODO mimic deepseek - # Lazy initialization of expert weights cache to avoid slowing down load_weights if not hasattr(self, "routed_experts_weights_of_layer"): - self.routed_experts_weights_of_layer = { - layer_id: self.model.layers[layer_id].mlp.get_moe_weights() - for layer_id in range(self.start_layer, self.end_layer) - if isinstance(self.model.layers[layer_id].mlp, Qwen3MoeSparseMoeBlock) - } + self.routed_experts_weights_of_layer = LazyValue( + lambda: { + layer_id: self.model.layers[layer_id].mlp.get_moe_weights() + for layer_id in range(self.start_layer, self.end_layer) + if isinstance( + self.model.layers[layer_id].mlp, Qwen3MoeSparseMoeBlock + ) + } + ) @classmethod def get_model_config_for_expert_location(cls, config): From 2b24ec3e09240da3d93cbc30bddf6c03c421356c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=90=D1=80=D1=82=D0=B5=D0=BC=20=D0=A1=D0=B0=D0=B2=D0=BA?= =?UTF-8?q?=D0=B8=D0=BD?= <58187114+OrangeRedeng@users.noreply.github.com> Date: Fri, 26 Dec 2025 18:08:46 +0300 Subject: [PATCH 02/36] Add nz support for MOE --- python/sglang/srt/layers/quantization/unquant.py | 14 +++++++++++++- 1 file changed, 13 insertions(+), 1 deletion(-) diff --git a/python/sglang/srt/layers/quantization/unquant.py b/python/sglang/srt/layers/quantization/unquant.py index 630b600687b4..9509dce821c2 100644 --- a/python/sglang/srt/layers/quantization/unquant.py +++ b/python/sglang/srt/layers/quantization/unquant.py @@ -25,6 +25,7 @@ get_bool_env_var, is_cpu, is_hip, + is_npu, next_power_of_2, set_weight_attrs, use_intel_amx_backend, @@ -40,6 +41,7 @@ _is_cpu_amx_available = cpu_has_amx_support() _is_hip = is_hip() _is_cpu = is_cpu() +_is_npu = is_npu() _use_aiter = get_bool_env_var("SGLANG_USE_AITER") and _is_hip if _use_aiter: @@ -47,12 +49,15 @@ from aiter.fused_moe import fused_moe from aiter.ops.shuffle import shuffle_weight +if _is_npu: + import torch_npu + NPU_FORMAT_FRACTAL_NZ = 29 + try: from flashinfer.fused_moe import cutlass_fused_moe as flashinfer_cutlass_fused_moe except ImportError: flashinfer_cutlass_fused_moe = None - class UnquantizedEmbeddingMethod(QuantizeMethodBase): """Unquantized method for embeddings.""" @@ -296,6 +301,13 @@ def process_weights_after_loading(self, layer: torch.nn.Module) -> None: layer.num_local_experts, *new_shape_w2 ) + if _is_npu: + for weight_name in ["w13_weight", "w2_weight"]: + weight = getattr(layer, weight_name) + weight.data = torch_npu.npu_format_cast( + weight.data, NPU_FORMAT_FRACTAL_NZ + ) + return def create_moe_runner( From 2ec286abba8d7b3158bba05592e923d3fdd4aded Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=90=D1=80=D1=82=D0=B5=D0=BC=20=D0=A1=D0=B0=D0=B2=D0=BA?= =?UTF-8?q?=D0=B8=D0=BD?= <58187114+OrangeRedeng@users.noreply.github.com> Date: Mon, 29 Dec 2025 16:48:16 +0300 Subject: [PATCH 03/36] Update python/sglang/srt/layers/quantization/unquant.py Co-authored-by: gemini-code-assist[bot] <176961590+gemini-code-assist[bot]@users.noreply.github.com> --- python/sglang/srt/layers/quantization/unquant.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/python/sglang/srt/layers/quantization/unquant.py b/python/sglang/srt/layers/quantization/unquant.py index 9509dce821c2..1789b6dc9c9f 100644 --- a/python/sglang/srt/layers/quantization/unquant.py +++ b/python/sglang/srt/layers/quantization/unquant.py @@ -302,10 +302,11 @@ def process_weights_after_loading(self, layer: torch.nn.Module) -> None: ) if _is_npu: + from sglang.srt.hardware_backend.npu.utils import NPUACLFormat for weight_name in ["w13_weight", "w2_weight"]: weight = getattr(layer, weight_name) weight.data = torch_npu.npu_format_cast( - weight.data, NPU_FORMAT_FRACTAL_NZ + weight.data, NPUACLFormat.ACL_FORMAT_FRACTAL_NZ ) return From d9a3818d0b9acc4fa321bc38184f4b171acbe7e2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=90=D1=80=D1=82=D0=B5=D0=BC=20=D0=A1=D0=B0=D0=B2=D0=BA?= =?UTF-8?q?=D0=B8=D0=BD?= <58187114+OrangeRedeng@users.noreply.github.com> Date: Mon, 29 Dec 2025 16:49:09 +0300 Subject: [PATCH 04/36] Update unquant.py --- python/sglang/srt/layers/quantization/unquant.py | 1 - 1 file changed, 1 deletion(-) diff --git a/python/sglang/srt/layers/quantization/unquant.py b/python/sglang/srt/layers/quantization/unquant.py index 1789b6dc9c9f..197d0b3296ca 100644 --- a/python/sglang/srt/layers/quantization/unquant.py +++ b/python/sglang/srt/layers/quantization/unquant.py @@ -51,7 +51,6 @@ if _is_npu: import torch_npu - NPU_FORMAT_FRACTAL_NZ = 29 try: from flashinfer.fused_moe import cutlass_fused_moe as flashinfer_cutlass_fused_moe From 1ad1ca16868873f768895775326def3d8f9ceba4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=90=D1=80=D1=82=D0=B5=D0=BC=20=D0=A1=D0=B0=D0=B2=D0=BA?= =?UTF-8?q?=D0=B8=D0=BD?= <58187114+OrangeRedeng@users.noreply.github.com> Date: Mon, 29 Dec 2025 16:49:47 +0300 Subject: [PATCH 05/36] Update unquant.py --- python/sglang/srt/layers/quantization/unquant.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/sglang/srt/layers/quantization/unquant.py b/python/sglang/srt/layers/quantization/unquant.py index 197d0b3296ca..bd21d2c34d8c 100644 --- a/python/sglang/srt/layers/quantization/unquant.py +++ b/python/sglang/srt/layers/quantization/unquant.py @@ -51,6 +51,7 @@ if _is_npu: import torch_npu + from sglang.srt.hardware_backend.npu.utils import NPUACLFormat try: from flashinfer.fused_moe import cutlass_fused_moe as flashinfer_cutlass_fused_moe @@ -301,7 +302,6 @@ def process_weights_after_loading(self, layer: torch.nn.Module) -> None: ) if _is_npu: - from sglang.srt.hardware_backend.npu.utils import NPUACLFormat for weight_name in ["w13_weight", "w2_weight"]: weight = getattr(layer, weight_name) weight.data = torch_npu.npu_format_cast( From e5484c94e4ce3acf34626aaa6dd6a59b793e6476 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=90=D1=80=D1=82=D0=B5=D0=BC=20=D0=A1=D0=B0=D0=B2=D0=BA?= =?UTF-8?q?=D0=B8=D0=BD?= <58187114+OrangeRedeng@users.noreply.github.com> Date: Mon, 29 Dec 2025 16:53:13 +0300 Subject: [PATCH 06/36] Fix lint issue --- python/sglang/srt/layers/quantization/unquant.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/python/sglang/srt/layers/quantization/unquant.py b/python/sglang/srt/layers/quantization/unquant.py index bd21d2c34d8c..b9df37c0ed67 100644 --- a/python/sglang/srt/layers/quantization/unquant.py +++ b/python/sglang/srt/layers/quantization/unquant.py @@ -51,6 +51,7 @@ if _is_npu: import torch_npu + from sglang.srt.hardware_backend.npu.utils import NPUACLFormat try: @@ -58,6 +59,7 @@ except ImportError: flashinfer_cutlass_fused_moe = None + class UnquantizedEmbeddingMethod(QuantizeMethodBase): """Unquantized method for embeddings.""" From 25a0e56560e3de8980b55bac812d32837f5cc678 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=90=D1=80=D1=82=D0=B5=D0=BC=20=D0=A1=D0=B0=D0=B2=D0=BA?= =?UTF-8?q?=D0=B8=D0=BD?= <58187114+OrangeRedeng@users.noreply.github.com> Date: Tue, 13 Jan 2026 12:47:07 +0300 Subject: [PATCH 07/36] Remove a non-used env ENABLE_ASCEND_MOE_NZ variable from ascend_npu_qwen3_examples.md --- docs/platforms/ascend_npu_qwen3_examples.md | 2 -- 1 file changed, 2 deletions(-) diff --git a/docs/platforms/ascend_npu_qwen3_examples.md b/docs/platforms/ascend_npu_qwen3_examples.md index 958ad8c97398..5278a22a1001 100644 --- a/docs/platforms/ascend_npu_qwen3_examples.md +++ b/docs/platforms/ascend_npu_qwen3_examples.md @@ -62,7 +62,6 @@ export HCCL_BUFFSIZE=1536 export HCCL_OP_EXPANSION_MODE=AIV export SGLANG_DEEPEP_NUM_MAX_DISPATCH_TOKENS_PER_RANK=32 export SGLANG_DEEPEP_BF16_DISPATCH=1 -export ENABLE_ASCEND_MOE_NZ=1 python -m sglang.launch_server \ --device npu \ @@ -84,7 +83,6 @@ export STREAMS_PER_DEVICE=32 export HCCL_BUFFSIZE=1536 export SGLANG_DEEPEP_NUM_MAX_DISPATCH_TOKENS_PER_RANK=32 export SGLANG_DEEPEP_BF16_DISPATCH=1 -export ENABLE_ASCEND_MOE_NZ=1 python -m sglang.launch_server \ --model-path Qwen/Qwen3-235B-A22B-Instruct-2507 \ From 61830a2a041c00aab8f2fe7c2b450faab063d5fb Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=90=D1=80=D1=82=D0=B5=D0=BC=20=D0=A1=D0=B0=D0=B2=D0=BA?= =?UTF-8?q?=D0=B8=D0=BD?= <58187114+OrangeRedeng@users.noreply.github.com> Date: Tue, 13 Jan 2026 12:48:53 +0300 Subject: [PATCH 08/36] Remove a non-used env ENABLE_MOE_NZ variable from ascend_npu_qwen3_examples.md --- docs/platforms/ascend_npu_deepseek_example.md | 2 -- 1 file changed, 2 deletions(-) diff --git a/docs/platforms/ascend_npu_deepseek_example.md b/docs/platforms/ascend_npu_deepseek_example.md index d75b942704b2..ad9d4bd7a078 100644 --- a/docs/platforms/ascend_npu_deepseek_example.md +++ b/docs/platforms/ascend_npu_deepseek_example.md @@ -22,7 +22,6 @@ export SGLANG_ENABLE_OVERLAP_PLAN_STREAM=1 #npu acceleration operator export SGLANG_NPU_USE_MLAPO=1 export SGLANG_USE_FIA_NZ=1 -export ENABLE_MOE_NZ=1 python3 -m sglang.launch_server \ --model-path ${MODEL_PATH} \ @@ -71,7 +70,6 @@ export HCCL_BUFFSIZE=1536 #npu acceleration operator export SGLANG_NPU_USE_MLAPO=1 export SGLANG_USE_FIA_NZ=1 -export ENABLE_MOE_NZ=1 export TASK_QUEUE_ENABLE=2 python -m sglang.launch_server \ From f586b40829e0f10f817a57e1d81167c2620d1732 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=90=D1=80=D1=82=D0=B5=D0=BC=20=D0=A1=D0=B0=D0=B2=D0=BA?= =?UTF-8?q?=D0=B8=D0=BD?= <58187114+OrangeRedeng@users.noreply.github.com> Date: Tue, 13 Jan 2026 13:04:55 +0300 Subject: [PATCH 09/36] Update NZ converison --- python/sglang/srt/layers/moe/ep_moe/layer.py | 15 ++++----------- 1 file changed, 4 insertions(+), 11 deletions(-) diff --git a/python/sglang/srt/layers/moe/ep_moe/layer.py b/python/sglang/srt/layers/moe/ep_moe/layer.py index 53c25b069ffb..316ae0ac669d 100644 --- a/python/sglang/srt/layers/moe/ep_moe/layer.py +++ b/python/sglang/srt/layers/moe/ep_moe/layer.py @@ -31,6 +31,7 @@ from sglang.srt.layers.quantization.fp8_kernel import is_fp8_fnuz from sglang.srt.layers.quantization.w4afp8 import W4AFp8Config, W4AFp8MoEMethod from sglang.srt.utils import get_bool_env_var, is_hip, is_npu +from sglang.srt.hardware_backend.npu.utils import npu_format_cast if TYPE_CHECKING: from sglang.srt.layers.moe.token_dispatcher import ( @@ -472,13 +473,6 @@ def forward( gmm2_weight_scale=self.w2_weight_scale, ).hidden_state - def release_weight_cache(self, weight: torch.Tensor): - # .contiguous() introduces additional memory overhead and needs to be released using resize_(0) - origin_weight = weight.data.transpose(1, 2) - new_weight = origin_weight.contiguous() - origin_weight.untyped_storage().resize_(0) - return new_weight - def permute_w13_weight_scale(self, w: torch.Tensor, tile_n: int): if tile_n % 2 != 0: raise ValueError(f"tile_n must be even, got {tile_n}") @@ -520,14 +514,13 @@ def reshape_w13_weight(self, weight: torch.Tensor, dim: int, chunk_size: int = 6 return weight.view(*original_shape[:dim], -1, *original_shape[dim + 1 :]) def _process_weights_after_loading(self, layer: torch.nn.Module) -> None: - w13 = self.release_weight_cache(layer.w13_weight) - torch_npu.npu_format_cast_(w13, 2) + w13 = layer.w13_weight cpu_w13 = w13.cpu() w13 = self.reshape_w13_weight(cpu_w13, -1).npu() - torch_npu.npu_format_cast_(w13, 29) + w13 = npu_format_cast(w13) layer.w13_weight = torch.nn.Parameter(w13, requires_grad=False) - w2 = torch_npu.npu_format_cast(layer.w2_weight.data, 29) + w2 = npu_format_cast(w2) layer.w2_weight = torch.nn.Parameter(w2, requires_grad=False) w13_scale = layer.w13_weight_scale.data.squeeze(-1).contiguous() From 4d38ade472e907148d60084c00e32ab29d9c301a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=90=D1=80=D1=82=D0=B5=D0=BC=20=D0=A1=D0=B0=D0=B2=D0=BA?= =?UTF-8?q?=D0=B8=D0=BD?= <58187114+OrangeRedeng@users.noreply.github.com> Date: Tue, 13 Jan 2026 13:30:19 +0300 Subject: [PATCH 10/36] Remove unnecessary function --- .../npu/quantization/fused_moe_method_npu.py | 13 ------------- 1 file changed, 13 deletions(-) diff --git a/python/sglang/srt/hardware_backend/npu/quantization/fused_moe_method_npu.py b/python/sglang/srt/hardware_backend/npu/quantization/fused_moe_method_npu.py index e1b2f6e2b378..d16ea13fe402 100644 --- a/python/sglang/srt/hardware_backend/npu/quantization/fused_moe_method_npu.py +++ b/python/sglang/srt/hardware_backend/npu/quantization/fused_moe_method_npu.py @@ -212,20 +212,7 @@ def create_weights( layer.register_parameter("w2_weight_offset", w2_weight_offset) set_weight_attrs(w2_weight_offset, extra_weight_attrs) - def release_weight_cache(self, weight: torch.Tensor): - # .contiguous() introduces additional memory overhead and needs to be released using resize_(0) - origin_weight = weight.data.transpose(1, 2) - new_weight = origin_weight.contiguous() - origin_weight.untyped_storage().resize_(0) - return new_weight - def process_weights_after_loading(self, layer: torch.nn.Module) -> None: - weight_data = self.release_weight_cache(layer.w13_weight.data) - layer.w13_weight = torch.nn.Parameter(weight_data, requires_grad=False) - - weight_data = self.release_weight_cache(layer.w2_weight.data) - layer.w2_weight = torch.nn.Parameter(weight_data, requires_grad=False) - layer.w13_weight_scale = torch.nn.Parameter( layer.w13_weight_scale.data.squeeze(-1).contiguous().to(torch.float32), requires_grad=False, From 2f4608d69197b8b3add9d82170a72e322cc62b07 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=90=D1=80=D1=82=D0=B5=D0=BC=20=D0=A1=D0=B0=D0=B2=D0=BA?= =?UTF-8?q?=D0=B8=D0=BD?= <58187114+OrangeRedeng@users.noreply.github.com> Date: Tue, 13 Jan 2026 15:19:44 +0300 Subject: [PATCH 11/36] Update layer.py --- python/sglang/srt/layers/moe/ep_moe/layer.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/python/sglang/srt/layers/moe/ep_moe/layer.py b/python/sglang/srt/layers/moe/ep_moe/layer.py index 316ae0ac669d..ce7e15f6900d 100644 --- a/python/sglang/srt/layers/moe/ep_moe/layer.py +++ b/python/sglang/srt/layers/moe/ep_moe/layer.py @@ -514,13 +514,12 @@ def reshape_w13_weight(self, weight: torch.Tensor, dim: int, chunk_size: int = 6 return weight.view(*original_shape[:dim], -1, *original_shape[dim + 1 :]) def _process_weights_after_loading(self, layer: torch.nn.Module) -> None: - w13 = layer.w13_weight - cpu_w13 = w13.cpu() + cpu_w13 = layer.w13_weight.cpu() w13 = self.reshape_w13_weight(cpu_w13, -1).npu() w13 = npu_format_cast(w13) layer.w13_weight = torch.nn.Parameter(w13, requires_grad=False) - w2 = npu_format_cast(w2) + w2 = npu_format_cast(layer.w2_weight.data) layer.w2_weight = torch.nn.Parameter(w2, requires_grad=False) w13_scale = layer.w13_weight_scale.data.squeeze(-1).contiguous() From 36992882163a28553f4a54622e89a33679536f83 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=90=D1=80=D1=82=D0=B5=D0=BC=20=D0=A1=D0=B0=D0=B2=D0=BA?= =?UTF-8?q?=D0=B8=D0=BD?= <58187114+OrangeRedeng@users.noreply.github.com> Date: Tue, 13 Jan 2026 15:23:09 +0300 Subject: [PATCH 12/36] Update unquant.py --- python/sglang/srt/layers/quantization/unquant.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/python/sglang/srt/layers/quantization/unquant.py b/python/sglang/srt/layers/quantization/unquant.py index c980bacaa351..c3637364e8c2 100644 --- a/python/sglang/srt/layers/quantization/unquant.py +++ b/python/sglang/srt/layers/quantization/unquant.py @@ -52,7 +52,7 @@ if _is_npu: import torch_npu - from sglang.srt.hardware_backend.npu.utils import NPUACLFormat + from sglang.srt.hardware_backend.npu.utils import npu_format_cast try: from flashinfer.fused_moe import cutlass_fused_moe as flashinfer_cutlass_fused_moe @@ -306,8 +306,8 @@ def process_weights_after_loading(self, layer: torch.nn.Module) -> None: if _is_npu: for weight_name in ["w13_weight", "w2_weight"]: weight = getattr(layer, weight_name) - weight.data = torch_npu.npu_format_cast( - weight.data, NPUACLFormat.ACL_FORMAT_FRACTAL_NZ + weight.data = npu_format_cast( + weight.data, ) return From 3092b3148eb005f76ffa832ad1d5208c8a6d01f3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=90=D1=80=D1=82=D0=B5=D0=BC=20=D0=A1=D0=B0=D0=B2=D0=BA?= =?UTF-8?q?=D0=B8=D0=BD?= <58187114+OrangeRedeng@users.noreply.github.com> Date: Tue, 13 Jan 2026 16:57:16 +0300 Subject: [PATCH 13/36] Update layer.py --- python/sglang/srt/layers/moe/ep_moe/layer.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/sglang/srt/layers/moe/ep_moe/layer.py b/python/sglang/srt/layers/moe/ep_moe/layer.py index ce7e15f6900d..13687dc4646f 100644 --- a/python/sglang/srt/layers/moe/ep_moe/layer.py +++ b/python/sglang/srt/layers/moe/ep_moe/layer.py @@ -514,7 +514,7 @@ def reshape_w13_weight(self, weight: torch.Tensor, dim: int, chunk_size: int = 6 return weight.view(*original_shape[:dim], -1, *original_shape[dim + 1 :]) def _process_weights_after_loading(self, layer: torch.nn.Module) -> None: - cpu_w13 = layer.w13_weight.cpu() + cpu_w13 = layer.w13_weight.data.transpose(1, 2).contiguous().cpu() w13 = self.reshape_w13_weight(cpu_w13, -1).npu() w13 = npu_format_cast(w13) layer.w13_weight = torch.nn.Parameter(w13, requires_grad=False) From fe2aed78234288e086eb22314d1362a3737b0197 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=90=D1=80=D1=82=D0=B5=D0=BC=20=D0=A1=D0=B0=D0=B2=D0=BA?= =?UTF-8?q?=D0=B8=D0=BD?= <58187114+OrangeRedeng@users.noreply.github.com> Date: Tue, 13 Jan 2026 18:31:18 +0300 Subject: [PATCH 14/36] Update layer.py --- python/sglang/srt/layers/moe/ep_moe/layer.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/python/sglang/srt/layers/moe/ep_moe/layer.py b/python/sglang/srt/layers/moe/ep_moe/layer.py index 13687dc4646f..23700047dc2e 100644 --- a/python/sglang/srt/layers/moe/ep_moe/layer.py +++ b/python/sglang/srt/layers/moe/ep_moe/layer.py @@ -514,12 +514,12 @@ def reshape_w13_weight(self, weight: torch.Tensor, dim: int, chunk_size: int = 6 return weight.view(*original_shape[:dim], -1, *original_shape[dim + 1 :]) def _process_weights_after_loading(self, layer: torch.nn.Module) -> None: - cpu_w13 = layer.w13_weight.data.transpose(1, 2).contiguous().cpu() + cpu_w13 = layer.w13_weight.transpose(1, 2).cpu() w13 = self.reshape_w13_weight(cpu_w13, -1).npu() w13 = npu_format_cast(w13) layer.w13_weight = torch.nn.Parameter(w13, requires_grad=False) - w2 = npu_format_cast(layer.w2_weight.data) + w2 = npu_format_cast(layer.w2_weight) layer.w2_weight = torch.nn.Parameter(w2, requires_grad=False) w13_scale = layer.w13_weight_scale.data.squeeze(-1).contiguous() From 1054c9d634e1543f13be3050cf6019c04e440e36 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=90=D1=80=D1=82=D0=B5=D0=BC=20=D0=A1=D0=B0=D0=B2=D0=BA?= =?UTF-8?q?=D0=B8=D0=BD?= <58187114+OrangeRedeng@users.noreply.github.com> Date: Tue, 13 Jan 2026 19:03:36 +0300 Subject: [PATCH 15/36] Update fused_moe_method_npu.py --- .../hardware_backend/npu/quantization/fused_moe_method_npu.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/python/sglang/srt/hardware_backend/npu/quantization/fused_moe_method_npu.py b/python/sglang/srt/hardware_backend/npu/quantization/fused_moe_method_npu.py index d16ea13fe402..a9b6584796fd 100644 --- a/python/sglang/srt/hardware_backend/npu/quantization/fused_moe_method_npu.py +++ b/python/sglang/srt/hardware_backend/npu/quantization/fused_moe_method_npu.py @@ -227,8 +227,8 @@ def process_weights_after_loading(self, layer: torch.nn.Module) -> None: layer.w2_weight_offset.data.squeeze(-1).contiguous(), requires_grad=False ) - layer.w13_weight.data = npu_format_cast(layer.w13_weight.data) - layer.w2_weight.data = npu_format_cast(layer.w2_weight.data) + layer.w13_weight.data = npu_format_cast(layer.w13_weight.data.transpose(1, 2)) + layer.w2_weight.data = npu_format_cast(layer.w2_weight.data.transpose(1, 2)) def create_moe_runner( self, layer: torch.nn.Module, moe_runner_config: "MoeRunnerConfig" From da5158b4642af4c8572bde78d465138b9dce8037 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=90=D1=80=D1=82=D0=B5=D0=BC=20=D0=A1=D0=B0=D0=B2=D0=BA?= =?UTF-8?q?=D0=B8=D0=BD?= <58187114+OrangeRedeng@users.noreply.github.com> Date: Wed, 14 Jan 2026 11:49:51 +0300 Subject: [PATCH 16/36] Update fused_moe_method_npu.py --- .../npu/quantization/fused_moe_method_npu.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/python/sglang/srt/hardware_backend/npu/quantization/fused_moe_method_npu.py b/python/sglang/srt/hardware_backend/npu/quantization/fused_moe_method_npu.py index a9b6584796fd..670887425a32 100644 --- a/python/sglang/srt/hardware_backend/npu/quantization/fused_moe_method_npu.py +++ b/python/sglang/srt/hardware_backend/npu/quantization/fused_moe_method_npu.py @@ -213,6 +213,9 @@ def create_weights( set_weight_attrs(w2_weight_offset, extra_weight_attrs) def process_weights_after_loading(self, layer: torch.nn.Module) -> None: + layer.w13_weight.data = npu_format_cast(layer.w13_weight.data.transpose(1, 2).contiguous()) + layer.w2_weight.data = npu_format_cast(layer.w2_weight.data.transpose(1, 2).contiguous()) + layer.w13_weight_scale = torch.nn.Parameter( layer.w13_weight_scale.data.squeeze(-1).contiguous().to(torch.float32), requires_grad=False, @@ -227,9 +230,6 @@ def process_weights_after_loading(self, layer: torch.nn.Module) -> None: layer.w2_weight_offset.data.squeeze(-1).contiguous(), requires_grad=False ) - layer.w13_weight.data = npu_format_cast(layer.w13_weight.data.transpose(1, 2)) - layer.w2_weight.data = npu_format_cast(layer.w2_weight.data.transpose(1, 2)) - def create_moe_runner( self, layer: torch.nn.Module, moe_runner_config: "MoeRunnerConfig" ): From 0162b74db5cd71722959ec165bae7afa52aec5e3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=90=D1=80=D1=82=D0=B5=D0=BC=20=D0=A1=D0=B0=D0=B2=D0=BA?= =?UTF-8?q?=D0=B8=D0=BD?= <58187114+OrangeRedeng@users.noreply.github.com> Date: Wed, 14 Jan 2026 14:09:59 +0300 Subject: [PATCH 17/36] Update fused_moe_method_npu.py --- .../npu/quantization/fused_moe_method_npu.py | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/python/sglang/srt/hardware_backend/npu/quantization/fused_moe_method_npu.py b/python/sglang/srt/hardware_backend/npu/quantization/fused_moe_method_npu.py index 670887425a32..cdbe973827f0 100644 --- a/python/sglang/srt/hardware_backend/npu/quantization/fused_moe_method_npu.py +++ b/python/sglang/srt/hardware_backend/npu/quantization/fused_moe_method_npu.py @@ -217,17 +217,16 @@ def process_weights_after_loading(self, layer: torch.nn.Module) -> None: layer.w2_weight.data = npu_format_cast(layer.w2_weight.data.transpose(1, 2).contiguous()) layer.w13_weight_scale = torch.nn.Parameter( - layer.w13_weight_scale.data.squeeze(-1).contiguous().to(torch.float32), - requires_grad=False, + layer.w13_weight_scale.data.squeeze(-1), requires_grad=False ) layer.w2_weight_scale = torch.nn.Parameter( - layer.w2_weight_scale.data.squeeze(-1).contiguous(), requires_grad=False + layer.w2_weight_scale.data.squeeze(-1), requires_grad=False ) layer.w13_weight_offset = torch.nn.Parameter( - layer.w13_weight_offset.data.squeeze(-1).contiguous(), requires_grad=False + layer.w13_weight_offset.data.squeeze(-1), requires_grad=False ) layer.w2_weight_offset = torch.nn.Parameter( - layer.w2_weight_offset.data.squeeze(-1).contiguous(), requires_grad=False + layer.w2_weight_offset.data.squeeze(-1), requires_grad=False ) def create_moe_runner( From 019e2d6657a7db480ac5c4d5512321d562d1871f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=90=D1=80=D1=82=D0=B5=D0=BC=20=D0=A1=D0=B0=D0=B2=D0=BA?= =?UTF-8?q?=D0=B8=D0=BD?= <58187114+OrangeRedeng@users.noreply.github.com> Date: Wed, 14 Jan 2026 15:09:43 +0300 Subject: [PATCH 18/36] Update fused_moe_method_npu.py --- .../hardware_backend/npu/quantization/fused_moe_method_npu.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/python/sglang/srt/hardware_backend/npu/quantization/fused_moe_method_npu.py b/python/sglang/srt/hardware_backend/npu/quantization/fused_moe_method_npu.py index cdbe973827f0..bbd46a584275 100644 --- a/python/sglang/srt/hardware_backend/npu/quantization/fused_moe_method_npu.py +++ b/python/sglang/srt/hardware_backend/npu/quantization/fused_moe_method_npu.py @@ -213,8 +213,8 @@ def create_weights( set_weight_attrs(w2_weight_offset, extra_weight_attrs) def process_weights_after_loading(self, layer: torch.nn.Module) -> None: - layer.w13_weight.data = npu_format_cast(layer.w13_weight.data.transpose(1, 2).contiguous()) - layer.w2_weight.data = npu_format_cast(layer.w2_weight.data.transpose(1, 2).contiguous()) + layer.w13_weight.data = npu_format_cast(layer.w13_weight.data.transpose(1, 2)) + layer.w2_weight.data = npu_format_cast(layer.w2_weight.data.transpose(1, 2)) layer.w13_weight_scale = torch.nn.Parameter( layer.w13_weight_scale.data.squeeze(-1), requires_grad=False From d02b451d6cae56162c57f1cdf953adc6d30248f3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=90=D1=80=D1=82=D0=B5=D0=BC=20=D0=A1=D0=B0=D0=B2=D0=BA?= =?UTF-8?q?=D0=B8=D0=BD?= <58187114+OrangeRedeng@users.noreply.github.com> Date: Wed, 14 Jan 2026 17:30:48 +0300 Subject: [PATCH 19/36] =?UTF-8?q?Create=20test=5Fascend=5Fmemory=5Fconsump?= =?UTF-8?q?tion.py=E2=80=8E?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- ..._ascend_memory_consumption.py\342\200\216" | 73 +++++++++++++++++++ 1 file changed, 73 insertions(+) create mode 100644 "test/srt/ascend/test_ascend_memory_consumption.py\342\200\216" diff --git "a/test/srt/ascend/test_ascend_memory_consumption.py\342\200\216" "b/test/srt/ascend/test_ascend_memory_consumption.py\342\200\216" new file mode 100644 index 000000000000..fe0d9f58d4ff --- /dev/null +++ "b/test/srt/ascend/test_ascend_memory_consumption.py\342\200\216" @@ -0,0 +1,73 @@ +""" +Usage: +python3 -m unittest test_ascend_memory_consumption.TestMemoryConsumptionAscend.test_memory_consumption +""" + +import os +import unittest + +import torch + +from sglang.srt.utils import kill_process_tree +from sglang.test.test_utils import ( + DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH, + DEFAULT_URL_FOR_TEST, + CustomTestCase, + popen_launch_server, +) + +if "ASCEND_RT_VISIBLE_DEVICES" not in os.environ: + os.environ["ASCEND_RT_VISIBLE_DEVICES"] = "0,1" +DEFAULT_PORT_FOR_SRT_TEST_RUNNER = ( + 8000 + int(os.environ.get("ASCEND_RT_VISIBLE_DEVICES", "0")[0]) * 100 +) +DEFAULT_URL_FOR_TEST = f"http://127.0.0.1:{DEFAULT_PORT_FOR_SRT_TEST_RUNNER + 1000}" + + +class TestMemoryConsumptionAscend(CustomTestCase): + + def test_memory_consumption(self): + + model = "nytopop/Qwen3-30B-A3B.w8a8" + base_url = DEFAULT_URL_FOR_TEST + + ### Calculate initial used memory + free_npu_memory, total_npu_memory = torch.npu.mem_get_info() + initial_used_memory = total_npu_memory - free_npu_memory + + process = popen_launch_server( + model, + base_url, + timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH, + other_args=[ + "--trust-remote-code", + "--device", + "npu", + "--attention-backend", + "ascend", + "--tp-size", + "2", + "--mem-fraction-static", + "0.8", + "--cuda-graph-bs", + "1", + "--max-total-tokens", + "1024", + "--disable-radix-cache", + "--disable-cuda-graph", + ], + ) + + ### Calculate initial used memory + free_npu_memory, total_npu_memory = torch.npu.mem_get_info() + used_memory_after_server_starting = ( + total_npu_memory - free_npu_memory - initial_used_memory + ) / (1 << 30) + self.assertLessEqual(float(used_memory_after_server_starting), 17.00) + + # Clean up everything + kill_process_tree(process.pid) + + +if __name__ == "__main__": + unittest.main() From edbba1b185a116838e02d0ae5a7bb71a800efb31 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=90=D1=80=D1=82=D0=B5=D0=BC=20=D0=A1=D0=B0=D0=B2=D0=BA?= =?UTF-8?q?=D0=B8=D0=BD?= <58187114+OrangeRedeng@users.noreply.github.com> Date: Thu, 15 Jan 2026 15:26:33 +0300 Subject: [PATCH 20/36] Fix lint issue --- .../hardware_backend/npu/quantization/fused_moe_method_npu.py | 1 - 1 file changed, 1 deletion(-) diff --git a/python/sglang/srt/hardware_backend/npu/quantization/fused_moe_method_npu.py b/python/sglang/srt/hardware_backend/npu/quantization/fused_moe_method_npu.py index c44fed1ab957..3387a5848550 100644 --- a/python/sglang/srt/hardware_backend/npu/quantization/fused_moe_method_npu.py +++ b/python/sglang/srt/hardware_backend/npu/quantization/fused_moe_method_npu.py @@ -153,7 +153,6 @@ class NPUW8A8Int8DynamicMoEMethod(_NPUFusedMoEMethodBase): def process_weights_after_loading(self, layer: torch.nn.Module) -> None: layer.w13_weight.data = npu_format_cast(layer.w13_weight.data.transpose(1, 2)) layer.w2_weight.data = npu_format_cast(layer.w2_weight.data.transpose(1, 2)) - layer.w13_weight_scale = torch.nn.Parameter( layer.w13_weight_scale.data.squeeze(-1), requires_grad=False ) From fa13828abf1cf66433d2d77f74cbe7b76582a42a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=90=D1=80=D1=82=D0=B5=D0=BC=20=D0=A1=D0=B0=D0=B2=D0=BA?= =?UTF-8?q?=D0=B8=D0=BD?= <58187114+OrangeRedeng@users.noreply.github.com> Date: Thu, 15 Jan 2026 15:27:17 +0300 Subject: [PATCH 21/36] Fix lint issue --- python/sglang/srt/layers/moe/ep_moe/layer.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/sglang/srt/layers/moe/ep_moe/layer.py b/python/sglang/srt/layers/moe/ep_moe/layer.py index d34a01425e66..7e8bb33ca70a 100644 --- a/python/sglang/srt/layers/moe/ep_moe/layer.py +++ b/python/sglang/srt/layers/moe/ep_moe/layer.py @@ -7,6 +7,7 @@ from sglang.srt.compilation.piecewise_context_manager import is_in_piecewise_cuda_graph from sglang.srt.environ import envs +from sglang.srt.hardware_backend.npu.utils import npu_format_cast from sglang.srt.layers import deep_gemm_wrapper from sglang.srt.layers.moe import ( get_deepep_mode, @@ -31,7 +32,6 @@ from sglang.srt.layers.quantization.fp8_kernel import is_fp8_fnuz from sglang.srt.layers.quantization.w4afp8 import W4AFp8Config, W4AFp8MoEMethod from sglang.srt.utils import get_bool_env_var, is_hip, is_npu -from sglang.srt.hardware_backend.npu.utils import npu_format_cast if TYPE_CHECKING: from sglang.srt.layers.moe.token_dispatcher import ( From c78449b6e20adc5144a6957bb8fa62eac50863fd Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=90=D1=80=D1=82=D0=B5=D0=BC=20=D0=A1=D0=B0=D0=B2=D0=BA?= =?UTF-8?q?=D0=B8=D0=BD?= <58187114+OrangeRedeng@users.noreply.github.com> Date: Thu, 15 Jan 2026 15:28:02 +0300 Subject: [PATCH 22/36] Fix lint issue --- python/sglang/srt/layers/quantization/unquant.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/python/sglang/srt/layers/quantization/unquant.py b/python/sglang/srt/layers/quantization/unquant.py index c3637364e8c2..610e1a848e35 100644 --- a/python/sglang/srt/layers/quantization/unquant.py +++ b/python/sglang/srt/layers/quantization/unquant.py @@ -50,8 +50,6 @@ from aiter.ops.shuffle import shuffle_weight if _is_npu: - import torch_npu - from sglang.srt.hardware_backend.npu.utils import npu_format_cast try: From 5888bd6ce8b823c645b50662bd465f3b288d7c37 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=90=D1=80=D1=82=D0=B5=D0=BC=20=D0=A1=D0=B0=D0=B2=D0=BA?= =?UTF-8?q?=D0=B8=D0=BD?= <58187114+OrangeRedeng@users.noreply.github.com> Date: Fri, 16 Jan 2026 13:42:14 +0300 Subject: [PATCH 23/36] Update fused_moe_method_npu.py --- .../hardware_backend/npu/quantization/fused_moe_method_npu.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/python/sglang/srt/hardware_backend/npu/quantization/fused_moe_method_npu.py b/python/sglang/srt/hardware_backend/npu/quantization/fused_moe_method_npu.py index 3387a5848550..b3bd7c2155e6 100644 --- a/python/sglang/srt/hardware_backend/npu/quantization/fused_moe_method_npu.py +++ b/python/sglang/srt/hardware_backend/npu/quantization/fused_moe_method_npu.py @@ -162,12 +162,12 @@ def process_weights_after_loading(self, layer: torch.nn.Module) -> None: # Compressed-tensors format doesn't have this field if hasattr(layer, "w13_weight_offset"): layer.w13_weight_offset = torch.nn.Parameter( - layer.w13_weight_offset.data.squeeze(-1).contiguous(), + layer.w13_weight_offset.data.squeeze(-1), requires_grad=False, ) if hasattr(layer, "w2_weight_offset"): layer.w2_weight_offset = torch.nn.Parameter( - layer.w2_weight_offset.data.squeeze(-1).contiguous(), + layer.w2_weight_offset.data.squeeze(-1), requires_grad=False, ) From ab233ad772a66a65f275ba9c405a374d7b99685f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=90=D1=80=D1=82=D0=B5=D0=BC=20=D0=A1=D0=B0=D0=B2=D0=BA?= =?UTF-8?q?=D0=B8=D0=BD?= <58187114+OrangeRedeng@users.noreply.github.com> Date: Fri, 16 Jan 2026 14:56:25 +0300 Subject: [PATCH 24/36] =?UTF-8?q?Update=20test=5Fascend=5Fmemory=5Fconsump?= =?UTF-8?q?tion.py=E2=80=8E?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- "test/srt/ascend/test_ascend_memory_consumption.py\342\200\216" | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git "a/test/srt/ascend/test_ascend_memory_consumption.py\342\200\216" "b/test/srt/ascend/test_ascend_memory_consumption.py\342\200\216" index fe0d9f58d4ff..cbc290e965b4 100644 --- "a/test/srt/ascend/test_ascend_memory_consumption.py\342\200\216" +++ "b/test/srt/ascend/test_ascend_memory_consumption.py\342\200\216" @@ -63,7 +63,7 @@ class TestMemoryConsumptionAscend(CustomTestCase): used_memory_after_server_starting = ( total_npu_memory - free_npu_memory - initial_used_memory ) / (1 << 30) - self.assertLessEqual(float(used_memory_after_server_starting), 17.00) + self.assertLessEqual(float(used_memory_after_server_starting), 16.00) # Clean up everything kill_process_tree(process.pid) From 5ceeab12c0c5e15c63f0dae65829f333de609e3e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=90=D1=80=D1=82=D0=B5=D0=BC=20=D0=A1=D0=B0=D0=B2=D0=BA?= =?UTF-8?q?=D0=B8=D0=BD?= <58187114+OrangeRedeng@users.noreply.github.com> Date: Fri, 16 Jan 2026 15:04:04 +0300 Subject: [PATCH 25/36] Update run_suite.py --- test/srt/run_suite.py | 1 + 1 file changed, 1 insertion(+) diff --git a/test/srt/run_suite.py b/test/srt/run_suite.py index fbc7c8154476..b6f3ea25daa4 100644 --- a/test/srt/run_suite.py +++ b/test/srt/run_suite.py @@ -165,6 +165,7 @@ ], "per-commit-2-npu-a2": [ TestFile("ascend/test_ascend_graph_tp2_bf16.py", 400), + TestFile("ascend/test_ascend_memory_consumption.py‎", 400), TestFile("ascend/test_ascend_mla_fia_w8a8int8.py", 400), TestFile("ascend/test_ascend_tp2_bf16.py", 400), TestFile("ascend/test_ascend_tp2_fia_bf16.py", 400), From b8d8285b2ac951f0c6300cea89b1e0399c24db5a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=90=D1=80=D1=82=D0=B5=D0=BC=20=D0=A1=D0=B0=D0=B2=D0=BA?= =?UTF-8?q?=D0=B8=D0=BD?= <58187114+OrangeRedeng@users.noreply.github.com> Date: Mon, 26 Jan 2026 18:30:54 +0300 Subject: [PATCH 26/36] Move transpose(1,2) from forward_npu() to process_weights --- python/sglang/srt/layers/quantization/unquant.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/python/sglang/srt/layers/quantization/unquant.py b/python/sglang/srt/layers/quantization/unquant.py index a197c83b8eb9..6ae02aaec67b 100644 --- a/python/sglang/srt/layers/quantization/unquant.py +++ b/python/sglang/srt/layers/quantization/unquant.py @@ -304,6 +304,7 @@ def process_weights_after_loading(self, layer: torch.nn.Module) -> None: if _is_npu: for weight_name in ["w13_weight", "w2_weight"]: weight = getattr(layer, weight_name) + weight.data = weight.data.transpose(1, 2) weight.data = npu_format_cast( weight.data, ) @@ -506,9 +507,6 @@ def forward_npu( expert_tokens = expert_tokens.to(torch.int64) w13_bias = [layer.w13_weight_bias] if self.with_bias else None w2_bias = [layer.w2_weight_bias] if self.with_bias else None - if layer.w13_weight.shape[-1] == layer.hidden_size: - w13 = layer.w13_weight.transpose(1, 2) - w2 = layer.w2_weight.transpose(1, 2) # gmm1: gate_up_proj hidden_states = torch_npu.npu_grouped_matmul( From a80de0b42d9639638aa0898398d7e80ab98383d2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=90=D1=80=D1=82=D0=B5=D0=BC=20=D0=A1=D0=B0=D0=B2=D0=BA?= =?UTF-8?q?=D0=B8=D0=BD?= <58187114+OrangeRedeng@users.noreply.github.com> Date: Mon, 26 Jan 2026 18:48:25 +0300 Subject: [PATCH 27/36] Quickfix --- python/sglang/srt/layers/quantization/unquant.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/python/sglang/srt/layers/quantization/unquant.py b/python/sglang/srt/layers/quantization/unquant.py index 6ae02aaec67b..628fadbd166e 100644 --- a/python/sglang/srt/layers/quantization/unquant.py +++ b/python/sglang/srt/layers/quantization/unquant.py @@ -511,7 +511,7 @@ def forward_npu( # gmm1: gate_up_proj hidden_states = torch_npu.npu_grouped_matmul( x=[hidden_states], - weight=[w13], + weight=[layer.w13_weight], bias=w13_bias, split_item=2, group_list_type=0, @@ -535,7 +535,7 @@ def forward_npu( # gmm2: down_proj hidden_states = torch_npu.npu_grouped_matmul( x=[hidden_states], - weight=[w2], + weight=[layer.w2_weight], bias=w2_bias, split_item=2, group_list_type=0, From 47a5d8aa6b38814b78d1473257a72ab68dea0764 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=90=D1=80=D1=82=D0=B5=D0=BC=20=D0=A1=D0=B0=D0=B2=D0=BA?= =?UTF-8?q?=D0=B8=D0=BD?= <58187114+OrangeRedeng@users.noreply.github.com> Date: Tue, 27 Jan 2026 14:32:55 +0300 Subject: [PATCH 28/36] =?UTF-8?q?Delete=20test/srt/ascend/test=5Fascend=5F?= =?UTF-8?q?memory=5Fconsumption.py=E2=80=8E?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- ..._ascend_memory_consumption.py\342\200\216" | 73 ------------------- 1 file changed, 73 deletions(-) delete mode 100644 "test/srt/ascend/test_ascend_memory_consumption.py\342\200\216" diff --git "a/test/srt/ascend/test_ascend_memory_consumption.py\342\200\216" "b/test/srt/ascend/test_ascend_memory_consumption.py\342\200\216" deleted file mode 100644 index cbc290e965b4..000000000000 --- "a/test/srt/ascend/test_ascend_memory_consumption.py\342\200\216" +++ /dev/null @@ -1,73 +0,0 @@ -""" -Usage: -python3 -m unittest test_ascend_memory_consumption.TestMemoryConsumptionAscend.test_memory_consumption -""" - -import os -import unittest - -import torch - -from sglang.srt.utils import kill_process_tree -from sglang.test.test_utils import ( - DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH, - DEFAULT_URL_FOR_TEST, - CustomTestCase, - popen_launch_server, -) - -if "ASCEND_RT_VISIBLE_DEVICES" not in os.environ: - os.environ["ASCEND_RT_VISIBLE_DEVICES"] = "0,1" -DEFAULT_PORT_FOR_SRT_TEST_RUNNER = ( - 8000 + int(os.environ.get("ASCEND_RT_VISIBLE_DEVICES", "0")[0]) * 100 -) -DEFAULT_URL_FOR_TEST = f"http://127.0.0.1:{DEFAULT_PORT_FOR_SRT_TEST_RUNNER + 1000}" - - -class TestMemoryConsumptionAscend(CustomTestCase): - - def test_memory_consumption(self): - - model = "nytopop/Qwen3-30B-A3B.w8a8" - base_url = DEFAULT_URL_FOR_TEST - - ### Calculate initial used memory - free_npu_memory, total_npu_memory = torch.npu.mem_get_info() - initial_used_memory = total_npu_memory - free_npu_memory - - process = popen_launch_server( - model, - base_url, - timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH, - other_args=[ - "--trust-remote-code", - "--device", - "npu", - "--attention-backend", - "ascend", - "--tp-size", - "2", - "--mem-fraction-static", - "0.8", - "--cuda-graph-bs", - "1", - "--max-total-tokens", - "1024", - "--disable-radix-cache", - "--disable-cuda-graph", - ], - ) - - ### Calculate initial used memory - free_npu_memory, total_npu_memory = torch.npu.mem_get_info() - used_memory_after_server_starting = ( - total_npu_memory - free_npu_memory - initial_used_memory - ) / (1 << 30) - self.assertLessEqual(float(used_memory_after_server_starting), 16.00) - - # Clean up everything - kill_process_tree(process.pid) - - -if __name__ == "__main__": - unittest.main() From 87d6963ce4e27c6fb8973fe3d5eec5f023bad6cb Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=90=D1=80=D1=82=D0=B5=D0=BC=20=D0=A1=D0=B0=D0=B2=D0=BA?= =?UTF-8?q?=D0=B8=D0=BD?= <58187114+OrangeRedeng@users.noreply.github.com> Date: Tue, 27 Jan 2026 14:33:21 +0300 Subject: [PATCH 29/36] =?UTF-8?q?Rename=20test=5Fascend=5Fmemory=5Fconsump?= =?UTF-8?q?tion.py=E2=80=8E?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- ..._ascend_memory_consumption.py\342\200\216" | 73 +++++++++++++++++++ 1 file changed, 73 insertions(+) create mode 100644 "test/srt/ascend/test_ascend_memory_consumption.py\342\200\216" diff --git "a/test/srt/ascend/test_ascend_memory_consumption.py\342\200\216" "b/test/srt/ascend/test_ascend_memory_consumption.py\342\200\216" new file mode 100644 index 000000000000..cbc290e965b4 --- /dev/null +++ "b/test/srt/ascend/test_ascend_memory_consumption.py\342\200\216" @@ -0,0 +1,73 @@ +""" +Usage: +python3 -m unittest test_ascend_memory_consumption.TestMemoryConsumptionAscend.test_memory_consumption +""" + +import os +import unittest + +import torch + +from sglang.srt.utils import kill_process_tree +from sglang.test.test_utils import ( + DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH, + DEFAULT_URL_FOR_TEST, + CustomTestCase, + popen_launch_server, +) + +if "ASCEND_RT_VISIBLE_DEVICES" not in os.environ: + os.environ["ASCEND_RT_VISIBLE_DEVICES"] = "0,1" +DEFAULT_PORT_FOR_SRT_TEST_RUNNER = ( + 8000 + int(os.environ.get("ASCEND_RT_VISIBLE_DEVICES", "0")[0]) * 100 +) +DEFAULT_URL_FOR_TEST = f"http://127.0.0.1:{DEFAULT_PORT_FOR_SRT_TEST_RUNNER + 1000}" + + +class TestMemoryConsumptionAscend(CustomTestCase): + + def test_memory_consumption(self): + + model = "nytopop/Qwen3-30B-A3B.w8a8" + base_url = DEFAULT_URL_FOR_TEST + + ### Calculate initial used memory + free_npu_memory, total_npu_memory = torch.npu.mem_get_info() + initial_used_memory = total_npu_memory - free_npu_memory + + process = popen_launch_server( + model, + base_url, + timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH, + other_args=[ + "--trust-remote-code", + "--device", + "npu", + "--attention-backend", + "ascend", + "--tp-size", + "2", + "--mem-fraction-static", + "0.8", + "--cuda-graph-bs", + "1", + "--max-total-tokens", + "1024", + "--disable-radix-cache", + "--disable-cuda-graph", + ], + ) + + ### Calculate initial used memory + free_npu_memory, total_npu_memory = torch.npu.mem_get_info() + used_memory_after_server_starting = ( + total_npu_memory - free_npu_memory - initial_used_memory + ) / (1 << 30) + self.assertLessEqual(float(used_memory_after_server_starting), 16.00) + + # Clean up everything + kill_process_tree(process.pid) + + +if __name__ == "__main__": + unittest.main() From 105e0632eb7196f7ded57a13bcc9a1b397159381 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=90=D1=80=D1=82=D0=B5=D0=BC=20=D0=A1=D0=B0=D0=B2=D0=BA?= =?UTF-8?q?=D0=B8=D0=BD?= <58187114+OrangeRedeng@users.noreply.github.com> Date: Tue, 27 Jan 2026 14:34:49 +0300 Subject: [PATCH 30/36] =?UTF-8?q?Delete=20test/srt/ascend/test=5Fascend=5F?= =?UTF-8?q?memory=5Fconsumption.py=E2=80=8E?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- ..._ascend_memory_consumption.py\342\200\216" | 73 ------------------- 1 file changed, 73 deletions(-) delete mode 100644 "test/srt/ascend/test_ascend_memory_consumption.py\342\200\216" diff --git "a/test/srt/ascend/test_ascend_memory_consumption.py\342\200\216" "b/test/srt/ascend/test_ascend_memory_consumption.py\342\200\216" deleted file mode 100644 index cbc290e965b4..000000000000 --- "a/test/srt/ascend/test_ascend_memory_consumption.py\342\200\216" +++ /dev/null @@ -1,73 +0,0 @@ -""" -Usage: -python3 -m unittest test_ascend_memory_consumption.TestMemoryConsumptionAscend.test_memory_consumption -""" - -import os -import unittest - -import torch - -from sglang.srt.utils import kill_process_tree -from sglang.test.test_utils import ( - DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH, - DEFAULT_URL_FOR_TEST, - CustomTestCase, - popen_launch_server, -) - -if "ASCEND_RT_VISIBLE_DEVICES" not in os.environ: - os.environ["ASCEND_RT_VISIBLE_DEVICES"] = "0,1" -DEFAULT_PORT_FOR_SRT_TEST_RUNNER = ( - 8000 + int(os.environ.get("ASCEND_RT_VISIBLE_DEVICES", "0")[0]) * 100 -) -DEFAULT_URL_FOR_TEST = f"http://127.0.0.1:{DEFAULT_PORT_FOR_SRT_TEST_RUNNER + 1000}" - - -class TestMemoryConsumptionAscend(CustomTestCase): - - def test_memory_consumption(self): - - model = "nytopop/Qwen3-30B-A3B.w8a8" - base_url = DEFAULT_URL_FOR_TEST - - ### Calculate initial used memory - free_npu_memory, total_npu_memory = torch.npu.mem_get_info() - initial_used_memory = total_npu_memory - free_npu_memory - - process = popen_launch_server( - model, - base_url, - timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH, - other_args=[ - "--trust-remote-code", - "--device", - "npu", - "--attention-backend", - "ascend", - "--tp-size", - "2", - "--mem-fraction-static", - "0.8", - "--cuda-graph-bs", - "1", - "--max-total-tokens", - "1024", - "--disable-radix-cache", - "--disable-cuda-graph", - ], - ) - - ### Calculate initial used memory - free_npu_memory, total_npu_memory = torch.npu.mem_get_info() - used_memory_after_server_starting = ( - total_npu_memory - free_npu_memory - initial_used_memory - ) / (1 << 30) - self.assertLessEqual(float(used_memory_after_server_starting), 16.00) - - # Clean up everything - kill_process_tree(process.pid) - - -if __name__ == "__main__": - unittest.main() From c501e4ed42f02fdb618658107206b2e32c00a704 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=90=D1=80=D1=82=D0=B5=D0=BC=20=D0=A1=D0=B0=D0=B2=D0=BA?= =?UTF-8?q?=D0=B8=D0=BD?= <58187114+OrangeRedeng@users.noreply.github.com> Date: Tue, 27 Jan 2026 14:35:24 +0300 Subject: [PATCH 31/36] Add test_ascend_memory_consumption.py --- .../ascend/test_ascend_memory_consumption.py | 73 +++++++++++++++++++ 1 file changed, 73 insertions(+) create mode 100644 test/srt/ascend/test_ascend_memory_consumption.py diff --git a/test/srt/ascend/test_ascend_memory_consumption.py b/test/srt/ascend/test_ascend_memory_consumption.py new file mode 100644 index 000000000000..cbc290e965b4 --- /dev/null +++ b/test/srt/ascend/test_ascend_memory_consumption.py @@ -0,0 +1,73 @@ +""" +Usage: +python3 -m unittest test_ascend_memory_consumption.TestMemoryConsumptionAscend.test_memory_consumption +""" + +import os +import unittest + +import torch + +from sglang.srt.utils import kill_process_tree +from sglang.test.test_utils import ( + DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH, + DEFAULT_URL_FOR_TEST, + CustomTestCase, + popen_launch_server, +) + +if "ASCEND_RT_VISIBLE_DEVICES" not in os.environ: + os.environ["ASCEND_RT_VISIBLE_DEVICES"] = "0,1" +DEFAULT_PORT_FOR_SRT_TEST_RUNNER = ( + 8000 + int(os.environ.get("ASCEND_RT_VISIBLE_DEVICES", "0")[0]) * 100 +) +DEFAULT_URL_FOR_TEST = f"http://127.0.0.1:{DEFAULT_PORT_FOR_SRT_TEST_RUNNER + 1000}" + + +class TestMemoryConsumptionAscend(CustomTestCase): + + def test_memory_consumption(self): + + model = "nytopop/Qwen3-30B-A3B.w8a8" + base_url = DEFAULT_URL_FOR_TEST + + ### Calculate initial used memory + free_npu_memory, total_npu_memory = torch.npu.mem_get_info() + initial_used_memory = total_npu_memory - free_npu_memory + + process = popen_launch_server( + model, + base_url, + timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH, + other_args=[ + "--trust-remote-code", + "--device", + "npu", + "--attention-backend", + "ascend", + "--tp-size", + "2", + "--mem-fraction-static", + "0.8", + "--cuda-graph-bs", + "1", + "--max-total-tokens", + "1024", + "--disable-radix-cache", + "--disable-cuda-graph", + ], + ) + + ### Calculate initial used memory + free_npu_memory, total_npu_memory = torch.npu.mem_get_info() + used_memory_after_server_starting = ( + total_npu_memory - free_npu_memory - initial_used_memory + ) / (1 << 30) + self.assertLessEqual(float(used_memory_after_server_starting), 16.00) + + # Clean up everything + kill_process_tree(process.pid) + + +if __name__ == "__main__": + unittest.main() From 4824eb47c1dfedab06acdd45b1be3c3702543e9a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=90=D1=80=D1=82=D0=B5=D0=BC=20=D0=A1=D0=B0=D0=B2=D0=BA?= =?UTF-8?q?=D0=B8=D0=BD?= <58187114+OrangeRedeng@users.noreply.github.com> Date: Tue, 27 Jan 2026 14:47:59 +0300 Subject: [PATCH 32/36] Update run_suite.py --- test/srt/run_suite.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/srt/run_suite.py b/test/srt/run_suite.py index 1b6f6adae85a..0e8a86004bdb 100644 --- a/test/srt/run_suite.py +++ b/test/srt/run_suite.py @@ -143,7 +143,7 @@ ], "per-commit-2-npu-a2": [ TestFile("ascend/test_ascend_graph_tp2_bf16.py", 400), - TestFile("ascend/test_ascend_memory_consumption.py‎", 400), + TestFile("ascend/test_ascend_memory_consumption.py", 400), TestFile("ascend/test_ascend_mla_fia_w8a8int8.py", 400), TestFile("ascend/test_ascend_tp2_bf16.py", 400), TestFile("ascend/test_ascend_tp2_fia_bf16.py", 400), From f398506735c55d0c9d8eb58fe027a534796f9b28 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=90=D1=80=D1=82=D0=B5=D0=BC=20=D0=A1=D0=B0=D0=B2=D0=BA?= =?UTF-8?q?=D0=B8=D0=BD?= <58187114+OrangeRedeng@users.noreply.github.com> Date: Tue, 27 Jan 2026 15:29:56 +0300 Subject: [PATCH 33/36] Move test to test/registered --- test/srt/run_suite.py | 1 - 1 file changed, 1 deletion(-) diff --git a/test/srt/run_suite.py b/test/srt/run_suite.py index 0e8a86004bdb..ce724a04cdcc 100644 --- a/test/srt/run_suite.py +++ b/test/srt/run_suite.py @@ -143,7 +143,6 @@ ], "per-commit-2-npu-a2": [ TestFile("ascend/test_ascend_graph_tp2_bf16.py", 400), - TestFile("ascend/test_ascend_memory_consumption.py", 400), TestFile("ascend/test_ascend_mla_fia_w8a8int8.py", 400), TestFile("ascend/test_ascend_tp2_bf16.py", 400), TestFile("ascend/test_ascend_tp2_fia_bf16.py", 400), From bacb1ee42abd6b652cccb58a987f46be6fb026a9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=90=D1=80=D1=82=D0=B5=D0=BC=20=D0=A1=D0=B0=D0=B2=D0=BA?= =?UTF-8?q?=D0=B8=D0=BD?= <58187114+OrangeRedeng@users.noreply.github.com> Date: Tue, 27 Jan 2026 15:33:14 +0300 Subject: [PATCH 34/36] Move test to test/registered --- .../ascend/test_ascend_memory_consumption.py | 76 +++++++++++++++++++ 1 file changed, 76 insertions(+) create mode 100644 test/registered/ascend/test_ascend_memory_consumption.py diff --git a/test/registered/ascend/test_ascend_memory_consumption.py b/test/registered/ascend/test_ascend_memory_consumption.py new file mode 100644 index 000000000000..1f1a46d6dda3 --- /dev/null +++ b/test/registered/ascend/test_ascend_memory_consumption.py @@ -0,0 +1,76 @@ +""" +Usage: +python3 -m unittest test_ascend_memory_consumption.TestMemoryConsumptionAscend.test_memory_consumption +""" + +import os +import unittest + +import torch + +from sglang.test.ci.ci_register import register_npu_ci +from sglang.srt.utils import kill_process_tree +from sglang.test.test_utils import ( + DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH, + DEFAULT_URL_FOR_TEST, + CustomTestCase, + popen_launch_server, +) + +register_npu_ci(est_time=400, suite="nightly-1-npu-a3", nightly=True) + +if "ASCEND_RT_VISIBLE_DEVICES" not in os.environ: + os.environ["ASCEND_RT_VISIBLE_DEVICES"] = "0,1" +DEFAULT_PORT_FOR_SRT_TEST_RUNNER = ( + 8000 + int(os.environ.get("ASCEND_RT_VISIBLE_DEVICES", "0")[0]) * 100 +) +DEFAULT_URL_FOR_TEST = f"http://127.0.0.1:{DEFAULT_PORT_FOR_SRT_TEST_RUNNER + 1000}" + + +class TestMemoryConsumptionAscend(CustomTestCase): + + def test_memory_consumption(self): + + model = "nytopop/Qwen3-30B-A3B.w8a8" + base_url = DEFAULT_URL_FOR_TEST + + ### Calculate initial used memory + free_npu_memory, total_npu_memory = torch.npu.mem_get_info() + initial_used_memory = total_npu_memory - free_npu_memory + + process = popen_launch_server( + model, + base_url, + timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH, + other_args=[ + "--trust-remote-code", + "--device", + "npu", + "--attention-backend", + "ascend", + "--tp-size", + "2", + "--mem-fraction-static", + "0.8", + "--cuda-graph-bs", + "1", + "--max-total-tokens", + "1024", + "--disable-radix-cache", + "--disable-cuda-graph", + ], + ) + + ### Calculate initial used memory + free_npu_memory, total_npu_memory = torch.npu.mem_get_info() + used_memory_after_server_starting = ( + total_npu_memory - free_npu_memory - initial_used_memory + ) / (1 << 30) + self.assertLessEqual(float(used_memory_after_server_starting), 16.00) + + # Clean up everything + kill_process_tree(process.pid) + + +if __name__ == "__main__": + unittest.main() From 5929c9b3874b7e367463f1002c78e43080838635 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=90=D1=80=D1=82=D0=B5=D0=BC=20=D0=A1=D0=B0=D0=B2=D0=BA?= =?UTF-8?q?=D0=B8=D0=BD?= <58187114+OrangeRedeng@users.noreply.github.com> Date: Tue, 27 Jan 2026 15:33:34 +0300 Subject: [PATCH 35/36] Delete test/srt/ascend/test_ascend_memory_consumption.py --- .../ascend/test_ascend_memory_consumption.py | 73 ------------------- 1 file changed, 73 deletions(-) delete mode 100644 test/srt/ascend/test_ascend_memory_consumption.py diff --git a/test/srt/ascend/test_ascend_memory_consumption.py b/test/srt/ascend/test_ascend_memory_consumption.py deleted file mode 100644 index cbc290e965b4..000000000000 --- a/test/srt/ascend/test_ascend_memory_consumption.py +++ /dev/null @@ -1,73 +0,0 @@ -""" -Usage: -python3 -m unittest test_ascend_memory_consumption.TestMemoryConsumptionAscend.test_memory_consumption -""" - -import os -import unittest - -import torch - -from sglang.srt.utils import kill_process_tree -from sglang.test.test_utils import ( - DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH, - DEFAULT_URL_FOR_TEST, - CustomTestCase, - popen_launch_server, -) - -if "ASCEND_RT_VISIBLE_DEVICES" not in os.environ: - os.environ["ASCEND_RT_VISIBLE_DEVICES"] = "0,1" -DEFAULT_PORT_FOR_SRT_TEST_RUNNER = ( - 8000 + int(os.environ.get("ASCEND_RT_VISIBLE_DEVICES", "0")[0]) * 100 -) -DEFAULT_URL_FOR_TEST = f"http://127.0.0.1:{DEFAULT_PORT_FOR_SRT_TEST_RUNNER + 1000}" - - -class TestMemoryConsumptionAscend(CustomTestCase): - - def test_memory_consumption(self): - - model = "nytopop/Qwen3-30B-A3B.w8a8" - base_url = DEFAULT_URL_FOR_TEST - - ### Calculate initial used memory - free_npu_memory, total_npu_memory = torch.npu.mem_get_info() - initial_used_memory = total_npu_memory - free_npu_memory - - process = popen_launch_server( - model, - base_url, - timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH, - other_args=[ - "--trust-remote-code", - "--device", - "npu", - "--attention-backend", - "ascend", - "--tp-size", - "2", - "--mem-fraction-static", - "0.8", - "--cuda-graph-bs", - "1", - "--max-total-tokens", - "1024", - "--disable-radix-cache", - "--disable-cuda-graph", - ], - ) - - ### Calculate initial used memory - free_npu_memory, total_npu_memory = torch.npu.mem_get_info() - used_memory_after_server_starting = ( - total_npu_memory - free_npu_memory - initial_used_memory - ) / (1 << 30) - self.assertLessEqual(float(used_memory_after_server_starting), 16.00) - - # Clean up everything - kill_process_tree(process.pid) - - -if __name__ == "__main__": - unittest.main() From 8fbe3a1f1720deb322fe5374729ca963578bc0a0 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=90=D1=80=D1=82=D0=B5=D0=BC=20=D0=A1=D0=B0=D0=B2=D0=BA?= =?UTF-8?q?=D0=B8=D0=BD?= <58187114+OrangeRedeng@users.noreply.github.com> Date: Tue, 27 Jan 2026 15:51:08 +0300 Subject: [PATCH 36/36] Fix lint issue --- test/registered/ascend/test_ascend_memory_consumption.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/registered/ascend/test_ascend_memory_consumption.py b/test/registered/ascend/test_ascend_memory_consumption.py index 1f1a46d6dda3..2e6b09524476 100644 --- a/test/registered/ascend/test_ascend_memory_consumption.py +++ b/test/registered/ascend/test_ascend_memory_consumption.py @@ -8,8 +8,8 @@ import torch -from sglang.test.ci.ci_register import register_npu_ci from sglang.srt.utils import kill_process_tree +from sglang.test.ci.ci_register import register_npu_ci from sglang.test.test_utils import ( DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH, DEFAULT_URL_FOR_TEST,