From 711d21741cd3a8085e80f68c72e38301a6ac2009 Mon Sep 17 00:00:00 2001
From: Edward Shogulin <shogulin.eduard@huawei.com>
Date: Thu, 20 Nov 2025 11:14:17 +0000
Subject: [PATCH 01/36] Qwen3 MOE quick fix

---
 python/sglang/srt/models/qwen3_moe.py | 17 ++++++++++-------
 1 file changed, 10 insertions(+), 7 deletions(-)

diff --git a/python/sglang/srt/models/qwen3_moe.py b/python/sglang/srt/models/qwen3_moe.py
index 9388b974a33b..be655062f6c3 100644
--- a/python/sglang/srt/models/qwen3_moe.py
+++ b/python/sglang/srt/models/qwen3_moe.py
@@ -68,6 +68,7 @@
 )
 from sglang.srt.server_args import get_global_server_args
 from sglang.srt.utils import (
+    LazyValue,
     add_prefix,
     is_cuda,
     is_flashinfer_available,
@@ -1128,14 +1129,16 @@ def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]):
                     else:
                         logger.warning(f"Parameter {name} not found in params_dict")
 
-        # TODO mimic deepseek
-        # Lazy initialization of expert weights cache to avoid slowing down load_weights
         if not hasattr(self, "routed_experts_weights_of_layer"):
-            self.routed_experts_weights_of_layer = {
-                layer_id: self.model.layers[layer_id].mlp.get_moe_weights()
-                for layer_id in range(self.start_layer, self.end_layer)
-                if isinstance(self.model.layers[layer_id].mlp, Qwen3MoeSparseMoeBlock)
-            }
+            self.routed_experts_weights_of_layer = LazyValue(
+                lambda: {
+                    layer_id: self.model.layers[layer_id].mlp.get_moe_weights()
+                    for layer_id in range(self.start_layer, self.end_layer)
+                    if isinstance(
+                        self.model.layers[layer_id].mlp, Qwen3MoeSparseMoeBlock
+                    )
+                }
+            )
 
     @classmethod
     def get_model_config_for_expert_location(cls, config):

From 2b24ec3e09240da3d93cbc30bddf6c03c421356c Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=D0=90=D1=80=D1=82=D0=B5=D0=BC=20=D0=A1=D0=B0=D0=B2=D0=BA?=
 =?UTF-8?q?=D0=B8=D0=BD?= <58187114+OrangeRedeng@users.noreply.github.com>
Date: Fri, 26 Dec 2025 18:08:46 +0300
Subject: [PATCH 02/36] Add nz support for MOE

---
 python/sglang/srt/layers/quantization/unquant.py | 14 +++++++++++++-
 1 file changed, 13 insertions(+), 1 deletion(-)

diff --git a/python/sglang/srt/layers/quantization/unquant.py b/python/sglang/srt/layers/quantization/unquant.py
index 630b600687b4..9509dce821c2 100644
--- a/python/sglang/srt/layers/quantization/unquant.py
+++ b/python/sglang/srt/layers/quantization/unquant.py
@@ -25,6 +25,7 @@
     get_bool_env_var,
     is_cpu,
     is_hip,
+    is_npu,
     next_power_of_2,
     set_weight_attrs,
     use_intel_amx_backend,
@@ -40,6 +41,7 @@
 _is_cpu_amx_available = cpu_has_amx_support()
 _is_hip = is_hip()
 _is_cpu = is_cpu()
+_is_npu = is_npu()
 _use_aiter = get_bool_env_var("SGLANG_USE_AITER") and _is_hip
 
 if _use_aiter:
@@ -47,12 +49,15 @@
     from aiter.fused_moe import fused_moe
     from aiter.ops.shuffle import shuffle_weight
 
+if _is_npu:
+    import torch_npu
+    NPU_FORMAT_FRACTAL_NZ = 29
+
 try:
     from flashinfer.fused_moe import cutlass_fused_moe as flashinfer_cutlass_fused_moe
 except ImportError:
     flashinfer_cutlass_fused_moe = None
 
-
 class UnquantizedEmbeddingMethod(QuantizeMethodBase):
     """Unquantized method for embeddings."""
 
@@ -296,6 +301,13 @@ def process_weights_after_loading(self, layer: torch.nn.Module) -> None:
                 layer.num_local_experts, *new_shape_w2
             )
 
+        if _is_npu:
+            for weight_name in ["w13_weight", "w2_weight"]:
+                weight = getattr(layer, weight_name)
+                weight.data = torch_npu.npu_format_cast(
+                    weight.data, NPU_FORMAT_FRACTAL_NZ
+                )
+
         return
 
     def create_moe_runner(

From 2ec286abba8d7b3158bba05592e923d3fdd4aded Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=D0=90=D1=80=D1=82=D0=B5=D0=BC=20=D0=A1=D0=B0=D0=B2=D0=BA?=
 =?UTF-8?q?=D0=B8=D0=BD?= <58187114+OrangeRedeng@users.noreply.github.com>
Date: Mon, 29 Dec 2025 16:48:16 +0300
Subject: [PATCH 03/36] Update python/sglang/srt/layers/quantization/unquant.py

Co-authored-by: gemini-code-assist[bot] <176961590+gemini-code-assist[bot]@users.noreply.github.com>
---
 python/sglang/srt/layers/quantization/unquant.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/python/sglang/srt/layers/quantization/unquant.py b/python/sglang/srt/layers/quantization/unquant.py
index 9509dce821c2..1789b6dc9c9f 100644
--- a/python/sglang/srt/layers/quantization/unquant.py
+++ b/python/sglang/srt/layers/quantization/unquant.py
@@ -302,10 +302,11 @@ def process_weights_after_loading(self, layer: torch.nn.Module) -> None:
             )
 
         if _is_npu:
+            from sglang.srt.hardware_backend.npu.utils import NPUACLFormat
             for weight_name in ["w13_weight", "w2_weight"]:
                 weight = getattr(layer, weight_name)
                 weight.data = torch_npu.npu_format_cast(
-                    weight.data, NPU_FORMAT_FRACTAL_NZ
+                    weight.data, NPUACLFormat.ACL_FORMAT_FRACTAL_NZ
                 )
 
         return

From d9a3818d0b9acc4fa321bc38184f4b171acbe7e2 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=D0=90=D1=80=D1=82=D0=B5=D0=BC=20=D0=A1=D0=B0=D0=B2=D0=BA?=
 =?UTF-8?q?=D0=B8=D0=BD?= <58187114+OrangeRedeng@users.noreply.github.com>
Date: Mon, 29 Dec 2025 16:49:09 +0300
Subject: [PATCH 04/36] Update unquant.py

---
 python/sglang/srt/layers/quantization/unquant.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/python/sglang/srt/layers/quantization/unquant.py b/python/sglang/srt/layers/quantization/unquant.py
index 1789b6dc9c9f..197d0b3296ca 100644
--- a/python/sglang/srt/layers/quantization/unquant.py
+++ b/python/sglang/srt/layers/quantization/unquant.py
@@ -51,7 +51,6 @@
 
 if _is_npu:
     import torch_npu
-    NPU_FORMAT_FRACTAL_NZ = 29
 
 try:
     from flashinfer.fused_moe import cutlass_fused_moe as flashinfer_cutlass_fused_moe

From 1ad1ca16868873f768895775326def3d8f9ceba4 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=D0=90=D1=80=D1=82=D0=B5=D0=BC=20=D0=A1=D0=B0=D0=B2=D0=BA?=
 =?UTF-8?q?=D0=B8=D0=BD?= <58187114+OrangeRedeng@users.noreply.github.com>
Date: Mon, 29 Dec 2025 16:49:47 +0300
Subject: [PATCH 05/36] Update unquant.py

---
 python/sglang/srt/layers/quantization/unquant.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/python/sglang/srt/layers/quantization/unquant.py b/python/sglang/srt/layers/quantization/unquant.py
index 197d0b3296ca..bd21d2c34d8c 100644
--- a/python/sglang/srt/layers/quantization/unquant.py
+++ b/python/sglang/srt/layers/quantization/unquant.py
@@ -51,6 +51,7 @@
 
 if _is_npu:
     import torch_npu
+    from sglang.srt.hardware_backend.npu.utils import NPUACLFormat
 
 try:
     from flashinfer.fused_moe import cutlass_fused_moe as flashinfer_cutlass_fused_moe
@@ -301,7 +302,6 @@ def process_weights_after_loading(self, layer: torch.nn.Module) -> None:
             )
 
         if _is_npu:
-            from sglang.srt.hardware_backend.npu.utils import NPUACLFormat
             for weight_name in ["w13_weight", "w2_weight"]:
                 weight = getattr(layer, weight_name)
                 weight.data = torch_npu.npu_format_cast(

From e5484c94e4ce3acf34626aaa6dd6a59b793e6476 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=D0=90=D1=80=D1=82=D0=B5=D0=BC=20=D0=A1=D0=B0=D0=B2=D0=BA?=
 =?UTF-8?q?=D0=B8=D0=BD?= <58187114+OrangeRedeng@users.noreply.github.com>
Date: Mon, 29 Dec 2025 16:53:13 +0300
Subject: [PATCH 06/36] Fix lint issue

---
 python/sglang/srt/layers/quantization/unquant.py | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/python/sglang/srt/layers/quantization/unquant.py b/python/sglang/srt/layers/quantization/unquant.py
index bd21d2c34d8c..b9df37c0ed67 100644
--- a/python/sglang/srt/layers/quantization/unquant.py
+++ b/python/sglang/srt/layers/quantization/unquant.py
@@ -51,6 +51,7 @@
 
 if _is_npu:
     import torch_npu
+
     from sglang.srt.hardware_backend.npu.utils import NPUACLFormat
 
 try:
@@ -58,6 +59,7 @@
 except ImportError:
     flashinfer_cutlass_fused_moe = None
 
+
 class UnquantizedEmbeddingMethod(QuantizeMethodBase):
     """Unquantized method for embeddings."""
 

From 25a0e56560e3de8980b55bac812d32837f5cc678 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=D0=90=D1=80=D1=82=D0=B5=D0=BC=20=D0=A1=D0=B0=D0=B2=D0=BA?=
 =?UTF-8?q?=D0=B8=D0=BD?= <58187114+OrangeRedeng@users.noreply.github.com>
Date: Tue, 13 Jan 2026 12:47:07 +0300
Subject: [PATCH 07/36] Remove unused env variable ENABLE_ASCEND_MOE_NZ
 from ascend_npu_qwen3_examples.md

---
 docs/platforms/ascend_npu_qwen3_examples.md | 2 --
 1 file changed, 2 deletions(-)

diff --git a/docs/platforms/ascend_npu_qwen3_examples.md b/docs/platforms/ascend_npu_qwen3_examples.md
index 958ad8c97398..5278a22a1001 100644
--- a/docs/platforms/ascend_npu_qwen3_examples.md
+++ b/docs/platforms/ascend_npu_qwen3_examples.md
@@ -62,7 +62,6 @@ export HCCL_BUFFSIZE=1536
 export HCCL_OP_EXPANSION_MODE=AIV
 export SGLANG_DEEPEP_NUM_MAX_DISPATCH_TOKENS_PER_RANK=32
 export SGLANG_DEEPEP_BF16_DISPATCH=1
-export ENABLE_ASCEND_MOE_NZ=1
 
 python -m sglang.launch_server \
    --device npu \
@@ -84,7 +83,6 @@ export STREAMS_PER_DEVICE=32
 export HCCL_BUFFSIZE=1536
 export SGLANG_DEEPEP_NUM_MAX_DISPATCH_TOKENS_PER_RANK=32
 export SGLANG_DEEPEP_BF16_DISPATCH=1
-export ENABLE_ASCEND_MOE_NZ=1
 
 python -m sglang.launch_server \
    --model-path Qwen/Qwen3-235B-A22B-Instruct-2507 \

From 61830a2a041c00aab8f2fe7c2b450faab063d5fb Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=D0=90=D1=80=D1=82=D0=B5=D0=BC=20=D0=A1=D0=B0=D0=B2=D0=BA?=
 =?UTF-8?q?=D0=B8=D0=BD?= <58187114+OrangeRedeng@users.noreply.github.com>
Date: Tue, 13 Jan 2026 12:48:53 +0300
Subject: [PATCH 08/36] Remove unused env variable ENABLE_MOE_NZ from
 ascend_npu_deepseek_example.md

---
 docs/platforms/ascend_npu_deepseek_example.md | 2 --
 1 file changed, 2 deletions(-)

diff --git a/docs/platforms/ascend_npu_deepseek_example.md b/docs/platforms/ascend_npu_deepseek_example.md
index d75b942704b2..ad9d4bd7a078 100644
--- a/docs/platforms/ascend_npu_deepseek_example.md
+++ b/docs/platforms/ascend_npu_deepseek_example.md
@@ -22,7 +22,6 @@ export SGLANG_ENABLE_OVERLAP_PLAN_STREAM=1
 #npu acceleration operator
 export SGLANG_NPU_USE_MLAPO=1
 export SGLANG_USE_FIA_NZ=1
-export ENABLE_MOE_NZ=1
 
 python3 -m sglang.launch_server \
     --model-path ${MODEL_PATH} \
@@ -71,7 +70,6 @@ export HCCL_BUFFSIZE=1536
 #npu acceleration operator
 export SGLANG_NPU_USE_MLAPO=1
 export SGLANG_USE_FIA_NZ=1
-export ENABLE_MOE_NZ=1
 export TASK_QUEUE_ENABLE=2
 
 python -m sglang.launch_server \

From f586b40829e0f10f817a57e1d81167c2620d1732 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=D0=90=D1=80=D1=82=D0=B5=D0=BC=20=D0=A1=D0=B0=D0=B2=D0=BA?=
 =?UTF-8?q?=D0=B8=D0=BD?= <58187114+OrangeRedeng@users.noreply.github.com>
Date: Tue, 13 Jan 2026 13:04:55 +0300
Subject: [PATCH 09/36] Update NZ conversion

---
 python/sglang/srt/layers/moe/ep_moe/layer.py | 15 ++++-----------
 1 file changed, 4 insertions(+), 11 deletions(-)

diff --git a/python/sglang/srt/layers/moe/ep_moe/layer.py b/python/sglang/srt/layers/moe/ep_moe/layer.py
index 53c25b069ffb..316ae0ac669d 100644
--- a/python/sglang/srt/layers/moe/ep_moe/layer.py
+++ b/python/sglang/srt/layers/moe/ep_moe/layer.py
@@ -31,6 +31,7 @@
 from sglang.srt.layers.quantization.fp8_kernel import is_fp8_fnuz
 from sglang.srt.layers.quantization.w4afp8 import W4AFp8Config, W4AFp8MoEMethod
 from sglang.srt.utils import get_bool_env_var, is_hip, is_npu
+from sglang.srt.hardware_backend.npu.utils import npu_format_cast
 
 if TYPE_CHECKING:
     from sglang.srt.layers.moe.token_dispatcher import (
@@ -472,13 +473,6 @@ def forward(
             gmm2_weight_scale=self.w2_weight_scale,
         ).hidden_state
 
-    def release_weight_cache(self, weight: torch.Tensor):
-        # .contiguous() introduces additional memory overhead and needs to be released using resize_(0)
-        origin_weight = weight.data.transpose(1, 2)
-        new_weight = origin_weight.contiguous()
-        origin_weight.untyped_storage().resize_(0)
-        return new_weight
-
     def permute_w13_weight_scale(self, w: torch.Tensor, tile_n: int):
         if tile_n % 2 != 0:
             raise ValueError(f"tile_n must be even, got {tile_n}")
@@ -520,14 +514,13 @@ def reshape_w13_weight(self, weight: torch.Tensor, dim: int, chunk_size: int = 6
         return weight.view(*original_shape[:dim], -1, *original_shape[dim + 1 :])
 
     def _process_weights_after_loading(self, layer: torch.nn.Module) -> None:
-        w13 = self.release_weight_cache(layer.w13_weight)
-        torch_npu.npu_format_cast_(w13, 2)
+        w13 = layer.w13_weight
         cpu_w13 = w13.cpu()
         w13 = self.reshape_w13_weight(cpu_w13, -1).npu()
-        torch_npu.npu_format_cast_(w13, 29)
+        w13 = npu_format_cast(w13)
         layer.w13_weight = torch.nn.Parameter(w13, requires_grad=False)
 
-        w2 = torch_npu.npu_format_cast(layer.w2_weight.data, 29)
+        w2 = npu_format_cast(w2)
         layer.w2_weight = torch.nn.Parameter(w2, requires_grad=False)
 
         w13_scale = layer.w13_weight_scale.data.squeeze(-1).contiguous()

From 4d38ade472e907148d60084c00e32ab29d9c301a Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=D0=90=D1=80=D1=82=D0=B5=D0=BC=20=D0=A1=D0=B0=D0=B2=D0=BA?=
 =?UTF-8?q?=D0=B8=D0=BD?= <58187114+OrangeRedeng@users.noreply.github.com>
Date: Tue, 13 Jan 2026 13:30:19 +0300
Subject: [PATCH 10/36] Remove unnecessary function

---
 .../npu/quantization/fused_moe_method_npu.py        | 13 -------------
 1 file changed, 13 deletions(-)

diff --git a/python/sglang/srt/hardware_backend/npu/quantization/fused_moe_method_npu.py b/python/sglang/srt/hardware_backend/npu/quantization/fused_moe_method_npu.py
index e1b2f6e2b378..d16ea13fe402 100644
--- a/python/sglang/srt/hardware_backend/npu/quantization/fused_moe_method_npu.py
+++ b/python/sglang/srt/hardware_backend/npu/quantization/fused_moe_method_npu.py
@@ -212,20 +212,7 @@ def create_weights(
         layer.register_parameter("w2_weight_offset", w2_weight_offset)
         set_weight_attrs(w2_weight_offset, extra_weight_attrs)
 
-    def release_weight_cache(self, weight: torch.Tensor):
-        # .contiguous() introduces additional memory overhead and needs to be released using resize_(0)
-        origin_weight = weight.data.transpose(1, 2)
-        new_weight = origin_weight.contiguous()
-        origin_weight.untyped_storage().resize_(0)
-        return new_weight
-
     def process_weights_after_loading(self, layer: torch.nn.Module) -> None:
-        weight_data = self.release_weight_cache(layer.w13_weight.data)
-        layer.w13_weight = torch.nn.Parameter(weight_data, requires_grad=False)
-
-        weight_data = self.release_weight_cache(layer.w2_weight.data)
-        layer.w2_weight = torch.nn.Parameter(weight_data, requires_grad=False)
-
         layer.w13_weight_scale = torch.nn.Parameter(
             layer.w13_weight_scale.data.squeeze(-1).contiguous().to(torch.float32),
             requires_grad=False,

From 2f4608d69197b8b3add9d82170a72e322cc62b07 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=D0=90=D1=80=D1=82=D0=B5=D0=BC=20=D0=A1=D0=B0=D0=B2=D0=BA?=
 =?UTF-8?q?=D0=B8=D0=BD?= <58187114+OrangeRedeng@users.noreply.github.com>
Date: Tue, 13 Jan 2026 15:19:44 +0300
Subject: [PATCH 11/36] Update layer.py

---
 python/sglang/srt/layers/moe/ep_moe/layer.py | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

diff --git a/python/sglang/srt/layers/moe/ep_moe/layer.py b/python/sglang/srt/layers/moe/ep_moe/layer.py
index 316ae0ac669d..ce7e15f6900d 100644
--- a/python/sglang/srt/layers/moe/ep_moe/layer.py
+++ b/python/sglang/srt/layers/moe/ep_moe/layer.py
@@ -514,13 +514,12 @@ def reshape_w13_weight(self, weight: torch.Tensor, dim: int, chunk_size: int = 6
         return weight.view(*original_shape[:dim], -1, *original_shape[dim + 1 :])
 
     def _process_weights_after_loading(self, layer: torch.nn.Module) -> None:
-        w13 = layer.w13_weight
-        cpu_w13 = w13.cpu()
+        cpu_w13 = layer.w13_weight.cpu()
         w13 = self.reshape_w13_weight(cpu_w13, -1).npu()
         w13 = npu_format_cast(w13)
         layer.w13_weight = torch.nn.Parameter(w13, requires_grad=False)
 
-        w2 = npu_format_cast(w2)
+        w2 = npu_format_cast(layer.w2_weight.data)
         layer.w2_weight = torch.nn.Parameter(w2, requires_grad=False)
 
         w13_scale = layer.w13_weight_scale.data.squeeze(-1).contiguous()

From 36992882163a28553f4a54622e89a33679536f83 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=D0=90=D1=80=D1=82=D0=B5=D0=BC=20=D0=A1=D0=B0=D0=B2=D0=BA?=
 =?UTF-8?q?=D0=B8=D0=BD?= <58187114+OrangeRedeng@users.noreply.github.com>
Date: Tue, 13 Jan 2026 15:23:09 +0300
Subject: [PATCH 12/36] Update unquant.py

---
 python/sglang/srt/layers/quantization/unquant.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/python/sglang/srt/layers/quantization/unquant.py b/python/sglang/srt/layers/quantization/unquant.py
index c980bacaa351..c3637364e8c2 100644
--- a/python/sglang/srt/layers/quantization/unquant.py
+++ b/python/sglang/srt/layers/quantization/unquant.py
@@ -52,7 +52,7 @@
 if _is_npu:
     import torch_npu
 
-    from sglang.srt.hardware_backend.npu.utils import NPUACLFormat
+    from sglang.srt.hardware_backend.npu.utils import npu_format_cast
 
 try:
     from flashinfer.fused_moe import cutlass_fused_moe as flashinfer_cutlass_fused_moe
@@ -306,8 +306,8 @@ def process_weights_after_loading(self, layer: torch.nn.Module) -> None:
         if _is_npu:
             for weight_name in ["w13_weight", "w2_weight"]:
                 weight = getattr(layer, weight_name)
-                weight.data = torch_npu.npu_format_cast(
-                    weight.data, NPUACLFormat.ACL_FORMAT_FRACTAL_NZ
+                weight.data = npu_format_cast(
+                    weight.data,
                 )
 
         return

From 3092b3148eb005f76ffa832ad1d5208c8a6d01f3 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=D0=90=D1=80=D1=82=D0=B5=D0=BC=20=D0=A1=D0=B0=D0=B2=D0=BA?=
 =?UTF-8?q?=D0=B8=D0=BD?= <58187114+OrangeRedeng@users.noreply.github.com>
Date: Tue, 13 Jan 2026 16:57:16 +0300
Subject: [PATCH 13/36] Update layer.py

---
 python/sglang/srt/layers/moe/ep_moe/layer.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/python/sglang/srt/layers/moe/ep_moe/layer.py b/python/sglang/srt/layers/moe/ep_moe/layer.py
index ce7e15f6900d..13687dc4646f 100644
--- a/python/sglang/srt/layers/moe/ep_moe/layer.py
+++ b/python/sglang/srt/layers/moe/ep_moe/layer.py
@@ -514,7 +514,7 @@ def reshape_w13_weight(self, weight: torch.Tensor, dim: int, chunk_size: int = 6
         return weight.view(*original_shape[:dim], -1, *original_shape[dim + 1 :])
 
     def _process_weights_after_loading(self, layer: torch.nn.Module) -> None:
-        cpu_w13 = layer.w13_weight.cpu()
+        cpu_w13 = layer.w13_weight.data.transpose(1, 2).contiguous().cpu()
         w13 = self.reshape_w13_weight(cpu_w13, -1).npu()
         w13 = npu_format_cast(w13)
         layer.w13_weight = torch.nn.Parameter(w13, requires_grad=False)

From fe2aed78234288e086eb22314d1362a3737b0197 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=D0=90=D1=80=D1=82=D0=B5=D0=BC=20=D0=A1=D0=B0=D0=B2=D0=BA?=
 =?UTF-8?q?=D0=B8=D0=BD?= <58187114+OrangeRedeng@users.noreply.github.com>
Date: Tue, 13 Jan 2026 18:31:18 +0300
Subject: [PATCH 14/36] Update layer.py

---
 python/sglang/srt/layers/moe/ep_moe/layer.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/python/sglang/srt/layers/moe/ep_moe/layer.py b/python/sglang/srt/layers/moe/ep_moe/layer.py
index 13687dc4646f..23700047dc2e 100644
--- a/python/sglang/srt/layers/moe/ep_moe/layer.py
+++ b/python/sglang/srt/layers/moe/ep_moe/layer.py
@@ -514,12 +514,12 @@ def reshape_w13_weight(self, weight: torch.Tensor, dim: int, chunk_size: int = 6
         return weight.view(*original_shape[:dim], -1, *original_shape[dim + 1 :])
 
     def _process_weights_after_loading(self, layer: torch.nn.Module) -> None:
-        cpu_w13 = layer.w13_weight.data.transpose(1, 2).contiguous().cpu()
+        cpu_w13 = layer.w13_weight.transpose(1, 2).cpu()
         w13 = self.reshape_w13_weight(cpu_w13, -1).npu()
         w13 = npu_format_cast(w13)
         layer.w13_weight = torch.nn.Parameter(w13, requires_grad=False)
 
-        w2 = npu_format_cast(layer.w2_weight.data)
+        w2 = npu_format_cast(layer.w2_weight)
         layer.w2_weight = torch.nn.Parameter(w2, requires_grad=False)
 
         w13_scale = layer.w13_weight_scale.data.squeeze(-1).contiguous()

From 1054c9d634e1543f13be3050cf6019c04e440e36 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=D0=90=D1=80=D1=82=D0=B5=D0=BC=20=D0=A1=D0=B0=D0=B2=D0=BA?=
 =?UTF-8?q?=D0=B8=D0=BD?= <58187114+OrangeRedeng@users.noreply.github.com>
Date: Tue, 13 Jan 2026 19:03:36 +0300
Subject: [PATCH 15/36] Update fused_moe_method_npu.py

---
 .../hardware_backend/npu/quantization/fused_moe_method_npu.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/python/sglang/srt/hardware_backend/npu/quantization/fused_moe_method_npu.py b/python/sglang/srt/hardware_backend/npu/quantization/fused_moe_method_npu.py
index d16ea13fe402..a9b6584796fd 100644
--- a/python/sglang/srt/hardware_backend/npu/quantization/fused_moe_method_npu.py
+++ b/python/sglang/srt/hardware_backend/npu/quantization/fused_moe_method_npu.py
@@ -227,8 +227,8 @@ def process_weights_after_loading(self, layer: torch.nn.Module) -> None:
             layer.w2_weight_offset.data.squeeze(-1).contiguous(), requires_grad=False
         )
 
-        layer.w13_weight.data = npu_format_cast(layer.w13_weight.data)
-        layer.w2_weight.data = npu_format_cast(layer.w2_weight.data)
+        layer.w13_weight.data = npu_format_cast(layer.w13_weight.data.transpose(1, 2))
+        layer.w2_weight.data = npu_format_cast(layer.w2_weight.data.transpose(1, 2))
 
     def create_moe_runner(
         self, layer: torch.nn.Module, moe_runner_config: "MoeRunnerConfig"

From da5158b4642af4c8572bde78d465138b9dce8037 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=D0=90=D1=80=D1=82=D0=B5=D0=BC=20=D0=A1=D0=B0=D0=B2=D0=BA?=
 =?UTF-8?q?=D0=B8=D0=BD?= <58187114+OrangeRedeng@users.noreply.github.com>
Date: Wed, 14 Jan 2026 11:49:51 +0300
Subject: [PATCH 16/36] Update fused_moe_method_npu.py

---
 .../npu/quantization/fused_moe_method_npu.py                | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/python/sglang/srt/hardware_backend/npu/quantization/fused_moe_method_npu.py b/python/sglang/srt/hardware_backend/npu/quantization/fused_moe_method_npu.py
index a9b6584796fd..670887425a32 100644
--- a/python/sglang/srt/hardware_backend/npu/quantization/fused_moe_method_npu.py
+++ b/python/sglang/srt/hardware_backend/npu/quantization/fused_moe_method_npu.py
@@ -213,6 +213,9 @@ def create_weights(
         set_weight_attrs(w2_weight_offset, extra_weight_attrs)
 
     def process_weights_after_loading(self, layer: torch.nn.Module) -> None:
+        layer.w13_weight.data = npu_format_cast(layer.w13_weight.data.transpose(1, 2).contiguous())
+        layer.w2_weight.data = npu_format_cast(layer.w2_weight.data.transpose(1, 2).contiguous())
+        
         layer.w13_weight_scale = torch.nn.Parameter(
             layer.w13_weight_scale.data.squeeze(-1).contiguous().to(torch.float32),
             requires_grad=False,
@@ -227,9 +230,6 @@ def process_weights_after_loading(self, layer: torch.nn.Module) -> None:
             layer.w2_weight_offset.data.squeeze(-1).contiguous(), requires_grad=False
         )
 
-        layer.w13_weight.data = npu_format_cast(layer.w13_weight.data.transpose(1, 2))
-        layer.w2_weight.data = npu_format_cast(layer.w2_weight.data.transpose(1, 2))
-
     def create_moe_runner(
         self, layer: torch.nn.Module, moe_runner_config: "MoeRunnerConfig"
     ):

From 0162b74db5cd71722959ec165bae7afa52aec5e3 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=D0=90=D1=80=D1=82=D0=B5=D0=BC=20=D0=A1=D0=B0=D0=B2=D0=BA?=
 =?UTF-8?q?=D0=B8=D0=BD?= <58187114+OrangeRedeng@users.noreply.github.com>
Date: Wed, 14 Jan 2026 14:09:59 +0300
Subject: [PATCH 17/36] Update fused_moe_method_npu.py

---
 .../npu/quantization/fused_moe_method_npu.py             | 9 ++++-----
 1 file changed, 4 insertions(+), 5 deletions(-)

diff --git a/python/sglang/srt/hardware_backend/npu/quantization/fused_moe_method_npu.py b/python/sglang/srt/hardware_backend/npu/quantization/fused_moe_method_npu.py
index 670887425a32..cdbe973827f0 100644
--- a/python/sglang/srt/hardware_backend/npu/quantization/fused_moe_method_npu.py
+++ b/python/sglang/srt/hardware_backend/npu/quantization/fused_moe_method_npu.py
@@ -217,17 +217,16 @@ def process_weights_after_loading(self, layer: torch.nn.Module) -> None:
         layer.w2_weight.data = npu_format_cast(layer.w2_weight.data.transpose(1, 2).contiguous())
         
         layer.w13_weight_scale = torch.nn.Parameter(
-            layer.w13_weight_scale.data.squeeze(-1).contiguous().to(torch.float32),
-            requires_grad=False,
+            layer.w13_weight_scale.data.squeeze(-1), requires_grad=False
         )
         layer.w2_weight_scale = torch.nn.Parameter(
-            layer.w2_weight_scale.data.squeeze(-1).contiguous(), requires_grad=False
+            layer.w2_weight_scale.data.squeeze(-1), requires_grad=False
         )
         layer.w13_weight_offset = torch.nn.Parameter(
-            layer.w13_weight_offset.data.squeeze(-1).contiguous(), requires_grad=False
+            layer.w13_weight_offset.data.squeeze(-1), requires_grad=False
         )
         layer.w2_weight_offset = torch.nn.Parameter(
-            layer.w2_weight_offset.data.squeeze(-1).contiguous(), requires_grad=False
+            layer.w2_weight_offset.data.squeeze(-1), requires_grad=False
         )
 
     def create_moe_runner(

From 019e2d6657a7db480ac5c4d5512321d562d1871f Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=D0=90=D1=80=D1=82=D0=B5=D0=BC=20=D0=A1=D0=B0=D0=B2=D0=BA?=
 =?UTF-8?q?=D0=B8=D0=BD?= <58187114+OrangeRedeng@users.noreply.github.com>
Date: Wed, 14 Jan 2026 15:09:43 +0300
Subject: [PATCH 18/36] Update fused_moe_method_npu.py

---
 .../hardware_backend/npu/quantization/fused_moe_method_npu.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/python/sglang/srt/hardware_backend/npu/quantization/fused_moe_method_npu.py b/python/sglang/srt/hardware_backend/npu/quantization/fused_moe_method_npu.py
index cdbe973827f0..bbd46a584275 100644
--- a/python/sglang/srt/hardware_backend/npu/quantization/fused_moe_method_npu.py
+++ b/python/sglang/srt/hardware_backend/npu/quantization/fused_moe_method_npu.py
@@ -213,8 +213,8 @@ def create_weights(
         set_weight_attrs(w2_weight_offset, extra_weight_attrs)
 
     def process_weights_after_loading(self, layer: torch.nn.Module) -> None:
-        layer.w13_weight.data = npu_format_cast(layer.w13_weight.data.transpose(1, 2).contiguous())
-        layer.w2_weight.data = npu_format_cast(layer.w2_weight.data.transpose(1, 2).contiguous())
+        layer.w13_weight.data = npu_format_cast(layer.w13_weight.data.transpose(1, 2))
+        layer.w2_weight.data = npu_format_cast(layer.w2_weight.data.transpose(1, 2))
         
         layer.w13_weight_scale = torch.nn.Parameter(
             layer.w13_weight_scale.data.squeeze(-1), requires_grad=False

From d02b451d6cae56162c57f1cdf953adc6d30248f3 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=D0=90=D1=80=D1=82=D0=B5=D0=BC=20=D0=A1=D0=B0=D0=B2=D0=BA?=
 =?UTF-8?q?=D0=B8=D0=BD?= <58187114+OrangeRedeng@users.noreply.github.com>
Date: Wed, 14 Jan 2026 17:30:48 +0300
Subject: [PATCH 19/36] =?UTF-8?q?Create=20test=5Fascend=5Fmemory=5Fconsump?=
 =?UTF-8?q?tion.py=E2=80=8E?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 ..._ascend_memory_consumption.py\342\200\216" | 73 +++++++++++++++++++
 1 file changed, 73 insertions(+)
 create mode 100644 "test/srt/ascend/test_ascend_memory_consumption.py\342\200\216"

diff --git "a/test/srt/ascend/test_ascend_memory_consumption.py\342\200\216" "b/test/srt/ascend/test_ascend_memory_consumption.py\342\200\216"
new file mode 100644
index 000000000000..fe0d9f58d4ff
--- /dev/null
+++ "b/test/srt/ascend/test_ascend_memory_consumption.py\342\200\216"
@@ -0,0 +1,73 @@
+"""
+Usage:
+python3 -m unittest test_ascend_memory_consumption.TestMemoryConsumptionAscend.test_memory_consumption
+"""
+
+import os
+import unittest
+
+import torch
+
+from sglang.srt.utils import kill_process_tree
+from sglang.test.test_utils import (
+    DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
+    DEFAULT_URL_FOR_TEST,
+    CustomTestCase,
+    popen_launch_server,
+)
+
+if "ASCEND_RT_VISIBLE_DEVICES" not in os.environ:
+    os.environ["ASCEND_RT_VISIBLE_DEVICES"] = "0,1"
+DEFAULT_PORT_FOR_SRT_TEST_RUNNER = (
+    8000 + int(os.environ.get("ASCEND_RT_VISIBLE_DEVICES", "0")[0]) * 100
+)
+DEFAULT_URL_FOR_TEST = f"http://127.0.0.1:{DEFAULT_PORT_FOR_SRT_TEST_RUNNER + 1000}"
+
+
+class TestMemoryConsumptionAscend(CustomTestCase):
+
+    def test_memory_consumption(self):
+
+        model = "nytopop/Qwen3-30B-A3B.w8a8"
+        base_url = DEFAULT_URL_FOR_TEST
+
+        ### Calculate initial used memory
+        free_npu_memory, total_npu_memory = torch.npu.mem_get_info()
+        initial_used_memory = total_npu_memory - free_npu_memory
+
+        process = popen_launch_server(
+            model,
+            base_url,
+            timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
+            other_args=[
+                "--trust-remote-code",
+                "--device",
+                "npu",
+                "--attention-backend",
+                "ascend",
+                "--tp-size",
+                "2",
+                "--mem-fraction-static",
+                "0.8",
+                "--cuda-graph-bs",
+                "1",
+                "--max-total-tokens",
+                "1024",
+                "--disable-radix-cache",
+                "--disable-cuda-graph",
+            ],
+        )
+
+        ### Calculate initial used memory
+        free_npu_memory, total_npu_memory = torch.npu.mem_get_info()
+        used_memory_after_server_starting = (
+            total_npu_memory - free_npu_memory - initial_used_memory
+        ) / (1 << 30)
+        self.assertLessEqual(float(used_memory_after_server_starting), 17.00)
+
+        # Clean up everything
+        kill_process_tree(process.pid)
+
+
+if __name__ == "__main__":
+    unittest.main()

From edbba1b185a116838e02d0ae5a7bb71a800efb31 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=D0=90=D1=80=D1=82=D0=B5=D0=BC=20=D0=A1=D0=B0=D0=B2=D0=BA?=
 =?UTF-8?q?=D0=B8=D0=BD?= <58187114+OrangeRedeng@users.noreply.github.com>
Date: Thu, 15 Jan 2026 15:26:33 +0300
Subject: [PATCH 20/36] Fix lint issue

---
 .../hardware_backend/npu/quantization/fused_moe_method_npu.py    | 1 -
 1 file changed, 1 deletion(-)

diff --git a/python/sglang/srt/hardware_backend/npu/quantization/fused_moe_method_npu.py b/python/sglang/srt/hardware_backend/npu/quantization/fused_moe_method_npu.py
index c44fed1ab957..3387a5848550 100644
--- a/python/sglang/srt/hardware_backend/npu/quantization/fused_moe_method_npu.py
+++ b/python/sglang/srt/hardware_backend/npu/quantization/fused_moe_method_npu.py
@@ -153,7 +153,6 @@ class NPUW8A8Int8DynamicMoEMethod(_NPUFusedMoEMethodBase):
     def process_weights_after_loading(self, layer: torch.nn.Module) -> None:
         layer.w13_weight.data = npu_format_cast(layer.w13_weight.data.transpose(1, 2))
         layer.w2_weight.data = npu_format_cast(layer.w2_weight.data.transpose(1, 2))
-        
         layer.w13_weight_scale = torch.nn.Parameter(
             layer.w13_weight_scale.data.squeeze(-1), requires_grad=False
         )

From fa13828abf1cf66433d2d77f74cbe7b76582a42a Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=D0=90=D1=80=D1=82=D0=B5=D0=BC=20=D0=A1=D0=B0=D0=B2=D0=BA?=
 =?UTF-8?q?=D0=B8=D0=BD?= <58187114+OrangeRedeng@users.noreply.github.com>
Date: Thu, 15 Jan 2026 15:27:17 +0300
Subject: [PATCH 21/36] Fix lint issue

---
 python/sglang/srt/layers/moe/ep_moe/layer.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/python/sglang/srt/layers/moe/ep_moe/layer.py b/python/sglang/srt/layers/moe/ep_moe/layer.py
index d34a01425e66..7e8bb33ca70a 100644
--- a/python/sglang/srt/layers/moe/ep_moe/layer.py
+++ b/python/sglang/srt/layers/moe/ep_moe/layer.py
@@ -7,6 +7,7 @@
 
 from sglang.srt.compilation.piecewise_context_manager import is_in_piecewise_cuda_graph
 from sglang.srt.environ import envs
+from sglang.srt.hardware_backend.npu.utils import npu_format_cast
 from sglang.srt.layers import deep_gemm_wrapper
 from sglang.srt.layers.moe import (
     get_deepep_mode,
@@ -31,7 +32,6 @@
 from sglang.srt.layers.quantization.fp8_kernel import is_fp8_fnuz
 from sglang.srt.layers.quantization.w4afp8 import W4AFp8Config, W4AFp8MoEMethod
 from sglang.srt.utils import get_bool_env_var, is_hip, is_npu
-from sglang.srt.hardware_backend.npu.utils import npu_format_cast
 
 if TYPE_CHECKING:
     from sglang.srt.layers.moe.token_dispatcher import (

From c78449b6e20adc5144a6957bb8fa62eac50863fd Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=D0=90=D1=80=D1=82=D0=B5=D0=BC=20=D0=A1=D0=B0=D0=B2=D0=BA?=
 =?UTF-8?q?=D0=B8=D0=BD?= <58187114+OrangeRedeng@users.noreply.github.com>
Date: Thu, 15 Jan 2026 15:28:02 +0300
Subject: [PATCH 22/36] Fix lint issue

---
 python/sglang/srt/layers/quantization/unquant.py | 2 --
 1 file changed, 2 deletions(-)

diff --git a/python/sglang/srt/layers/quantization/unquant.py b/python/sglang/srt/layers/quantization/unquant.py
index c3637364e8c2..610e1a848e35 100644
--- a/python/sglang/srt/layers/quantization/unquant.py
+++ b/python/sglang/srt/layers/quantization/unquant.py
@@ -50,8 +50,6 @@
     from aiter.ops.shuffle import shuffle_weight
 
 if _is_npu:
-    import torch_npu
-
     from sglang.srt.hardware_backend.npu.utils import npu_format_cast
 
 try:

From 5888bd6ce8b823c645b50662bd465f3b288d7c37 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=D0=90=D1=80=D1=82=D0=B5=D0=BC=20=D0=A1=D0=B0=D0=B2=D0=BA?=
 =?UTF-8?q?=D0=B8=D0=BD?= <58187114+OrangeRedeng@users.noreply.github.com>
Date: Fri, 16 Jan 2026 13:42:14 +0300
Subject: [PATCH 23/36] Update fused_moe_method_npu.py

---
 .../hardware_backend/npu/quantization/fused_moe_method_npu.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/python/sglang/srt/hardware_backend/npu/quantization/fused_moe_method_npu.py b/python/sglang/srt/hardware_backend/npu/quantization/fused_moe_method_npu.py
index 3387a5848550..b3bd7c2155e6 100644
--- a/python/sglang/srt/hardware_backend/npu/quantization/fused_moe_method_npu.py
+++ b/python/sglang/srt/hardware_backend/npu/quantization/fused_moe_method_npu.py
@@ -162,12 +162,12 @@ def process_weights_after_loading(self, layer: torch.nn.Module) -> None:
         # Compressed-tensors format doesn't have this field
         if hasattr(layer, "w13_weight_offset"):
             layer.w13_weight_offset = torch.nn.Parameter(
-                layer.w13_weight_offset.data.squeeze(-1).contiguous(),
+                layer.w13_weight_offset.data.squeeze(-1),
                 requires_grad=False,
             )
         if hasattr(layer, "w2_weight_offset"):
             layer.w2_weight_offset = torch.nn.Parameter(
-                layer.w2_weight_offset.data.squeeze(-1).contiguous(),
+                layer.w2_weight_offset.data.squeeze(-1),
                 requires_grad=False,
             )
 

From ab233ad772a66a65f275ba9c405a374d7b99685f Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=D0=90=D1=80=D1=82=D0=B5=D0=BC=20=D0=A1=D0=B0=D0=B2=D0=BA?=
 =?UTF-8?q?=D0=B8=D0=BD?= <58187114+OrangeRedeng@users.noreply.github.com>
Date: Fri, 16 Jan 2026 14:56:25 +0300
Subject: [PATCH 24/36] =?UTF-8?q?Update=20test=5Fascend=5Fmemory=5Fconsump?=
 =?UTF-8?q?tion.py=E2=80=8E?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 "test/srt/ascend/test_ascend_memory_consumption.py\342\200\216" | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git "a/test/srt/ascend/test_ascend_memory_consumption.py\342\200\216" "b/test/srt/ascend/test_ascend_memory_consumption.py\342\200\216"
index fe0d9f58d4ff..cbc290e965b4 100644
--- "a/test/srt/ascend/test_ascend_memory_consumption.py\342\200\216"
+++ "b/test/srt/ascend/test_ascend_memory_consumption.py\342\200\216"
@@ -63,7 +63,7 @@ class TestMemoryConsumptionAscend(CustomTestCase):
         used_memory_after_server_starting = (
             total_npu_memory - free_npu_memory - initial_used_memory
         ) / (1 << 30)
-        self.assertLessEqual(float(used_memory_after_server_starting), 17.00)
+        self.assertLessEqual(float(used_memory_after_server_starting), 16.00)
 
         # Clean up everything
         kill_process_tree(process.pid)

From 5ceeab12c0c5e15c63f0dae65829f333de609e3e Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=D0=90=D1=80=D1=82=D0=B5=D0=BC=20=D0=A1=D0=B0=D0=B2=D0=BA?=
 =?UTF-8?q?=D0=B8=D0=BD?= <58187114+OrangeRedeng@users.noreply.github.com>
Date: Fri, 16 Jan 2026 15:04:04 +0300
Subject: [PATCH 25/36] Update run_suite.py

---
 test/srt/run_suite.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/test/srt/run_suite.py b/test/srt/run_suite.py
index fbc7c8154476..b6f3ea25daa4 100644
--- a/test/srt/run_suite.py
+++ b/test/srt/run_suite.py
@@ -165,6 +165,7 @@
     ],
     "per-commit-2-npu-a2": [
         TestFile("ascend/test_ascend_graph_tp2_bf16.py", 400),
+        TestFile("ascend/test_ascend_memory_consumption.py‎", 400),
         TestFile("ascend/test_ascend_mla_fia_w8a8int8.py", 400),
         TestFile("ascend/test_ascend_tp2_bf16.py", 400),
         TestFile("ascend/test_ascend_tp2_fia_bf16.py", 400),

From b8d8285b2ac951f0c6300cea89b1e0399c24db5a Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=D0=90=D1=80=D1=82=D0=B5=D0=BC=20=D0=A1=D0=B0=D0=B2=D0=BA?=
 =?UTF-8?q?=D0=B8=D0=BD?= <58187114+OrangeRedeng@users.noreply.github.com>
Date: Mon, 26 Jan 2026 18:30:54 +0300
Subject: [PATCH 26/36] Move transpose(1,2) from forward_npu() to
 process_weights

---
 python/sglang/srt/layers/quantization/unquant.py | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

diff --git a/python/sglang/srt/layers/quantization/unquant.py b/python/sglang/srt/layers/quantization/unquant.py
index a197c83b8eb9..6ae02aaec67b 100644
--- a/python/sglang/srt/layers/quantization/unquant.py
+++ b/python/sglang/srt/layers/quantization/unquant.py
@@ -304,6 +304,7 @@ def process_weights_after_loading(self, layer: torch.nn.Module) -> None:
         if _is_npu:
             for weight_name in ["w13_weight", "w2_weight"]:
                 weight = getattr(layer, weight_name)
+                weight.data = weight.data.transpose(1, 2)
                 weight.data = npu_format_cast(
                     weight.data,
                 )
@@ -506,9 +507,6 @@ def forward_npu(
         expert_tokens = expert_tokens.to(torch.int64)
         w13_bias = [layer.w13_weight_bias] if self.with_bias else None
         w2_bias = [layer.w2_weight_bias] if self.with_bias else None
-        if layer.w13_weight.shape[-1] == layer.hidden_size:
-            w13 = layer.w13_weight.transpose(1, 2)
-            w2 = layer.w2_weight.transpose(1, 2)
 
         # gmm1: gate_up_proj
         hidden_states = torch_npu.npu_grouped_matmul(

From a80de0b42d9639638aa0898398d7e80ab98383d2 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=D0=90=D1=80=D1=82=D0=B5=D0=BC=20=D0=A1=D0=B0=D0=B2=D0=BA?=
 =?UTF-8?q?=D0=B8=D0=BD?= <58187114+OrangeRedeng@users.noreply.github.com>
Date: Mon, 26 Jan 2026 18:48:25 +0300
Subject: [PATCH 27/36] Quickfix: use layer weights directly in forward_npu

---
 python/sglang/srt/layers/quantization/unquant.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/python/sglang/srt/layers/quantization/unquant.py b/python/sglang/srt/layers/quantization/unquant.py
index 6ae02aaec67b..628fadbd166e 100644
--- a/python/sglang/srt/layers/quantization/unquant.py
+++ b/python/sglang/srt/layers/quantization/unquant.py
@@ -511,7 +511,7 @@ def forward_npu(
         # gmm1: gate_up_proj
         hidden_states = torch_npu.npu_grouped_matmul(
             x=[hidden_states],
-            weight=[w13],
+            weight=[layer.w13_weight],
             bias=w13_bias,
             split_item=2,
             group_list_type=0,
@@ -535,7 +535,7 @@ def forward_npu(
         # gmm2: down_proj
         hidden_states = torch_npu.npu_grouped_matmul(
             x=[hidden_states],
-            weight=[w2],
+            weight=[layer.w2_weight],
             bias=w2_bias,
             split_item=2,
             group_list_type=0,

From 47a5d8aa6b38814b78d1473257a72ab68dea0764 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=D0=90=D1=80=D1=82=D0=B5=D0=BC=20=D0=A1=D0=B0=D0=B2=D0=BA?=
 =?UTF-8?q?=D0=B8=D0=BD?= <58187114+OrangeRedeng@users.noreply.github.com>
Date: Tue, 27 Jan 2026 14:32:55 +0300
Subject: [PATCH 28/36] =?UTF-8?q?Delete=20test/srt/ascend/test=5Fascend=5F?=
 =?UTF-8?q?memory=5Fconsumption.py=E2=80=8E?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 ..._ascend_memory_consumption.py\342\200\216" | 73 -------------------
 1 file changed, 73 deletions(-)
 delete mode 100644 "test/srt/ascend/test_ascend_memory_consumption.py\342\200\216"

diff --git "a/test/srt/ascend/test_ascend_memory_consumption.py\342\200\216" "b/test/srt/ascend/test_ascend_memory_consumption.py\342\200\216"
deleted file mode 100644
index cbc290e965b4..000000000000
--- "a/test/srt/ascend/test_ascend_memory_consumption.py\342\200\216"
+++ /dev/null
@@ -1,73 +0,0 @@
-"""
-Usage:
-python3 -m unittest test_ascend_memory_consumption.TestMemoryConsumptionAscend.test_memory_consumption
-"""
-
-import os
-import unittest
-
-import torch
-
-from sglang.srt.utils import kill_process_tree
-from sglang.test.test_utils import (
-    DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
-    DEFAULT_URL_FOR_TEST,
-    CustomTestCase,
-    popen_launch_server,
-)
-
-if "ASCEND_RT_VISIBLE_DEVICES" not in os.environ:
-    os.environ["ASCEND_RT_VISIBLE_DEVICES"] = "0,1"
-DEFAULT_PORT_FOR_SRT_TEST_RUNNER = (
-    8000 + int(os.environ.get("ASCEND_RT_VISIBLE_DEVICES", "0")[0]) * 100
-)
-DEFAULT_URL_FOR_TEST = f"http://127.0.0.1:{DEFAULT_PORT_FOR_SRT_TEST_RUNNER + 1000}"
-
-
-class TestMemoryConsumptionAscend(CustomTestCase):
-
-    def test_memory_consumption(self):
-
-        model = "nytopop/Qwen3-30B-A3B.w8a8"
-        base_url = DEFAULT_URL_FOR_TEST
-
-        ### Calculate initial used memory
-        free_npu_memory, total_npu_memory = torch.npu.mem_get_info()
-        initial_used_memory = total_npu_memory - free_npu_memory
-
-        process = popen_launch_server(
-            model,
-            base_url,
-            timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
-            other_args=[
-                "--trust-remote-code",
-                "--device",
-                "npu",
-                "--attention-backend",
-                "ascend",
-                "--tp-size",
-                "2",
-                "--mem-fraction-static",
-                "0.8",
-                "--cuda-graph-bs",
-                "1",
-                "--max-total-tokens",
-                "1024",
-                "--disable-radix-cache",
-                "--disable-cuda-graph",
-            ],
-        )
-
-        ### Calculate initial used memory
-        free_npu_memory, total_npu_memory = torch.npu.mem_get_info()
-        used_memory_after_server_starting = (
-            total_npu_memory - free_npu_memory - initial_used_memory
-        ) / (1 << 30)
-        self.assertLessEqual(float(used_memory_after_server_starting), 16.00)
-
-        # Clean up everything
-        kill_process_tree(process.pid)
-
-
-if __name__ == "__main__":
-    unittest.main()

From 87d6963ce4e27c6fb8973fe3d5eec5f023bad6cb Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=D0=90=D1=80=D1=82=D0=B5=D0=BC=20=D0=A1=D0=B0=D0=B2=D0=BA?=
 =?UTF-8?q?=D0=B8=D0=BD?= <58187114+OrangeRedeng@users.noreply.github.com>
Date: Tue, 27 Jan 2026 14:33:21 +0300
Subject: [PATCH 29/36] =?UTF-8?q?Rename=20test=5Fascend=5Fmemory=5Fconsump?=
 =?UTF-8?q?tion.py=E2=80=8E?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 ..._ascend_memory_consumption.py\342\200\216" | 73 +++++++++++++++++++
 1 file changed, 73 insertions(+)
 create mode 100644 "test/srt/ascend/test_ascend_memory_consumption.py\342\200\216"

diff --git "a/test/srt/ascend/test_ascend_memory_consumption.py\342\200\216" "b/test/srt/ascend/test_ascend_memory_consumption.py\342\200\216"
new file mode 100644
index 000000000000..cbc290e965b4
--- /dev/null
+++ "b/test/srt/ascend/test_ascend_memory_consumption.py\342\200\216"
@@ -0,0 +1,73 @@
+"""
+Usage:
+python3 -m unittest test_ascend_memory_consumption.TestMemoryConsumptionAscend.test_memory_consumption
+"""
+
+import os
+import unittest
+
+import torch
+
+from sglang.srt.utils import kill_process_tree
+from sglang.test.test_utils import (
+    DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
+    DEFAULT_URL_FOR_TEST,
+    CustomTestCase,
+    popen_launch_server,
+)
+
+if "ASCEND_RT_VISIBLE_DEVICES" not in os.environ:
+    os.environ["ASCEND_RT_VISIBLE_DEVICES"] = "0,1"
+DEFAULT_PORT_FOR_SRT_TEST_RUNNER = (
+    8000 + int(os.environ.get("ASCEND_RT_VISIBLE_DEVICES", "0")[0]) * 100
+)
+DEFAULT_URL_FOR_TEST = f"http://127.0.0.1:{DEFAULT_PORT_FOR_SRT_TEST_RUNNER + 1000}"
+
+
+class TestMemoryConsumptionAscend(CustomTestCase):
+
+    def test_memory_consumption(self):
+
+        model = "nytopop/Qwen3-30B-A3B.w8a8"
+        base_url = DEFAULT_URL_FOR_TEST
+
+        ### Calculate initial used memory
+        free_npu_memory, total_npu_memory = torch.npu.mem_get_info()
+        initial_used_memory = total_npu_memory - free_npu_memory
+
+        process = popen_launch_server(
+            model,
+            base_url,
+            timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
+            other_args=[
+                "--trust-remote-code",
+                "--device",
+                "npu",
+                "--attention-backend",
+                "ascend",
+                "--tp-size",
+                "2",
+                "--mem-fraction-static",
+                "0.8",
+                "--cuda-graph-bs",
+                "1",
+                "--max-total-tokens",
+                "1024",
+                "--disable-radix-cache",
+                "--disable-cuda-graph",
+            ],
+        )
+
+        ### Calculate initial used memory
+        free_npu_memory, total_npu_memory = torch.npu.mem_get_info()
+        used_memory_after_server_starting = (
+            total_npu_memory - free_npu_memory - initial_used_memory
+        ) / (1 << 30)
+        self.assertLessEqual(float(used_memory_after_server_starting), 16.00)
+
+        # Clean up everything
+        kill_process_tree(process.pid)
+
+
+if __name__ == "__main__":
+    unittest.main()

From 105e0632eb7196f7ded57a13bcc9a1b397159381 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=D0=90=D1=80=D1=82=D0=B5=D0=BC=20=D0=A1=D0=B0=D0=B2=D0=BA?=
 =?UTF-8?q?=D0=B8=D0=BD?= <58187114+OrangeRedeng@users.noreply.github.com>
Date: Tue, 27 Jan 2026 14:34:49 +0300
Subject: [PATCH 30/36] =?UTF-8?q?Delete=20test/srt/ascend/test=5Fascend=5F?=
 =?UTF-8?q?memory=5Fconsumption.py=E2=80=8E?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 ..._ascend_memory_consumption.py\342\200\216" | 73 -------------------
 1 file changed, 73 deletions(-)
 delete mode 100644 "test/srt/ascend/test_ascend_memory_consumption.py\342\200\216"

diff --git "a/test/srt/ascend/test_ascend_memory_consumption.py\342\200\216" "b/test/srt/ascend/test_ascend_memory_consumption.py\342\200\216"
deleted file mode 100644
index cbc290e965b4..000000000000
--- "a/test/srt/ascend/test_ascend_memory_consumption.py\342\200\216"
+++ /dev/null
@@ -1,73 +0,0 @@
-"""
-Usage:
-python3 -m unittest test_ascend_memory_consumption.TestMemoryConsumptionAscend.test_memory_consumption
-"""
-
-import os
-import unittest
-
-import torch
-
-from sglang.srt.utils import kill_process_tree
-from sglang.test.test_utils import (
-    DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
-    DEFAULT_URL_FOR_TEST,
-    CustomTestCase,
-    popen_launch_server,
-)
-
-if "ASCEND_RT_VISIBLE_DEVICES" not in os.environ:
-    os.environ["ASCEND_RT_VISIBLE_DEVICES"] = "0,1"
-DEFAULT_PORT_FOR_SRT_TEST_RUNNER = (
-    8000 + int(os.environ.get("ASCEND_RT_VISIBLE_DEVICES", "0")[0]) * 100
-)
-DEFAULT_URL_FOR_TEST = f"http://127.0.0.1:{DEFAULT_PORT_FOR_SRT_TEST_RUNNER + 1000}"
-
-
-class TestMemoryConsumptionAscend(CustomTestCase):
-
-    def test_memory_consumption(self):
-
-        model = "nytopop/Qwen3-30B-A3B.w8a8"
-        base_url = DEFAULT_URL_FOR_TEST
-
-        ### Calculate initial used memory
-        free_npu_memory, total_npu_memory = torch.npu.mem_get_info()
-        initial_used_memory = total_npu_memory - free_npu_memory
-
-        process = popen_launch_server(
-            model,
-            base_url,
-            timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
-            other_args=[
-                "--trust-remote-code",
-                "--device",
-                "npu",
-                "--attention-backend",
-                "ascend",
-                "--tp-size",
-                "2",
-                "--mem-fraction-static",
-                "0.8",
-                "--cuda-graph-bs",
-                "1",
-                "--max-total-tokens",
-                "1024",
-                "--disable-radix-cache",
-                "--disable-cuda-graph",
-            ],
-        )
-
-        ### Calculate initial used memory
-        free_npu_memory, total_npu_memory = torch.npu.mem_get_info()
-        used_memory_after_server_starting = (
-            total_npu_memory - free_npu_memory - initial_used_memory
-        ) / (1 << 30)
-        self.assertLessEqual(float(used_memory_after_server_starting), 16.00)
-
-        # Clean up everything
-        kill_process_tree(process.pid)
-
-
-if __name__ == "__main__":
-    unittest.main()

From c501e4ed42f02fdb618658107206b2e32c00a704 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=D0=90=D1=80=D1=82=D0=B5=D0=BC=20=D0=A1=D0=B0=D0=B2=D0=BA?=
 =?UTF-8?q?=D0=B8=D0=BD?= <58187114+OrangeRedeng@users.noreply.github.com>
Date: Tue, 27 Jan 2026 14:35:24 +0300
Subject: [PATCH 31/36] Add test_ascend_memory_consumption.py

---
 .../ascend/test_ascend_memory_consumption.py  | 73 +++++++++++++++++++
 1 file changed, 73 insertions(+)
 create mode 100644 test/srt/ascend/test_ascend_memory_consumption.py

diff --git a/test/srt/ascend/test_ascend_memory_consumption.py b/test/srt/ascend/test_ascend_memory_consumption.py
new file mode 100644
index 000000000000..cbc290e965b4
--- /dev/null
+++ b/test/srt/ascend/test_ascend_memory_consumption.py
@@ -0,0 +1,73 @@
+"""
+Usage:
+python3 -m unittest test_ascend_memory_consumption.TestMemoryConsumptionAscend.test_memory_consumption
+"""
+
+import os
+import unittest
+
+import torch
+
+from sglang.srt.utils import kill_process_tree
+from sglang.test.test_utils import (
+    DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
+    DEFAULT_URL_FOR_TEST,
+    CustomTestCase,
+    popen_launch_server,
+)
+
+if "ASCEND_RT_VISIBLE_DEVICES" not in os.environ:
+    os.environ["ASCEND_RT_VISIBLE_DEVICES"] = "0,1"
+DEFAULT_PORT_FOR_SRT_TEST_RUNNER = (
+    8000 + int(os.environ.get("ASCEND_RT_VISIBLE_DEVICES", "0")[0]) * 100
+)
+DEFAULT_URL_FOR_TEST = f"http://127.0.0.1:{DEFAULT_PORT_FOR_SRT_TEST_RUNNER + 1000}"
+
+
+class TestMemoryConsumptionAscend(CustomTestCase):
+
+    def test_memory_consumption(self):
+
+        model = "nytopop/Qwen3-30B-A3B.w8a8"
+        base_url = DEFAULT_URL_FOR_TEST
+
+        ### Calculate initial used memory
+        free_npu_memory, total_npu_memory = torch.npu.mem_get_info()
+        initial_used_memory = total_npu_memory - free_npu_memory
+
+        process = popen_launch_server(
+            model,
+            base_url,
+            timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
+            other_args=[
+                "--trust-remote-code",
+                "--device",
+                "npu",
+                "--attention-backend",
+                "ascend",
+                "--tp-size",
+                "2",
+                "--mem-fraction-static",
+                "0.8",
+                "--cuda-graph-bs",
+                "1",
+                "--max-total-tokens",
+                "1024",
+                "--disable-radix-cache",
+                "--disable-cuda-graph",
+            ],
+        )
+
+        ### Calculate initial used memory
+        free_npu_memory, total_npu_memory = torch.npu.mem_get_info()
+        used_memory_after_server_starting = (
+            total_npu_memory - free_npu_memory - initial_used_memory
+        ) / (1 << 30)
+        self.assertLessEqual(float(used_memory_after_server_starting), 16.00)
+
+        # Clean up everything
+        kill_process_tree(process.pid)
+
+
+if __name__ == "__main__":
+    unittest.main()

From 4824eb47c1dfedab06acdd45b1be3c3702543e9a Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=D0=90=D1=80=D1=82=D0=B5=D0=BC=20=D0=A1=D0=B0=D0=B2=D0=BA?=
 =?UTF-8?q?=D0=B8=D0=BD?= <58187114+OrangeRedeng@users.noreply.github.com>
Date: Tue, 27 Jan 2026 14:47:59 +0300
Subject: [PATCH 32/36] Update run_suite.py

---
 test/srt/run_suite.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/test/srt/run_suite.py b/test/srt/run_suite.py
index 1b6f6adae85a..0e8a86004bdb 100644
--- a/test/srt/run_suite.py
+++ b/test/srt/run_suite.py
@@ -143,7 +143,7 @@
     ],
     "per-commit-2-npu-a2": [
         TestFile("ascend/test_ascend_graph_tp2_bf16.py", 400),
-        TestFile("ascend/test_ascend_memory_consumption.py‎", 400),
+        TestFile("ascend/test_ascend_memory_consumption.py", 400),
         TestFile("ascend/test_ascend_mla_fia_w8a8int8.py", 400),
         TestFile("ascend/test_ascend_tp2_bf16.py", 400),
         TestFile("ascend/test_ascend_tp2_fia_bf16.py", 400),

From f398506735c55d0c9d8eb58fe027a534796f9b28 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=D0=90=D1=80=D1=82=D0=B5=D0=BC=20=D0=A1=D0=B0=D0=B2=D0=BA?=
 =?UTF-8?q?=D0=B8=D0=BD?= <58187114+OrangeRedeng@users.noreply.github.com>
Date: Tue, 27 Jan 2026 15:29:56 +0300
Subject: [PATCH 33/36] Move test to test/registered: drop run_suite.py entry

---
 test/srt/run_suite.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/test/srt/run_suite.py b/test/srt/run_suite.py
index 0e8a86004bdb..ce724a04cdcc 100644
--- a/test/srt/run_suite.py
+++ b/test/srt/run_suite.py
@@ -143,7 +143,6 @@
     ],
     "per-commit-2-npu-a2": [
         TestFile("ascend/test_ascend_graph_tp2_bf16.py", 400),
-        TestFile("ascend/test_ascend_memory_consumption.py", 400),
         TestFile("ascend/test_ascend_mla_fia_w8a8int8.py", 400),
         TestFile("ascend/test_ascend_tp2_bf16.py", 400),
         TestFile("ascend/test_ascend_tp2_fia_bf16.py", 400),

From bacb1ee42abd6b652cccb58a987f46be6fb026a9 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=D0=90=D1=80=D1=82=D0=B5=D0=BC=20=D0=A1=D0=B0=D0=B2=D0=BA?=
 =?UTF-8?q?=D0=B8=D0=BD?= <58187114+OrangeRedeng@users.noreply.github.com>
Date: Tue, 27 Jan 2026 15:33:14 +0300
Subject: [PATCH 34/36] Move test to test/registered

---
 .../ascend/test_ascend_memory_consumption.py  | 76 +++++++++++++++++++
 1 file changed, 76 insertions(+)
 create mode 100644 test/registered/ascend/test_ascend_memory_consumption.py

diff --git a/test/registered/ascend/test_ascend_memory_consumption.py b/test/registered/ascend/test_ascend_memory_consumption.py
new file mode 100644
index 000000000000..1f1a46d6dda3
--- /dev/null
+++ b/test/registered/ascend/test_ascend_memory_consumption.py
@@ -0,0 +1,76 @@
+"""
+Usage:
+python3 -m unittest test_ascend_memory_consumption.TestMemoryConsumptionAscend.test_memory_consumption
+"""
+
+import os
+import unittest
+
+import torch
+
+from sglang.test.ci.ci_register import register_npu_ci
+from sglang.srt.utils import kill_process_tree
+from sglang.test.test_utils import (
+    DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
+    DEFAULT_URL_FOR_TEST,
+    CustomTestCase,
+    popen_launch_server,
+)
+
+register_npu_ci(est_time=400, suite="nightly-1-npu-a3", nightly=True)
+
+if "ASCEND_RT_VISIBLE_DEVICES" not in os.environ:
+    os.environ["ASCEND_RT_VISIBLE_DEVICES"] = "0,1"
+DEFAULT_PORT_FOR_SRT_TEST_RUNNER = (
+    8000 + int(os.environ.get("ASCEND_RT_VISIBLE_DEVICES", "0")[0]) * 100
+)
+DEFAULT_URL_FOR_TEST = f"http://127.0.0.1:{DEFAULT_PORT_FOR_SRT_TEST_RUNNER + 1000}"
+
+
+class TestMemoryConsumptionAscend(CustomTestCase):
+
+    def test_memory_consumption(self):
+
+        model = "nytopop/Qwen3-30B-A3B.w8a8"
+        base_url = DEFAULT_URL_FOR_TEST
+
+        ### Calculate initial used memory
+        free_npu_memory, total_npu_memory = torch.npu.mem_get_info()
+        initial_used_memory = total_npu_memory - free_npu_memory
+
+        process = popen_launch_server(
+            model,
+            base_url,
+            timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
+            other_args=[
+                "--trust-remote-code",
+                "--device",
+                "npu",
+                "--attention-backend",
+                "ascend",
+                "--tp-size",
+                "2",
+                "--mem-fraction-static",
+                "0.8",
+                "--cuda-graph-bs",
+                "1",
+                "--max-total-tokens",
+                "1024",
+                "--disable-radix-cache",
+                "--disable-cuda-graph",
+            ],
+        )
+
+        ### Calculate memory used after the server has started
+        free_npu_memory, total_npu_memory = torch.npu.mem_get_info()
+        used_memory_after_server_starting = (
+            total_npu_memory - free_npu_memory - initial_used_memory
+        ) / (1 << 30)
+        self.assertLessEqual(float(used_memory_after_server_starting), 16.00)
+
+        # Clean up everything
+        kill_process_tree(process.pid)
+
+
+if __name__ == "__main__":
+    unittest.main()

From 5929c9b3874b7e367463f1002c78e43080838635 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=D0=90=D1=80=D1=82=D0=B5=D0=BC=20=D0=A1=D0=B0=D0=B2=D0=BA?=
 =?UTF-8?q?=D0=B8=D0=BD?= <58187114+OrangeRedeng@users.noreply.github.com>
Date: Tue, 27 Jan 2026 15:33:34 +0300
Subject: [PATCH 35/36] Delete
 test/srt/ascend/test_ascend_memory_consumption.py

---
 .../ascend/test_ascend_memory_consumption.py  | 73 -------------------
 1 file changed, 73 deletions(-)
 delete mode 100644 test/srt/ascend/test_ascend_memory_consumption.py

diff --git a/test/srt/ascend/test_ascend_memory_consumption.py b/test/srt/ascend/test_ascend_memory_consumption.py
deleted file mode 100644
index cbc290e965b4..000000000000
--- a/test/srt/ascend/test_ascend_memory_consumption.py
+++ /dev/null
@@ -1,73 +0,0 @@
-"""
-Usage:
-python3 -m unittest test_ascend_memory_consumption.TestMemoryConsumptionAscend.test_memory_consumption
-"""
-
-import os
-import unittest
-
-import torch
-
-from sglang.srt.utils import kill_process_tree
-from sglang.test.test_utils import (
-    DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
-    DEFAULT_URL_FOR_TEST,
-    CustomTestCase,
-    popen_launch_server,
-)
-
-if "ASCEND_RT_VISIBLE_DEVICES" not in os.environ:
-    os.environ["ASCEND_RT_VISIBLE_DEVICES"] = "0,1"
-DEFAULT_PORT_FOR_SRT_TEST_RUNNER = (
-    8000 + int(os.environ.get("ASCEND_RT_VISIBLE_DEVICES", "0")[0]) * 100
-)
-DEFAULT_URL_FOR_TEST = f"http://127.0.0.1:{DEFAULT_PORT_FOR_SRT_TEST_RUNNER + 1000}"
-
-
-class TestMemoryConsumptionAscend(CustomTestCase):
-
-    def test_memory_consumption(self):
-
-        model = "nytopop/Qwen3-30B-A3B.w8a8"
-        base_url = DEFAULT_URL_FOR_TEST
-
-        ### Calculate initial used memory
-        free_npu_memory, total_npu_memory = torch.npu.mem_get_info()
-        initial_used_memory = total_npu_memory - free_npu_memory
-
-        process = popen_launch_server(
-            model,
-            base_url,
-            timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
-            other_args=[
-                "--trust-remote-code",
-                "--device",
-                "npu",
-                "--attention-backend",
-                "ascend",
-                "--tp-size",
-                "2",
-                "--mem-fraction-static",
-                "0.8",
-                "--cuda-graph-bs",
-                "1",
-                "--max-total-tokens",
-                "1024",
-                "--disable-radix-cache",
-                "--disable-cuda-graph",
-            ],
-        )
-
-        ### Calculate initial used memory
-        free_npu_memory, total_npu_memory = torch.npu.mem_get_info()
-        used_memory_after_server_starting = (
-            total_npu_memory - free_npu_memory - initial_used_memory
-        ) / (1 << 30)
-        self.assertLessEqual(float(used_memory_after_server_starting), 16.00)
-
-        # Clean up everything
-        kill_process_tree(process.pid)
-
-
-if __name__ == "__main__":
-    unittest.main()

From 8fbe3a1f1720deb322fe5374729ca963578bc0a0 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=D0=90=D1=80=D1=82=D0=B5=D0=BC=20=D0=A1=D0=B0=D0=B2=D0=BA?=
 =?UTF-8?q?=D0=B8=D0=BD?= <58187114+OrangeRedeng@users.noreply.github.com>
Date: Tue, 27 Jan 2026 15:51:08 +0300
Subject: [PATCH 36/36] Fix lint issue

---
 test/registered/ascend/test_ascend_memory_consumption.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/test/registered/ascend/test_ascend_memory_consumption.py b/test/registered/ascend/test_ascend_memory_consumption.py
index 1f1a46d6dda3..2e6b09524476 100644
--- a/test/registered/ascend/test_ascend_memory_consumption.py
+++ b/test/registered/ascend/test_ascend_memory_consumption.py
@@ -8,8 +8,8 @@
 
 import torch
 
-from sglang.test.ci.ci_register import register_npu_ci
 from sglang.srt.utils import kill_process_tree
+from sglang.test.ci.ci_register import register_npu_ci
 from sglang.test.test_utils import (
     DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
     DEFAULT_URL_FOR_TEST,