From f1f298c3ad70c775a3c66173bf3ccd8d05d47824 Mon Sep 17 00:00:00 2001
From: zyzshishui <zyzshishui@gmail.com>
Date: Fri, 3 Apr 2026 01:21:42 +0000
Subject: [PATCH 1/2] Assign shuffled aiter MoE weights via .data instead of re-creating Parameters

---
 python/sglang/srt/layers/quantization/unquant.py | 10 ++--------
 1 file changed, 2 insertions(+), 8 deletions(-)

diff --git a/python/sglang/srt/layers/quantization/unquant.py b/python/sglang/srt/layers/quantization/unquant.py
index 5c2f489e9b53..9c6fef15e418 100644
--- a/python/sglang/srt/layers/quantization/unquant.py
+++ b/python/sglang/srt/layers/quantization/unquant.py
@@ -233,15 +233,9 @@ def process_weights_after_loading(self, layer: torch.nn.Module) -> None:
         # because aiter CK kernels don't support all GEMM dimensions
         _should_use_aiter_moe = _use_aiter and get_moe_runner_backend().is_auto()
         if _should_use_aiter_moe:
-            layer.w13_weight = torch.nn.Parameter(
-                shuffle_weight(layer.w13_weight.data, (16, 16)),
-                requires_grad=False,
-            )
+            layer.w13_weight.data = shuffle_weight(layer.w13_weight.data, (16, 16))
             torch.cuda.empty_cache()
-            layer.w2_weight = torch.nn.Parameter(
-                shuffle_weight(layer.w2_weight.data, (16, 16)),
-                requires_grad=False,
-            )
+            layer.w2_weight.data = shuffle_weight(layer.w2_weight.data, (16, 16))
             torch.cuda.empty_cache()
 
         # Pack weight for get better performance on CPU

From 156f6b9fbd2a2599f55da130711b0cea4cf0a79a Mon Sep 17 00:00:00 2001
From: zyzshishui <zyzshishui@gmail.com>
Date: Fri, 3 Apr 2026 01:31:32 +0000
Subject: [PATCH 2/2] Use copy_or_rebind_param for shuffled aiter MoE weights

---
 python/sglang/srt/layers/quantization/unquant.py | 10 +++++++---
 1 file changed, 7 insertions(+), 3 deletions(-)

diff --git a/python/sglang/srt/layers/quantization/unquant.py b/python/sglang/srt/layers/quantization/unquant.py
index 9c6fef15e418..94f9a1375c14 100644
--- a/python/sglang/srt/layers/quantization/unquant.py
+++ b/python/sglang/srt/layers/quantization/unquant.py
@@ -22,7 +22,7 @@
     LinearMethodBase,
     QuantizeMethodBase,
 )
-from sglang.srt.layers.utils import MultiPlatformOp
+from sglang.srt.layers.utils import MultiPlatformOp, copy_or_rebind_param
 from sglang.srt.utils import (
     cpu_has_amx_support,
     get_bool_env_var,
@@ -233,9 +233,13 @@ def process_weights_after_loading(self, layer: torch.nn.Module) -> None:
         # because aiter CK kernels don't support all GEMM dimensions
         _should_use_aiter_moe = _use_aiter and get_moe_runner_backend().is_auto()
         if _should_use_aiter_moe:
-            layer.w13_weight.data = shuffle_weight(layer.w13_weight.data, (16, 16))
+            copy_or_rebind_param(
+                layer, "w13_weight", shuffle_weight(layer.w13_weight.data, (16, 16))
+            )
             torch.cuda.empty_cache()
-            layer.w2_weight.data = shuffle_weight(layer.w2_weight.data, (16, 16))
+            copy_or_rebind_param(
+                layer, "w2_weight", shuffle_weight(layer.w2_weight.data, (16, 16))
+            )
             torch.cuda.empty_cache()
 
         # Pack weight for get better performance on CPU