From f1f298c3ad70c775a3c66173bf3ccd8d05d47824 Mon Sep 17 00:00:00 2001 From: zyzshishui Date: Fri, 3 Apr 2026 01:21:42 +0000 Subject: [PATCH 1/2] 1 --- python/sglang/srt/layers/quantization/unquant.py | 10 ++-------- 1 file changed, 2 insertions(+), 8 deletions(-) diff --git a/python/sglang/srt/layers/quantization/unquant.py b/python/sglang/srt/layers/quantization/unquant.py index 5c2f489e9b53..9c6fef15e418 100644 --- a/python/sglang/srt/layers/quantization/unquant.py +++ b/python/sglang/srt/layers/quantization/unquant.py @@ -233,15 +233,9 @@ def process_weights_after_loading(self, layer: torch.nn.Module) -> None: # because aiter CK kernels don't support all GEMM dimensions _should_use_aiter_moe = _use_aiter and get_moe_runner_backend().is_auto() if _should_use_aiter_moe: - layer.w13_weight = torch.nn.Parameter( - shuffle_weight(layer.w13_weight.data, (16, 16)), - requires_grad=False, - ) + layer.w13_weight.data = shuffle_weight(layer.w13_weight.data, (16, 16)) torch.cuda.empty_cache() - layer.w2_weight = torch.nn.Parameter( - shuffle_weight(layer.w2_weight.data, (16, 16)), - requires_grad=False, - ) + layer.w2_weight.data = shuffle_weight(layer.w2_weight.data, (16, 16)) torch.cuda.empty_cache() # Pack weight for get better performance on CPU From 156f6b9fbd2a2599f55da130711b0cea4cf0a79a Mon Sep 17 00:00:00 2001 From: zyzshishui Date: Fri, 3 Apr 2026 01:31:32 +0000 Subject: [PATCH 2/2] 1 --- python/sglang/srt/layers/quantization/unquant.py | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/python/sglang/srt/layers/quantization/unquant.py b/python/sglang/srt/layers/quantization/unquant.py index 9c6fef15e418..94f9a1375c14 100644 --- a/python/sglang/srt/layers/quantization/unquant.py +++ b/python/sglang/srt/layers/quantization/unquant.py @@ -22,7 +22,7 @@ LinearMethodBase, QuantizeMethodBase, ) -from sglang.srt.layers.utils import MultiPlatformOp +from sglang.srt.layers.utils import MultiPlatformOp, copy_or_rebind_param from sglang.srt.utils import ( cpu_has_amx_support, get_bool_env_var, @@ -233,9 +233,13 @@ def process_weights_after_loading(self, layer: torch.nn.Module) -> None: # because aiter CK kernels don't support all GEMM dimensions _should_use_aiter_moe = _use_aiter and get_moe_runner_backend().is_auto() if _should_use_aiter_moe: - layer.w13_weight.data = shuffle_weight(layer.w13_weight.data, (16, 16)) + copy_or_rebind_param( + layer, "w13_weight", shuffle_weight(layer.w13_weight.data, (16, 16)) + ) torch.cuda.empty_cache() - layer.w2_weight.data = shuffle_weight(layer.w2_weight.data, (16, 16)) + copy_or_rebind_param( + layer, "w2_weight", shuffle_weight(layer.w2_weight.data, (16, 16)) + ) torch.cuda.empty_cache() # Pack weight for get better performance on CPU