15 changes: 14 additions & 1 deletion python/sglang/srt/layers/linear.py
@@ -30,7 +30,12 @@
QuantizationConfig,
QuantizeMethodBase,
)
from sglang.srt.utils import set_weight_attrs
from sglang.srt.utils import (
_process_weight_after_loading,
cpu_has_amx_support,
prepack_weight_if_needed,
set_weight_attrs,
)

logger = logging.getLogger(__name__)

@@ -165,13 +170,21 @@ def create_weights(
layer.register_parameter("weight", weight)
set_weight_attrs(weight, extra_weight_attrs)

def process_weights_after_loading(self, layer: torch.nn.Module) -> None:
_process_weight_after_loading(layer, ["weight"])

def apply(
self,
layer: torch.nn.Module,
x: torch.Tensor,
bias: Optional[torch.Tensor] = None,
) -> torch.Tensor:

if layer.use_intel_amx_backend:
return torch.ops.sgl_kernel.weight_packed_linear(
x, layer.weight, bias, True # is_vnni
)
Comment on lines +183 to +186 (Contributor, medium):

The is_vnni parameter for weight_packed_linear is hardcoded to True. Could you confirm whether this is always the case when use_intel_amx_backend is true? It is likely correct, given that AMX usage often implies VNNI-packed weights, but a confirmation or a brief comment explaining this assumption would be helpful for future maintainability.


return F.linear(x, layer.weight, bias)


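Following up on the is_vnni comment above: a minimal sketch of how the assumption could be recorded where the flag is consumed. The helper name is hypothetical, and it assumes, as _process_weight_after_loading suggests, that prepack_weight_if_needed leaves weights in VNNI layout whenever the AMX backend is enabled:

import torch
import torch.nn.functional as F
from typing import Optional

def _amx_or_dense_linear(
    layer: torch.nn.Module, x: torch.Tensor, bias: Optional[torch.Tensor] = None
) -> torch.Tensor:
    # use_intel_amx_backend is only set after _process_weight_after_loading has
    # prepacked the weight for the CPU backend, so is_vnni=True is safe here
    # (assumption based on this PR, not verified against sgl_kernel docs).
    if getattr(layer, "use_intel_amx_backend", False):
        return torch.ops.sgl_kernel.weight_packed_linear(x, layer.weight, bias, True)  # is_vnni
    return F.linear(x, layer.weight, bias)

Keeping the reasoning next to the hardcoded True makes the coupling between prepacking and the kernel flag visible at the call site.
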
15 changes: 12 additions & 3 deletions python/sglang/srt/layers/logits_processor.py
@@ -454,11 +454,20 @@ def _get_logits(
dp_gather_replicate(hidden_states, local_hidden_states, logits_metadata)

if hasattr(lm_head, "weight"):
logits = torch.matmul(
hidden_states.to(lm_head.weight.dtype), lm_head.weight.T
)
if lm_head.use_intel_amx_backend:
logits = torch.ops.sgl_kernel.weight_packed_linear(
hidden_states.to(lm_head.weight.dtype),
lm_head.weight,
None, # bias
True, # is_vnni
)
Comment on lines +457 to +463 (Contributor, medium):

Similar to other weight_packed_linear calls, is_vnni is hardcoded to True. Is this assumption universally valid when lm_head.use_intel_amx_backend is true? A brief comment clarifying this would be beneficial.

else:
logits = torch.matmul(
hidden_states.to(lm_head.weight.dtype), lm_head.weight.T
)
else:
# GGUF models
# TODO: use weight_packed_linear for GGUF models
logits = lm_head.quant_method.apply(lm_head, hidden_states, embedding_bias)

if self.logit_scale is not None:
74 changes: 61 additions & 13 deletions python/sglang/srt/layers/moe/fused_moe_triton/layer.py
@@ -18,7 +18,12 @@
QuantizationConfig,
QuantizeMethodBase,
)
from sglang.srt.utils import get_bool_env_var, is_hip, set_weight_attrs
from sglang.srt.utils import (
_process_weight_after_loading,
get_bool_env_var,
is_hip,
set_weight_attrs,
)

if torch.cuda.is_available():
from sglang.srt.layers.moe.fused_moe_triton.fused_moe import fused_experts
@@ -115,6 +120,9 @@ def process_weights_after_loading(self, layer: torch.nn.Module) -> None:
requires_grad=False,
)
torch.cuda.empty_cache()

_process_weight_after_loading(layer, ["w13_weight", "w2_weight"])

return

def apply(
@@ -236,18 +244,58 @@ def forward_cpu(
correction_bias: Optional[torch.Tensor] = None,
inplace: bool = True,
) -> torch.Tensor:
return moe_forward_native(
layer,
x,
use_grouped_topk,
top_k,
router_logits,
renormalize,
topk_group,
num_expert_group,
custom_routing_function,
correction_bias,
)
assert activation == "silu", f"activation = {activation} is not supported."
Comment (Contributor, medium):

The assert activation == "silu" restricts this optimized forward_cpu path (and consequently the AMX path for MoE) to SiLU activation. Is this an intended limitation for the initial AMX support, perhaps due to the fused_experts_cpu kernel's capabilities? If so, it might be worth a comment.
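
If the restriction does come from the CPU kernel, it could be documented at the check itself. A sketch only, with a hypothetical helper name; the stated reason (fused_experts_cpu fusing only SiLU) is an assumption this PR does not confirm:

def _check_cpu_moe_activation(activation: str) -> None:
    # Assumption: fused_experts_cpu fuses only the SiLU activation; anything
    # else should fall back to the native MoE path or fail early with a clear message.
    supported = ("silu",)
    assert activation in supported, (
        f"forward_cpu only supports activation in {supported}, got {activation!r}"
    )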


# TODO: rebase after #6441 lands
if layer.use_intel_amx_backend:
# if cpu_has_amx_support(): ---> #6441
topk_weights, topk_ids = select_experts(
hidden_states=x,
router_logits=router_logits,
use_grouped_topk=use_grouped_topk,
top_k=top_k,
renormalize=renormalize,
topk_group=topk_group,
num_expert_group=num_expert_group,
custom_routing_function=custom_routing_function,
correction_bias=correction_bias,
routed_scaling_factor=routed_scaling_factor,
)

return torch.ops.sgl_kernel.fused_experts_cpu(
x,
layer.w13_weight,
layer.w2_weight,
topk_weights,
topk_ids,
True, # inplace
False, # use_int8_w8a8
False, # use_fp8_w8a16
None, # w1_scale
None, # w2_scale
None, # block_size
None, # a1_scale
None, # a2_scale
True, # is_vnni
Comment on lines +265 to +279 (Contributor, medium):

The is_vnni parameter for fused_experts_cpu is hardcoded to True. Is this always the correct setting when use_intel_amx_backend is active? This seems consistent with the other AMX kernel calls.

)
else:
return moe_forward_native(
layer,
x,
use_grouped_topk,
top_k,
router_logits,
renormalize,
topk_group,
num_expert_group,
custom_routing_function,
correction_bias,
activation,
apply_router_weight_on_input,
inplace,
no_combine,
routed_scaling_factor,
)
Comment on lines +282 to +298 (Contributor, critical):

The call to moe_forward_native here includes apply_router_weight_on_input, inplace, and no_combine as arguments. However, the moe_forward_native function defined in python/sglang/srt/layers/moe/fused_moe_native.py (as per the full file context) does not seem to accept these parameters.

Its signature is:

def moe_forward_native(
    layer: torch.nn.Module,
    x: torch.Tensor,
    use_grouped_topk: bool,
    # ... other params ...
    activation: str = "silu",
    routed_scaling_factor: Optional[float] = None,
) -> torch.Tensor:

Could you clarify if moe_forward_native's signature is expected to be updated in one of the prerequisite PRs (e.g., #6641)? If not, this call would lead to a runtime error.
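
If the signature is not widened by a prerequisite PR, one way to keep the fallback compatible is to pass only what the quoted signature accepts. A sketch of the else branch inside forward_cpu, reusing the positional arguments already present and dropping apply_router_weight_on_input, inplace, and no_combine:

else:
    return moe_forward_native(
        layer,
        x,
        use_grouped_topk,
        top_k,
        router_logits,
        renormalize,
        topk_group,
        num_expert_group,
        custom_routing_function,
        correction_bias,
        activation=activation,
        routed_scaling_factor=routed_scaling_factor,
    )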


def forward_tpu(self, *args, **kwargs) -> torch.Tensor:
raise NotImplementedError("The TPU backend currently does not support MoE.")
4 changes: 4 additions & 0 deletions python/sglang/srt/layers/vocab_parallel_embedding.py
@@ -549,6 +549,10 @@ def __init__(
use_presharded_weights=use_presharded_weights,
)
self.quant_config = quant_config

from sglang.srt.utils import PackWeightMethod

self.quant_method = PackWeightMethod(weight_names=["weight"])
if bias:
self.bias = Parameter(
torch.empty(self.num_embeddings_per_partition, dtype=params_dtype)
10 changes: 10 additions & 0 deletions python/sglang/srt/models/deepseek_v2.py
@@ -90,6 +90,7 @@
from sglang.srt.utils import (
BumpAllocator,
DeepEPMode,
PackWeightMethod,
add_prefix,
get_bool_env_var,
get_int_env_var,
@@ -201,8 +202,17 @@ def __init__(
)
else:
self.e_score_correction_bias = None
self.quant_method = PackWeightMethod(weight_names=["weight"])

def forward(self, hidden_states):
if self.use_intel_amx_backend:
return torch.ops.sgl_kernel.weight_packed_linear(
hidden_states,
self.weight,
None, # bias
True, # is_vnni
)
Comment on lines +208 to +214 (Contributor, medium):

The is_vnni parameter is hardcoded to True for the weight_packed_linear call. Is this always the case for DeepSeekV2MoEGate when AMX is used? A clarifying comment could be helpful.


logits = F.linear(hidden_states, self.weight, None)
return logits

29 changes: 29 additions & 0 deletions python/sglang/srt/utils.py
@@ -2169,3 +2169,32 @@ def with_value(self, new_value: T):
finally:
assert self._value is new_value
self._value = None


def _process_weight_after_loading(module, weight_names) -> None:
# Prepack weights to get better performance on CPU
devices = {getattr(module, weight_name).device for weight_name in weight_names}
assert len(devices) == 1, f"Expects all weights to be on the same device, but got {devices}"
device = devices.pop()

for weight_name in weight_names:
setattr(
module,
weight_name,
torch.nn.Parameter(
prepack_weight_if_needed(getattr(module, weight_name)),
requires_grad=False,
),
)

module.use_intel_amx_backend = (
device == torch.device("cpu") and cpu_has_amx_support()
)


class PackWeightMethod:
def __init__(self, weight_names):
self.weight_names = weight_names

def process_weights_after_loading(self, module) -> None:
_process_weight_after_loading(module, self.weight_names)
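
For reference, a minimal usage sketch of the new helpers. The PackedGate module below is hypothetical, and it assumes the model loader calls quant_method.process_weights_after_loading(module) once the weights are loaded, as it does for the layers touched in this PR:

import torch
import torch.nn.functional as F
from sglang.srt.utils import PackWeightMethod

class PackedGate(torch.nn.Module):
    def __init__(self, hidden_size: int, num_experts: int):
        super().__init__()
        # bfloat16 is assumed here only for illustration.
        self.weight = torch.nn.Parameter(
            torch.empty(num_experts, hidden_size, dtype=torch.bfloat16),
            requires_grad=False,
        )
        # After loading, process_weights_after_loading prepacks the weight and
        # sets use_intel_amx_backend on this module (True only on AMX-capable CPUs).
        self.quant_method = PackWeightMethod(weight_names=["weight"])

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        if getattr(self, "use_intel_amx_backend", False):
            return torch.ops.sgl_kernel.weight_packed_linear(
                x, self.weight, None, True  # bias=None, is_vnni
            )
        return F.linear(x, self.weight)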