Merged
36 commits
dd54bcf
Use fused_experts_cpu and add weight packing (#8)
chunyuan-w Mar 12, 2025
c1781be
switch to weight_packed_linear if cpu_has_amx_support (#11)
chunyuan-w Mar 14, 2025
1e4e60b
Switch to weight_packed_linear for MoEGate and lm_head (#16)
chunyuan-w Mar 19, 2025
238d29f
Replace torch.bmm in forward_absorb with sgl_kernel.cpu.bmm (#21)
chunyuan-w Mar 26, 2025
1f77ae6
don't use c++ kernel if apply_router_weight_on_input is True
chunyuan-w Jun 5, 2025
27e6501
Integrate qkv_proj_with_rope (#34)
chunyuan-w Apr 2, 2025
16157b3
update API for fused_qkv_a_proj_with_mqa
chunyuan-w Jun 6, 2025
146470f
revert changes to bmm
chunyuan-w Jun 6, 2025
a6253a9
update qkv_proj OP name
chunyuan-w Jun 6, 2025
6c98ba8
refine comment
chunyuan-w Jun 6, 2025
f687f62
only pack weight if using the fused_qkv_proj_with_rope kernel
chunyuan-w Jun 6, 2025
21b04a6
remove dead code
chunyuan-w Jun 11, 2025
d170e11
Merge branch 'main' into chunyuan/pr_frontend_moe
zhyncs Jun 12, 2025
7e43b73
update qkv_proj OP name
chunyuan-w Jun 12, 2025
19078ee
fix if the obj does not have the use_intel_amx_backend attr
chunyuan-w Jun 12, 2025
db7fcc9
cast bias to FP32 in process weight after load
gau-nernst Apr 9, 2025
cea74fd
fix when module.bias is None
chunyuan-w Jun 17, 2025
7411cf1
Merge branch 'main' into chunyuan/pr_frontend_moe
chunyuan-w Jun 17, 2025
6b1500b
Merge branch 'main' into chunyuan/pr_frontend_moe
chunyuan-w Jun 17, 2025
fb15b38
copy __dict__ from original weight param to packed weight param
chunyuan-w Jun 18, 2025
88426bb
only pack LMHead weight if it's not quantized
chunyuan-w Jun 18, 2025
e5aa0ca
Only pack w_kc and w_vc for CPU
chunyuan-w Jun 18, 2025
9542064
Merge branch 'main' into chunyuan/pr_frontend_moe
chunyuan-w Jun 18, 2025
08d3140
revert the debug change
chunyuan-w Jun 18, 2025
726289d
Merge branch 'main' into chunyuan/pr_frontend_moe
zhyncs Jun 18, 2025
410aa32
use _is_cpu and _is_cpu_amx_available to check device
chunyuan-w Jun 18, 2025
056b7ef
Merge branch 'main' into chunyuan/pr_frontend_moe
zhyncs Jun 19, 2025
ea8fd2c
cast tok_weights to fp32 for llama4
chunyuan-w Jun 19, 2025
39c684c
add comment for topk_weights
chunyuan-w Jun 19, 2025
c3990ec
fix gemm kernel when N is small
chunyuan-w Jun 19, 2025
0c4c6b5
Merge branch 'main' into chunyuan/pr_frontend_moe
zhyncs Jun 20, 2025
14908f1
move import to the top
chunyuan-w Jun 20, 2025
04d5e39
only set self.quant_method or call _process_weight_after_loading if _…
chunyuan-w Jun 20, 2025
4526d13
Merge branch 'main' into chunyuan/pr_frontend_moe
zhyncs Jun 20, 2025
f5ef0f2
don't pack weight or use intel amx backend if any weight of this modu…
chunyuan-w Jun 24, 2025
71c48a4
Merge branch 'main' into chunyuan/pr_frontend_moe
zhyncs Jun 25, 2025
19 changes: 18 additions & 1 deletion python/sglang/srt/layers/linear.py
@@ -30,7 +30,12 @@
QuantizationConfig,
QuantizeMethodBase,
)
from sglang.srt.utils import set_weight_attrs
from sglang.srt.utils import (
_process_weight_after_loading,
cpu_has_amx_support,
is_cpu,
set_weight_attrs,
)

logger = logging.getLogger(__name__)

@@ -52,6 +57,9 @@
"IPEXAWQLinearMethod",
]

_is_cpu_amx_available = cpu_has_amx_support()
_is_cpu = is_cpu()


def adjust_marlin_shard(param, shard_size, shard_offset):
marlin_tile_size = getattr(param, "marlin_tile_size", None)
@@ -165,13 +173,22 @@ def create_weights(
layer.register_parameter("weight", weight)
set_weight_attrs(weight, extra_weight_attrs)

def process_weights_after_loading(self, layer: torch.nn.Module) -> None:
if _is_cpu and _is_cpu_amx_available:
_process_weight_after_loading(layer, ["weight"])

def apply(
self,
layer: torch.nn.Module,
x: torch.Tensor,
bias: Optional[torch.Tensor] = None,
) -> torch.Tensor:

if getattr(layer, "use_intel_amx_backend", False):
Collaborator:
Let's make getattr(sth, "use_intel_amx_backend", False) a method in utils.py

Contributor Author:
Fixed in #7647
return torch.ops.sgl_kernel.weight_packed_linear(
x, layer.weight, bias, True # is_vnni
)

return F.linear(x, layer.weight, bias)


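The review comment above suggests wrapping the repeated getattr(sth, "use_intel_amx_backend", False) check in a utils.py helper; per the author it was addressed in a follow-up (#7647). A minimal sketch of what such a helper could look like (the name and location are assumptions, not the merged implementation):

# Hypothetical helper sketched from the review suggestion above; the actual
# change landed in a follow-up PR (#7647) and may use a different name or location.
def use_intel_amx_backend(layer) -> bool:
    """Return True if this layer has been marked to dispatch to the Intel AMX kernels."""
    return getattr(layer, "use_intel_amx_backend", False)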
15 changes: 12 additions & 3 deletions python/sglang/srt/layers/logits_processor.py
@@ -442,11 +442,20 @@ def _get_logits(
dp_gather_replicate(hidden_states, local_hidden_states, logits_metadata)

if hasattr(lm_head, "weight"):
logits = torch.matmul(
hidden_states.to(lm_head.weight.dtype), lm_head.weight.T
)
if getattr(lm_head, "use_intel_amx_backend", False):
logits = torch.ops.sgl_kernel.weight_packed_linear(
hidden_states.to(lm_head.weight.dtype),
lm_head.weight,
None, # bias
True, # is_vnni
)
else:
logits = torch.matmul(
hidden_states.to(lm_head.weight.dtype), lm_head.weight.T
)
else:
# GGUF models
# TODO: use weight_packed_linear for GGUF models
logits = lm_head.quant_method.apply(lm_head, hidden_states, embedding_bias)

if self.logit_scale is not None:
7 changes: 7 additions & 0 deletions python/sglang/srt/layers/moe/fused_moe_native.py
@@ -77,8 +77,15 @@ def moe_forward_native(
custom_routing_function: Optional[Callable] = None,
correction_bias: Optional[torch.Tensor] = None,
activation: str = "silu",
apply_router_weight_on_input: bool = False,
inplace: bool = True,
no_combine: bool = False,
routed_scaling_factor: Optional[float] = None,
) -> torch.Tensor:

if apply_router_weight_on_input:
raise NotImplementedError()

topk_weights, topk_ids = select_experts(
hidden_states=x,
router_logits=router_logits,
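The native MoE path now rejects apply_router_weight_on_input (matching the commit "don't use c++ kernel if apply_router_weight_on_input is True"). For context, that flag conventionally means the router weights scale the expert inputs rather than the expert outputs; the sketch below only illustrates that idea under a top_k == 1 assumption and is not the sgl_kernel implementation.

import torch

# Illustrative sketch only: with apply_router_weight_on_input=True, the routing
# weight multiplies the token activations before the expert GEMMs instead of the
# expert outputs afterwards. Folding it into the input this way is only
# straightforward when top_k == 1.
def apply_router_weight_to_input(x: torch.Tensor, topk_weights: torch.Tensor) -> torch.Tensor:
    # x: [num_tokens, hidden_size], topk_weights: [num_tokens, top_k]
    assert topk_weights.shape[1] == 1, "this simple folding assumes top_k == 1"
    return x * topk_weights.to(x.dtype)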
87 changes: 73 additions & 14 deletions python/sglang/srt/layers/moe/fused_moe_triton/layer.py
@@ -18,7 +18,14 @@
QuantizationConfig,
QuantizeMethodBase,
)
from sglang.srt.utils import get_bool_env_var, is_hip, set_weight_attrs
from sglang.srt.utils import (
_process_weight_after_loading,
cpu_has_amx_support,
get_bool_env_var,
is_cpu,
is_hip,
set_weight_attrs,
)

if torch.cuda.is_available():
from sglang.srt.layers.moe.fused_moe_triton.fused_moe import fused_experts
@@ -28,6 +35,8 @@
import logging

_is_hip = is_hip()
_is_cpu_amx_available = cpu_has_amx_support()
_is_cpu = is_cpu()
_use_aiter = get_bool_env_var("SGLANG_USE_AITER") and _is_hip

if _use_aiter:
@@ -117,6 +126,11 @@ def process_weights_after_loading(self, layer: torch.nn.Module) -> None:
requires_grad=False,
)
torch.cuda.empty_cache()

# Pack weight to get better performance on CPU
if _is_cpu and _is_cpu_amx_available:
_process_weight_after_loading(layer, ["w13_weight", "w2_weight"])

return

def apply(
@@ -248,19 +262,64 @@ def forward_cpu(
no_combine: bool = False,
routed_scaling_factor: Optional[float] = None,
) -> torch.Tensor:
return moe_forward_native(
layer,
x,
use_grouped_topk,
top_k,
router_logits,
renormalize,
topk_group,
num_expert_group,
num_fused_shared_experts,
custom_routing_function,
correction_bias,
)
assert activation == "silu", f"activation = {activation} is not supported."

if (
getattr(layer, "use_intel_amx_backend", False)
and not apply_router_weight_on_input
):
topk_weights, topk_ids = select_experts(
hidden_states=x,
router_logits=router_logits,
use_grouped_topk=use_grouped_topk,
top_k=top_k,
renormalize=renormalize,
topk_group=topk_group,
num_expert_group=num_expert_group,
num_fused_shared_experts=num_fused_shared_experts,
custom_routing_function=custom_routing_function,
correction_bias=correction_bias,
routed_scaling_factor=routed_scaling_factor,
)

# TODO: support apply_router_weight_on_input in the fused_experts_cpu kernel
return torch.ops.sgl_kernel.fused_experts_cpu(
x,
layer.w13_weight,
layer.w2_weight,
topk_weights.to(
torch.float
), # TODO: the topk_weights of llama4 are computed via Llama4MoE:custom_routing_function as bfloat16, while the kernel requires float32
topk_ids,
True, # inplace
False, # use_int8_w8a8
False, # use_fp8_w8a16
None, # w1_scale
None, # w2_scale
None, # block_size
None, # a1_scale
None, # a2_scale
True, # is_vnni
)
else:
return moe_forward_native(
layer,
x,
use_grouped_topk,
top_k,
router_logits,
renormalize,
topk_group,
num_expert_group,
num_fused_shared_experts,
custom_routing_function,
correction_bias,
activation,
apply_router_weight_on_input,
inplace,
no_combine,
routed_scaling_factor,
)

def forward_tpu(self, *args, **kwargs) -> torch.Tensor:
raise NotImplementedError("The TPU backend currently does not support MoE.")
15 changes: 14 additions & 1 deletion python/sglang/srt/layers/vocab_parallel_embedding.py
@@ -20,10 +20,18 @@
QuantizeMethodBase,
method_has_implemented_embedding,
)
from sglang.srt.utils import set_weight_attrs
from sglang.srt.utils import (
PackWeightMethod,
cpu_has_amx_support,
is_cpu,
set_weight_attrs,
)

DEFAULT_VOCAB_PADDING_SIZE = 64

_is_cpu_amx_available = cpu_has_amx_support()
_is_cpu = is_cpu()


class UnquantizedEmbeddingMethod(QuantizeMethodBase):
"""Unquantized method for embeddings."""
@@ -549,6 +557,11 @@ def __init__(
use_presharded_weights=use_presharded_weights,
)
self.quant_config = quant_config

# We only support packing the LMHead weight if it's not quantized. For an LMHead with quant_config, the weight name will be "qweight"
if self.quant_config is None and _is_cpu and _is_cpu_amx_available:
self.quant_method = PackWeightMethod(weight_names=["weight"])

if bias:
self.bias = Parameter(
torch.empty(self.num_embeddings_per_partition, dtype=params_dtype)
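The LMHead change above attaches PackWeightMethod(weight_names=["weight"]) as the quant method when running on a CPU with AMX support and no quantization config. A rough sketch of the shape such a method can take, reusing the _process_weight_after_loading helper imported elsewhere in this PR (the real PackWeightMethod in sglang.srt.utils may differ):

import torch

from sglang.srt.utils import _process_weight_after_loading

# Rough sketch of a weight-packing "quant method"; not the exact class shipped
# in sglang.srt.utils.
class PackWeightMethodSketch:
    def __init__(self, weight_names):
        self.weight_names = weight_names

    def process_weights_after_loading(self, layer: torch.nn.Module) -> None:
        # Repack the named weights into the layout expected by the sgl_kernel
        # CPU GEMMs so later forwards can skip the conversion.
        _process_weight_after_loading(layer, self.weight_names)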