Commit e4fff80

enable modular experts for compressed tensor marlin wn16 moe
Signed-off-by: Lu Fang <[email protected]>
1 parent 35d801f commit e4fff80

File tree

3 files changed: 69 additions & 7 deletions

vllm/model_executor/layers/fused_moe/fused_marlin_moe.py
vllm/model_executor/layers/fused_moe/layer.py
vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors_moe.py

vllm/model_executor/layers/fused_moe/fused_marlin_moe.py

Lines changed: 8 additions & 3 deletions
@@ -501,7 +501,9 @@ def batched_fused_marlin_moe(
 class MarlinExpertsBase(mk.FusedMoEPermuteExpertsUnpermute):
     def __init__(self, quant_config: FusedMoEQuantConfig):
         # TODO (varun) : Enable activation quantization
-        assert quant_config.use_mxfp4_w4a16, "Supports only mxfp4_w4a16"
+        assert quant_config.use_mxfp4_w4a16 or quant_config.use_int4_w4a16, (
+            "Supports only mxfp4_w4a16 or int4_w4a16"
+        )
         super().__init__(quant_config)
 
     def moe_problem_size(

@@ -720,8 +722,11 @@ def apply(
             w1_scale=self.w1_scale,
             w2_scale=self.w2_scale,
             gating_output=None,
-            quant_type_id=scalar_types.float4_e2m1f.id,  # works only for w4a16
-            apply_router_weight_on_input=apply_router_weight_on_input,
+            quant_type_id=(
+                scalar_types.uint4b8.id
+                if self.quant_config.use_int4_w4a16
+                else scalar_types.float4_e2m1f.id
+            ),  # works only for w4a16 or mxfp4_w4a16
             global_num_experts=global_num_experts,
             activation=activation,
             expert_map=expert_map,

vllm/model_executor/layers/fused_moe/layer.py

Lines changed: 12 additions & 2 deletions
@@ -432,11 +432,21 @@ def apply(
             zero_expert_num=zero_expert_num,
             zero_expert_type=zero_expert_type,
         )
+        w1 = (
+            layer.w13_weight_packed
+            if hasattr(layer, "w13_weight_packed")
+            else layer.w13_weight
+        )
+        w2 = (
+            layer.w2_weight_packed
+            if hasattr(layer, "w2_weight_packed")
+            else layer.w2_weight
+        )
 
         result = self.fused_experts(
             hidden_states=x,
-            w1=layer.w13_weight,
-            w2=layer.w2_weight,
+            w1=w1,
+            w2=w2,
             topk_weights=topk_weights,
             topk_ids=topk_ids,
             inplace=self.allow_inplace,
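
The hasattr fallback is needed because the compressed-tensors WNA16 MoE method registers its packed integer weights as w13_weight_packed / w2_weight_packed, while other quantization methods keep the plain w13_weight / w2_weight names. A minimal sketch of the same lookup as a reusable helper (resolve_moe_weight is a hypothetical name, not part of the commit):

import torch

def resolve_moe_weight(layer: torch.nn.Module, name: str) -> torch.Tensor:
    # Prefer the packed parameter when the layer registered one
    # (e.g. "w13_weight_packed"), otherwise fall back to the plain name.
    packed = f"{name}_packed"
    return getattr(layer, packed) if hasattr(layer, packed) else getattr(layer, name)

# Equivalent to the diff above:
#   w1 = resolve_moe_weight(layer, "w13_weight")
#   w2 = resolve_moe_weight(layer, "w2_weight")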

vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors_moe.py

Lines changed: 49 additions & 2 deletions
@@ -35,7 +35,11 @@
 from vllm.model_executor.layers.fused_moe.flashinfer_cutlass_moe import (
     is_valid_flashinfer_cutlass_fused_moe,
 )
-from vllm.model_executor.layers.fused_moe.fused_marlin_moe import fused_marlin_moe
+from vllm.model_executor.layers.fused_moe.fused_marlin_moe import (
+    BatchedMarlinExperts,
+    MarlinExperts,
+    fused_marlin_moe,
+)
 from vllm.model_executor.layers.quantization.compressed_tensors.schemes.compressed_tensors_wNa16 import (  # noqa
     WNA16_SUPPORTED_BITS,
     WNA16_SUPPORTED_TYPES_MAP,

@@ -1562,7 +1566,50 @@ def process_weights_after_loading(self, layer: torch.nn.Module) -> None:
     def get_fused_moe_quant_config(
         self, layer: torch.nn.Module
     ) -> FusedMoEQuantConfig | None:
-        return None
+        assert self.num_bits == 4 or self.num_bits == 8
+        config_builder = (
+            int4_w4a16_moe_quant_config
+            if self.num_bits == 4
+            else int8_w8a16_moe_quant_config
+        )
+
+        return config_builder(
+            w1_scale=layer.w13_weight_scale,
+            w2_scale=layer.w2_weight_scale,
+            w1_zp=None,
+            w2_zp=None,
+            block_shape=[0, self.group_size],
+        )
+
+    def select_gemm_impl(
+        self,
+        prepare_finalize: mk.FusedMoEPrepareAndFinalize,
+        layer: torch.nn.Module,
+    ) -> mk.FusedMoEPermuteExpertsUnpermute:
+        if (
+            prepare_finalize.activation_format
+            == mk.FusedMoEActivationFormat.BatchedExperts
+        ):
+            return BatchedMarlinExperts(
+                max_num_tokens=prepare_finalize.max_num_tokens_per_rank(),
+                num_dispatchers=prepare_finalize.num_dispatchers(),
+                quant_config=self.moe_quant_config,
+            )
+        else:
+            layer.w13_weight = (
+                self.w13_weight_triton_tensor
+                if layer.w13_weight is None
+                else layer.w13_weight
+            )
+            layer.w2_weight = (
+                self.w2_weight_triton_tensor
+                if layer.w2_weight is None
+                else layer.w2_weight
+            )
+            assert all([w is not None for w in [layer.w13_weight, layer.w2_weight]])
+
+            assert self.moe_quant_config is not None
+            return MarlinExperts(self.moe_quant_config)
 
     def apply(
         self,
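
Taken together, the two new hooks are what let this quantization method participate in vLLM's modular MoE kernel framework (the mk module referenced above): get_fused_moe_quant_config now returns a real WNA16 config instead of None, and select_gemm_impl hands back a Marlin experts implementation matching the activation format. A rough sketch of how the framework composes them, assuming the FusedMoEModularKernel(prepare_finalize, fused_experts) constructor from modular_kernel.py; the actual wiring lives in vLLM's FusedMoE layer code, and build_wna16_marlin_kernel is a hypothetical helper, not part of the commit.

import vllm.model_executor.layers.fused_moe.modular_kernel as mk

def build_wna16_marlin_kernel(
    method,  # the compressed-tensors WNA16 MoE method shown in the diff
    layer,
    prepare_finalize: mk.FusedMoEPrepareAndFinalize,
) -> mk.FusedMoEModularKernel:
    # The quant config (int4_w4a16 or int8_w8a16) carries the weight scales and
    # group size that MarlinExperts / BatchedMarlinExperts need.
    quant_config = method.get_fused_moe_quant_config(layer)
    assert quant_config is not None

    # Contiguous activations select MarlinExperts; the batched-experts format
    # (e.g. expert-parallel all-to-all dispatch) selects BatchedMarlinExperts.
    experts = method.select_gemm_impl(prepare_finalize, layer)

    # The modular kernel chains prepare (dispatch/quantize), the expert GEMMs,
    # and finalize (combine) into one fused MoE forward.
    return mk.FusedMoEModularKernel(prepare_finalize, experts)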
