Merged
Changes from 2 commits
vllm/model_executor/layers/fused_moe/fused_marlin_moe.py (13 additions, 4 deletions)

```diff
@@ -501,7 +501,9 @@ def batched_fused_marlin_moe(
 class MarlinExpertsBase(mk.FusedMoEPermuteExpertsUnpermute):
     def __init__(self, quant_config: FusedMoEQuantConfig):
         # TODO (varun) : Enable activation quantization
-        assert quant_config.use_mxfp4_w4a16, "Supports only mxfp4_w4a16"
+        assert quant_config.use_mxfp4_w4a16 or quant_config.use_int4_w4a16, (
+            "Supports only mxfp4_w4a16 or int4_w4a16"
+        )
         super().__init__(quant_config)
 
     def moe_problem_size(
@@ -616,7 +618,11 @@ def apply(
             gating_output=None,
             topk_weights=topk_weights,
             topk_ids=topk_ids,
-            quant_type_id=scalar_types.float4_e2m1f.id,  # works only for w4a16
+            quant_type_id=(
+                scalar_types.uint4b8.id
+                if self.quant_config.use_int4_w4a16
+                else scalar_types.float4_e2m1f.id
+            ),  # works only for w4a16
             apply_router_weight_on_input=apply_router_weight_on_input,
             global_num_experts=global_num_experts,
             activation=activation,
@@ -720,8 +726,11 @@ def apply(
             w1_scale=self.w1_scale,
             w2_scale=self.w2_scale,
             gating_output=None,
-            quant_type_id=scalar_types.float4_e2m1f.id,  # works only for w4a16
-            apply_router_weight_on_input=apply_router_weight_on_input,
+            quant_type_id=(
+                scalar_types.uint4b8.id
+                if self.quant_config.use_int4_w4a16
+                else scalar_types.float4_e2m1f.id
+            ),  # works only for w4a16
             global_num_experts=global_num_experts,
             activation=activation,
             expert_map=expert_map,
```
vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors_moe.py

```diff
@@ -35,7 +35,11 @@
 from vllm.model_executor.layers.fused_moe.flashinfer_cutlass_moe import (
     is_valid_flashinfer_cutlass_fused_moe,
 )
-from vllm.model_executor.layers.fused_moe.fused_marlin_moe import fused_marlin_moe
+from vllm.model_executor.layers.fused_moe.fused_marlin_moe import (
+    BatchedMarlinExperts,
+    MarlinExperts,
+    fused_marlin_moe,
+)
 from vllm.model_executor.layers.quantization.compressed_tensors.schemes.compressed_tensors_wNa16 import (  # noqa
     WNA16_SUPPORTED_BITS,
     WNA16_SUPPORTED_TYPES_MAP,
@@ -1562,7 +1566,36 @@
     def get_fused_moe_quant_config(
         self, layer: torch.nn.Module
     ) -> FusedMoEQuantConfig | None:
-        return None
+        if self.num_bits != 4:
+            return None
+        return int4_w4a16_moe_quant_config(
+            w1_scale=layer.w13_weight_scale,
+            w2_scale=layer.w2_weight_scale,
+            w1_zp=None,
+            w2_zp=None,
+            block_shape=[0, self.group_size],
+        )
+
+    def select_gemm_impl(
+        self,
+        prepare_finalize: mk.FusedMoEPrepareAndFinalize,
+        layer: torch.nn.Module,
+    ) -> mk.FusedMoEPermuteExpertsUnpermute:
+        layer.w13_weight = layer.w13_weight_packed
+        layer.w2_weight = layer.w2_weight_packed
+        assert all([w is not None for w in [layer.w13_weight, layer.w2_weight]])
+        assert self.moe_quant_config is not None
+        if (
+            prepare_finalize.activation_format
+            == mk.FusedMoEActivationFormat.BatchedExperts
+        ):
+            return BatchedMarlinExperts(
+                max_num_tokens=prepare_finalize.max_num_tokens_per_rank(),
+                num_dispatchers=prepare_finalize.num_dispatchers(),
+                quant_config=self.moe_quant_config,
```
Comment on lines 1578 to 1614 (P1): Lose act-order indices when routing through modular Marlin experts

The new select_gemm_impl now returns BatchedMarlinExperts/MarlinExperts for CompressedTensorsWNA16MarlinMoEMethod, but those experts never forward the g_idx* and sort_indices* tensors that are passed to fused_marlin_moe in the non-modular path (apply still calls fused_marlin_moe(..., g_idx1=..., g_idx2=..., sort_indices1=..., sort_indices2=...)). For models quantized with grouped activation ordering (which populate these tensors during process_weights_after_loading), the modular kernel used for DP/EP will silently drop the act-order permutation information, causing incorrect expert outputs when the DeepEP/prepare-finalize path is enabled. Consider wiring the g-index tensors through the modular Marlin experts or gating the modular path off for act-ordered weights.

Contributor:

@luccafong - This call out seems reasonable. Looks like you'd need to plumb through g_idx1, g_idx2, sort_indices1 and sort_indices2? Can you please take a look. Thanks.

luccafong (Collaborator, Author) on Nov 12, 2025:

Hmm, it seems we will need to touch the base method signature of FusedMoEPrepareAndFinalize to add them, or we could init them in MarlinExperts.

luccafong (Collaborator, Author):

Let me try the latter approach.

luccafong (Collaborator, Author):

Resolved in 1e18fc8.
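For illustration, here is a minimal, hypothetical sketch of the "init them in MarlinExperts" idea discussed above: the act-order tensors (g_idx1, g_idx2, sort_indices1, sort_indices2, the names quoted in the review comment) are captured when the experts object is built, so the modular DP/EP path forwards them to the Marlin kernel without widening the FusedMoEPrepareAndFinalize interface. The class and method names below are simplified stand-ins and do not reproduce the actual change in 1e18fc8.

```python
from collections.abc import Callable

import torch


class ActOrderAwareExperts:
    """Hypothetical sketch: hold act-order tensors on the experts object and
    forward them on every kernel call. The kernel itself (e.g. vLLM's
    fused_marlin_moe) is passed in rather than re-implemented here."""

    def __init__(
        self,
        marlin_kernel: Callable[..., torch.Tensor],
        g_idx1: torch.Tensor | None = None,
        g_idx2: torch.Tensor | None = None,
        sort_indices1: torch.Tensor | None = None,
        sort_indices2: torch.Tensor | None = None,
    ) -> None:
        self._kernel = marlin_kernel
        # Captured once at construction time (e.g. inside select_gemm_impl),
        # so the batched/modular path sees the same permutation info as the
        # non-modular apply() path.
        self._g_idx1 = g_idx1
        self._g_idx2 = g_idx2
        self._sort_indices1 = sort_indices1
        self._sort_indices2 = sort_indices2

    def run(self, *args, **kwargs) -> torch.Tensor:
        # Forward the stored act-order tensors alongside whatever the caller
        # provides, without overriding explicitly passed values.
        kwargs.setdefault("g_idx1", self._g_idx1)
        kwargs.setdefault("g_idx2", self._g_idx2)
        kwargs.setdefault("sort_indices1", self._sort_indices1)
        kwargs.setdefault("sort_indices2", self._sort_indices2)
        return self._kernel(*args, **kwargs)
```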

```diff
+            )
+        else:
+            return MarlinExperts(self.moe_quant_config)
 
     def apply(
         self,
@@ -1573,7 +1606,7 @@
         renormalize: bool,
         use_grouped_topk: bool = False,
         topk_group: int | None = None,
         num_expert_group: int | None = None,
Check failure on line 1609 in vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors_moe.py (GitHub Actions / pre-commit):

Argument "max_num_tokens" to "BatchedMarlinExperts" has incompatible type "int | None"; expected "int" [arg-type]
```diff
         global_num_experts: int = -1,
         expert_map: torch.Tensor | None = None,
         custom_routing_function: Callable | None = None,
```
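The pre-commit failure flagged above is mypy pointing out that prepare_finalize.max_num_tokens_per_rank() is typed as int | None, while BatchedMarlinExperts expects a plain int. Assuming the batched-experts path always has a concrete per-rank token budget at this point (an assumption, not something the diff states), one way to satisfy the type checker is to narrow the Optional before constructing the experts. The helper name below is hypothetical and not part of the PR:

```python
# Hypothetical helper, not from the PR: narrows the "int | None" returned by
# max_num_tokens_per_rank() to a plain int so the BatchedMarlinExperts
# constructor (annotated to take int) type-checks.
def _require_max_num_tokens(prepare_finalize) -> int:
    max_num_tokens = prepare_finalize.max_num_tokens_per_rank()
    assert max_num_tokens is not None, (
        "batched experts path requires max_num_tokens_per_rank() to be set"
    )
    return max_num_tokens


# Usage inside the batched branch of select_gemm_impl would then look like:
#   return BatchedMarlinExperts(
#       max_num_tokens=_require_max_num_tokens(prepare_finalize),
#       num_dispatchers=prepare_finalize.num_dispatchers(),
#       quant_config=self.moe_quant_config,
#   )
```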