Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -171,6 +171,11 @@ def _setup_kernel(
replace_parameter(layer, "w13_weight", w13_new, prefer_copy=is_weight_update)
replace_parameter(layer, "w2_weight", w2_new, prefer_copy=is_weight_update)

# AITER backend requires weights to be marked as shuffled.
if self.unquantized_backend == UnquantizedMoeBackend.AITER:
layer.w13_weight.is_shuffled = True
layer.w2_weight.is_shuffled = True

if not is_weight_update:
# Setup moe kernel only on the first call. For the unquantized
# method, moe_quant_config is either the constant
Expand Down
6 changes: 6 additions & 0 deletions vllm/model_executor/layers/quantization/fp8.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,7 @@
)
from vllm.model_executor.layers.fused_moe.layer import UnquantizedFusedMoEMethod
from vllm.model_executor.layers.fused_moe.oracle.fp8 import (
Fp8MoeBackend,
convert_to_fp8_moe_kernel_format,
make_fp8_moe_kernel,
make_fp8_moe_quant_config,
Expand Down Expand Up @@ -766,6 +767,11 @@ def _setup_kernel(
replace_parameter(layer, f"w13_{self.weight_scale_name}", w13_scale)
replace_parameter(layer, f"w2_{self.weight_scale_name}", w2_scale)

# AITER backend requires weights to be marked as shuffled.
if self.fp8_backend == Fp8MoeBackend.AITER:
layer.w13_weight.is_shuffled = True
layer.w2_weight.is_shuffled = True

self.moe_quant_config = self.get_fused_moe_quant_config(layer)
if self.moe_quant_config:
assert self.experts_cls is not None
Expand Down
10 changes: 10 additions & 0 deletions vllm/model_executor/layers/quantization/mxfp4.py
Original file line number Diff line number Diff line change
Expand Up @@ -350,6 +350,11 @@ def _setup_kernel(
self.w13_precision_config = w13_scale
self.w2_precision_config = w2_scale

# AITER backend requires weights to be marked as shuffled.
if self.mxfp4_backend == Mxfp4MoeBackend.AITER_MXFP4_BF16:
layer.w13_weight.is_shuffled = True
layer.w2_weight.is_shuffled = True

if w13_bias is not None and w2_bias is not None:
replace_parameter(layer, "w13_bias", w13_bias)
replace_parameter(layer, "w2_bias", w2_bias)
Expand Down Expand Up @@ -678,6 +683,11 @@ def _setup_kernel(
self.w13_precision_config = w13_scale
self.w2_precision_config = w2_scale

# AITER backend requires weights to be marked as shuffled.
if self.mxfp4_backend == Mxfp4MoeBackend.AITER_MXFP4_BF16:
layer.w13_weight.is_shuffled = True
layer.w2_weight.is_shuffled = True

if w13_bias is not None and w2_bias is not None:
replace_parameter(layer, "w13_bias", w13_bias)
replace_parameter(layer, "w2_bias", w2_bias)
Expand Down
Loading