Skip to content
Merged
3 changes: 2 additions & 1 deletion benchmarks/kernels/benchmark_cutlass_moe_fp8.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@
import vllm.model_executor.layers.fused_moe.modular_kernel as mk
from tests.kernels.moe.utils import make_dummy_moe_config
from vllm import _custom_ops as ops
from vllm.model_executor.layers.fused_moe.activation import MoEActivation
from vllm.model_executor.layers.fused_moe.config import fp8_w8a8_moe_quant_config
from vllm.model_executor.layers.fused_moe.cutlass_moe import CutlassExpertsFp8
from vllm.model_executor.layers.fused_moe.fused_moe import fused_experts, fused_topk
Expand Down Expand Up @@ -161,7 +162,7 @@ def bench_run(
w2_fp8q_cutlass,
topk_weights,
topk_ids,
activation="silu",
activation=MoEActivation.SILU,
global_num_experts=num_experts,
)
torch.cuda.synchronize()
Expand Down
4 changes: 3 additions & 1 deletion benchmarks/kernels/benchmark_moe.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@
from ray.experimental.tqdm_ray import tqdm

from vllm.model_executor.layers.fused_moe import fused_topk
from vllm.model_executor.layers.fused_moe.activation import MoEActivation
from vllm.model_executor.layers.fused_moe.config import (
FusedMoEConfig,
FusedMoEParallelConfig,
Expand Down Expand Up @@ -211,7 +212,8 @@ def run():
hidden_dim=hidden_size,
intermediate_size_per_partition=shard_intermediate_size,
num_local_experts=num_experts,
activation="silu",
num_logical_experts=num_experts,
activation=MoEActivation.SILU,
moe_parallel_config=FusedMoEParallelConfig.make_no_parallel(),
in_dtype=init_dtype,
routing_method=RoutingMethodType.TopK,
Expand Down
3 changes: 2 additions & 1 deletion tests/kernels/moe/modular_kernel_tools/common.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,7 @@
)
from vllm.forward_context import set_forward_context
from vllm.model_executor.layers.fused_moe import fused_topk
from vllm.model_executor.layers.fused_moe.activation import MoEActivation
from vllm.model_executor.layers.fused_moe.all2all_utils import (
maybe_make_prepare_finalize,
)
Expand Down Expand Up @@ -599,7 +600,7 @@ def next_power_of_2(x):
moe_parallel_config=moe_parallel_config,
in_dtype=config.dtype,
max_num_tokens=next_power_of_2(config.M),
activation="silu",
activation=MoEActivation.SILU,
device=vllm_config.device_config.device,
routing_method=RoutingMethodType.DeepSeekV3,
)
Expand Down
9 changes: 5 additions & 4 deletions tests/kernels/moe/test_cpu_fused_moe.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@

from tests.kernels.allclose_default import get_default_atol, get_default_rtol
from vllm._custom_ops import cpu_fused_moe, cpu_prepack_moe_weight
from vllm.model_executor.layers.fused_moe.activation import MoEActivation
from vllm.model_executor.layers.fused_moe.cpu_fused_moe import _CPU_MOE_ACT_FN
from vllm.platforms import current_platform
from vllm.utils.torch_utils import set_random_seed
Expand All @@ -19,7 +20,7 @@
HIDDEN_DIM = [128, 2880]
INTERMEDIATE_DIM = [128, 2880]
BATCH_SIZE = [1, 64, 256]
ACT = ["silu", "swigluoai"]
ACT = [MoEActivation.SILU, MoEActivation.SWIGLUOAI]
USE_BIAS = [True, False]
ISA = ["amx", "vec"] if torch._C._cpu._is_amx_tile_supported() else ["vec"]
DTYPE = [torch.bfloat16]
Expand All @@ -33,7 +34,7 @@ def ref_fused_moe(
w2_bias: torch.Tensor | None,
topk_weights: torch.Tensor,
topk_ids: torch.Tensor,
activation: str,
activation: MoEActivation,
) -> torch.Tensor:
len_experts = w13.size(0)

Expand Down Expand Up @@ -103,7 +104,7 @@ def test_cpu_fused_moe(
intermediate_size: int,
use_bias: bool,
dtype: torch.dtype,
act: str,
act: MoEActivation,
isa: str,
):
set_random_seed(0)
Expand Down Expand Up @@ -153,7 +154,7 @@ def test_cpu_fused_moe(
w2_bias,
topk_weight,
topk_ids,
act,
act.value,
isa,
)

Expand Down
3 changes: 2 additions & 1 deletion tests/kernels/moe/test_cutlass_moe.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@
from vllm import _custom_ops as ops
from vllm.config import ParallelConfig, VllmConfig, set_current_vllm_config
from vllm.model_executor.layers.fused_moe import fused_experts, fused_topk
from vllm.model_executor.layers.fused_moe.activation import MoEActivation
from vllm.model_executor.layers.fused_moe.config import (
FUSED_MOE_UNQUANTIZED_CONFIG,
FusedMoEQuantConfig,
Expand Down Expand Up @@ -531,7 +532,7 @@ def test_run_cutlass_moe_fp8(
c_strides1 = torch.full((e,), 2 * n, device="cuda", dtype=torch.int64)
c_strides2 = torch.full((e,), k, device="cuda", dtype=torch.int64)

activation = "silu"
activation = MoEActivation.SILU
a1q, a1q_scale = moe_kernel_quantize_input(
mt.a, mt.a_scale, torch.float8_e4m3fn, per_act_token
)
Expand Down
3 changes: 2 additions & 1 deletion tests/kernels/moe/test_deepep_deepgemm_moe.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@

from vllm.config import VllmConfig, set_current_vllm_config
from vllm.forward_context import set_forward_context
from vllm.model_executor.layers.fused_moe.activation import MoEActivation
from vllm.model_executor.layers.fused_moe.config import (
FusedMoEQuantConfig,
fp8_w8a8_moe_quant_config,
Expand Down Expand Up @@ -324,7 +325,7 @@ def build_expert_map():
w2=w2,
topk_weights=test_tensors.topk_weights,
topk_ids=test_tensors.topk,
activation="silu",
activation=MoEActivation.SILU,
global_num_experts=num_experts,
expert_map=build_expert_map(),
apply_router_weight_on_input=False,
Expand Down
3 changes: 2 additions & 1 deletion tests/kernels/moe/test_deepep_moe.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@
from vllm.config import VllmConfig, set_current_vllm_config
from vllm.model_executor.layers.activation import SiluAndMul
from vllm.model_executor.layers.fused_moe import TritonExperts
from vllm.model_executor.layers.fused_moe.activation import MoEActivation
from vllm.model_executor.layers.fused_moe.config import (
FusedMoEQuantConfig,
)
Expand Down Expand Up @@ -260,7 +261,7 @@ def process_chunk(chunk_start, chunk_end, skip_result_store=False):
w2=w2,
topk_weights=topk_weights_chunk,
topk_ids=topk_chunk,
activation="silu",
activation=MoEActivation.SILU,
global_num_experts=num_experts,
expert_map=build_expert_map(),
apply_router_weight_on_input=False,
Expand Down
20 changes: 12 additions & 8 deletions tests/kernels/moe/test_flashinfer.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@

import vllm.model_executor.layers.fused_moe.modular_kernel as mk
from vllm.config import ParallelConfig, VllmConfig, set_current_vllm_config
from vllm.model_executor.layers.fused_moe.activation import MoEActivation
from vllm.model_executor.layers.fused_moe.config import (
FusedMoEConfig,
FusedMoEParallelConfig,
Expand Down Expand Up @@ -93,9 +94,14 @@ class TestData:

@staticmethod
def make_moe_tensors_8bit(
m: int, k: int, n: int, e: int, is_trtllm: bool, activation: str = "silu"
m: int,
k: int,
n: int,
e: int,
is_trtllm: bool,
activation: MoEActivation = MoEActivation.SILU,
) -> "TestData":
is_gated = activation != "relu2_no_mul"
is_gated = activation.is_gated

hidden_states = torch.randn((m, k), device="cuda", dtype=torch.bfloat16) / 10
w13 = torch.randn(
Expand Down Expand Up @@ -194,7 +200,7 @@ def test_flashinfer_per_tensor_moe_fp8_no_graph(
topk_weights=topk_weights,
topk_ids=topk_ids,
inplace=False,
activation="silu",
activation=MoEActivation.SILU,
global_num_experts=e,
expert_map=None,
apply_router_weight_on_input=True,
Expand All @@ -219,21 +225,19 @@ def test_flashinfer_per_tensor_moe_fp8_no_graph(
@pytest.mark.parametrize("m,n,k", MNK_FACTORS)
@pytest.mark.parametrize("e", NUM_EXPERTS)
@pytest.mark.parametrize("topk", TOP_KS)
@pytest.mark.parametrize("activation", ["silu", "relu2_no_mul"])
@pytest.mark.parametrize("activation", [MoEActivation.SILU, MoEActivation.RELU2_NO_MUL])
def test_flashinfer_cutlass_moe_fp8_no_graph(
m: int,
n: int,
k: int,
e: int,
topk: int,
activation: str,
activation: MoEActivation,
monkeypatch,
workspace_init,
):
set_random_seed(7)
monkeypatch.setenv("VLLM_FUSED_MOE_CHUNK_SIZE", "8192")
assert activation in ["silu", "relu2_no_mul"]
is_act_and_mul = activation == "silu_and_mul"
with set_current_vllm_config(vllm_config):
td = TestData.make_moe_tensors_8bit(
m, k, n, e, is_trtllm=False, activation=activation
Expand Down Expand Up @@ -292,7 +296,7 @@ def get_fused_moe_quant_config(n: torch.nn.Module) -> FusedMoEQuantConfig:
device="cuda",
moe_parallel_config=FusedMoEParallelConfig.make_no_parallel(),
in_dtype=torch.bfloat16,
is_act_and_mul=is_act_and_mul,
is_act_and_mul=activation.is_gated,
routing_method=RoutingMethodType.TopK,
)

Expand Down
11 changes: 5 additions & 6 deletions tests/kernels/moe/test_flashinfer_moe.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@
from vllm import _custom_ops as ops
from vllm.config import ParallelConfig, VllmConfig, set_current_vllm_config
from vllm.model_executor.layers.fused_moe import fused_topk
from vllm.model_executor.layers.fused_moe.activation import MoEActivation
from vllm.model_executor.layers.fused_moe.config import (
FusedMoEConfig,
FusedMoEParallelConfig,
Expand Down Expand Up @@ -54,7 +55,7 @@
@pytest.mark.parametrize("e", [40, 64, 256])
@pytest.mark.parametrize("topk", [1, 6, 8])
@pytest.mark.parametrize("dtype", [torch.bfloat16])
@pytest.mark.parametrize("activation", ["silu_and_mul", "relu2"])
@pytest.mark.parametrize("activation", [MoEActivation.SILU, MoEActivation.RELU2_NO_MUL])
@torch.inference_mode()
def test_flashinfer_fp4_moe_no_graph(
m: int,
Expand All @@ -63,7 +64,7 @@ def test_flashinfer_fp4_moe_no_graph(
e: int,
topk: int,
dtype: torch.dtype,
activation: str,
activation: MoEActivation,
workspace_init,
):
set_random_seed(7)
Expand All @@ -73,7 +74,7 @@ def test_flashinfer_fp4_moe_no_graph(
a = torch.randn((m, k), device="cuda", dtype=dtype) / 10

quant_blocksize = 16
is_gated_act = activation == "silu_and_mul"
is_gated_act = activation.is_gated

w1_q, w2_q, quant_config = make_test_quant_config(
e,
Expand Down Expand Up @@ -112,15 +113,13 @@ def test_flashinfer_fp4_moe_no_graph(
inplace=False,
)

fi_activation = {"silu_and_mul": "silu", "relu2": "relu2_no_mul"}[activation]

flashinfer_output = flashinfer_experts(
hidden_states=a,
w1=w1_q,
w2=w2_q,
topk_weights=topk_weights,
topk_ids=topk_ids,
activation=fi_activation,
activation=activation,
)

# Reference check:
Expand Down
3 changes: 2 additions & 1 deletion tests/kernels/moe/test_modular_oai_triton_moe.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@
import pytest
import torch

from vllm.model_executor.layers.fused_moe.activation import MoEActivation
from vllm.utils.import_utils import has_triton_kernels

if not has_triton_kernels():
Expand Down Expand Up @@ -192,7 +193,7 @@ def oai_triton_moe_impl(
w2=w2,
topk_weights=topk_weights,
topk_ids=topk_ids,
activation="swigluoai",
activation=MoEActivation.SWIGLUOAI,
global_num_experts=num_experts,
expert_map=None,
apply_router_weight_on_input=False,
Expand Down
24 changes: 19 additions & 5 deletions tests/kernels/moe/test_moe.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,7 @@
from vllm.distributed.parallel_state import init_distributed_environment
from vllm.forward_context import get_forward_context, set_forward_context
from vllm.model_executor.layers.fused_moe import (
MoEActivation,
fused_topk,
)
from vllm.model_executor.layers.fused_moe.config import (
Expand Down Expand Up @@ -1155,7 +1156,10 @@ def test_fused_marlin_moe_with_bias(m):
@pytest.mark.parametrize("m", [1, 64, 256])
@pytest.mark.parametrize("n,k", [(1024, 1024), (2048, 2048)])
@pytest.mark.parametrize("e,topk", [(8, 2), (64, 4)])
def test_fused_marlin_moe_non_gated(m: int, n: int, k: int, e: int, topk: int):
@pytest.mark.parametrize("activation", [MoEActivation.RELU2_NO_MUL])
def test_fused_marlin_moe_non_gated(
m: int, n: int, k: int, e: int, topk: int, activation: MoEActivation
):
"""Test Marlin MoE with non-gated activation (relu2_no_mul).

Non-gated activations like relu2 don't have the gate-up projection pattern,
Expand Down Expand Up @@ -1198,7 +1202,7 @@ def test_fused_marlin_moe_non_gated(m: int, n: int, k: int, e: int, topk: int):
w2_data.w_ref,
score,
topk,
activation="relu2",
activation=activation,
)

marlin_output = fused_marlin_moe(
Expand All @@ -1223,7 +1227,7 @@ def test_fused_marlin_moe_non_gated(m: int, n: int, k: int, e: int, topk: int):
w2_zeros=w2_data.zeros,
quant_type_id=quant_type.id,
is_k_full=is_k_full,
activation="relu2_no_mul",
activation=activation,
)

torch.testing.assert_close(marlin_output, torch_output, atol=1e-1, rtol=0)
Expand Down Expand Up @@ -1330,9 +1334,18 @@ def test_moe_sum(m: int, topk: int, k: int, dtype: torch.dtype):
@pytest.mark.parametrize("topk", [2])
@pytest.mark.parametrize("dtype", [torch.float32, torch.bfloat16])
@pytest.mark.parametrize("with_bias", [False, True])
@pytest.mark.parametrize("activation", ["silu"])
@pytest.mark.parametrize("activation", [MoEActivation.SILU])
@pytest.mark.skipif(not current_platform.is_cpu(), reason="CPU only test")
def test_cpu_fused_moe_basic(m, n, k, e, topk, dtype, with_bias, activation):
def test_cpu_fused_moe_basic(
m: int,
n: int,
k: int,
e: int,
topk: int,
dtype: torch.dtype,
with_bias: bool,
activation: MoEActivation,
):
from vllm.model_executor.layers.fused_moe.cpu_fused_moe import CPUFusedMOE

device = "cpu"
Expand Down Expand Up @@ -1608,6 +1621,7 @@ def test_unquantized_bf16_flashinfer_trtllm_backend(
hidden_dim=k,
intermediate_size_per_partition=n,
num_local_experts=e,
num_logical_experts=e,
activation="silu",
device="cuda",
moe_parallel_config=FusedMoEParallelConfig.make_no_parallel(),
Expand Down
3 changes: 2 additions & 1 deletion tests/kernels/moe/test_pplx_cutlass_moe.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@
from vllm import _custom_ops as ops
from vllm.config import VllmConfig, set_current_vllm_config
from vllm.model_executor.layers.fused_moe import fused_topk
from vllm.model_executor.layers.fused_moe.activation import MoEActivation
from vllm.model_executor.layers.fused_moe.config import (
FusedMoEConfig,
FusedMoEParallelConfig,
Expand Down Expand Up @@ -149,7 +150,7 @@ def make_moe_config() -> FusedMoEConfig:
num_local_experts=num_local_experts,
num_logical_experts=num_experts,
moe_parallel_config=FusedMoEParallelConfig.make_no_parallel(),
activation="silu",
activation=MoEActivation.SILU,
in_dtype=torch.bfloat16,
device="cuda",
routing_method=RoutingMethodType.Llama4,
Expand Down
Loading