Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
201 changes: 201 additions & 0 deletions tests/kernels/moe/test_triton_moe_no_act_mul.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,201 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""Tests for MoE with non-gated activations (*_no_mul).

These tests verify that MoE layers work correctly with activations like
silu_no_mul, gelu_no_mul, relu2_no_mul where the activation output dimension
equals N (not N // 2 like gated activations).
"""

import pytest
import torch

from vllm.model_executor.layers.fused_moe.config import (
FUSED_MOE_UNQUANTIZED_CONFIG,
)
from vllm.model_executor.layers.fused_moe.fused_moe import TritonExperts
from vllm.model_executor.layers.fused_moe.utils import (
GELU_NO_MUL,
RELU2_NO_MUL,
SILU_NO_MUL,
)
from vllm.platforms import current_platform

# Test parameters
# The four lists of sizes feed the pytest.mark.parametrize decorators on
# test_triton_experts_no_mul_activation, which therefore runs over their
# cross product (together with NO_MUL_ACTIVATIONS).
# m: number of rows in hidden_states, which is created as (m, k).
M_SIZES = [1, 16, 64]
# n: first non-expert dim of w1 (E, n, k) and last dim of w2 (E, k, n).
N_SIZES = [128, 256]
# k: last dim of hidden_states and of the (m, k) output.
K_SIZES = [64, 128]
# topk: number of expert ids / routing weights generated per row.
TOPK_VALUES = [1, 2]
# Expert count used to size w1/w2 and passed as both the global and the
# local expert count in the parametrized test.
NUM_EXPERTS = 8
# Identifiers of the non-gated (*_no_mul) activations under test.
NO_MUL_ACTIVATIONS = [SILU_NO_MUL, GELU_NO_MUL, RELU2_NO_MUL]


def make_test_tensors(
    m: int,
    n: int,
    k: int,
    num_experts: int,
    topk: int,
    dtype: torch.dtype = torch.bfloat16,
    device: str = "cuda",
):
    """Build random inputs for an MoE layer with a non-gated activation.

    With a non-gated (*_no_mul) activation the second projection consumes the
    full N-wide output of the first one, so w2 is (E, K, N) rather than
    (E, K, N // 2).

    Returns:
        A 5-tuple ``(hidden_states, w1, w2, topk_weights, topk_ids)`` where
        - hidden_states is (m, k) in ``dtype``,
        - w1 is (num_experts, n, k) in ``dtype``, scaled by 0.1,
        - w2 is (num_experts, k, n) in ``dtype``, scaled by 0.1,
        - topk_weights is (m, topk) float32 with every entry equal to 1 / topk,
        - topk_ids is (m, topk) with random integers in [0, num_experts).
    """
    tensor_opts = {"dtype": dtype, "device": device}
    weight_scale = 0.1

    # The random draws are issued in a fixed order (activations, w1, w2, ids)
    # so that a given seed always yields the same set of tensors.
    activations = torch.randn(m, k, **tensor_opts)
    up_proj = torch.randn(num_experts, n, k, **tensor_opts) * weight_scale
    down_proj = torch.randn(num_experts, k, n, **tensor_opts) * weight_scale

    # Uniform routing weights: each of the topk slots gets the same share.
    uniform_weights = torch.ones(m, topk, dtype=torch.float32, device=device)
    uniform_weights = uniform_weights / topk
    expert_ids = torch.randint(0, num_experts, (m, topk), device=device)

    return activations, up_proj, down_proj, uniform_weights, expert_ids


@pytest.mark.skipif(
    not current_platform.has_device_capability(80),
    reason="Requires compute capability >= 8.0",
)
@pytest.mark.parametrize("m", M_SIZES)
@pytest.mark.parametrize("n", N_SIZES)
@pytest.mark.parametrize("k", K_SIZES)
@pytest.mark.parametrize("topk", TOPK_VALUES)
@pytest.mark.parametrize("activation", NO_MUL_ACTIVATIONS)
@torch.inference_mode()
def test_triton_experts_no_mul_activation(
    m: int,
    n: int,
    k: int,
    topk: int,
    activation: str,
):
    """Run TritonExperts end to end with a non-gated (*_no_mul) activation.

    Checks the shapes reported by ``workspace_shapes`` first, then allocates
    flat workspaces of exactly that size, calls ``apply`` and performs sanity
    checks on the result (shape, no NaN/Inf, not all zeros).
    """
    hidden_states, w1, w2, topk_weights, topk_ids = make_test_tensors(
        m, n, k, NUM_EXPERTS, topk
    )
    experts = TritonExperts(FUSED_MOE_UNQUANTIZED_CONFIG)

    ws1_shape, ws2_shape, out_shape = experts.workspace_shapes(
        M=m,
        N=n,
        K=k,
        topk=topk,
        global_num_experts=NUM_EXPERTS,
        local_num_experts=NUM_EXPERTS,
        expert_tokens_meta=None,
        activation=activation,
    )

    # With a no_mul activation the activation output is N wide (not N // 2),
    # so both workspaces are expected to be sized by max(N, K).
    expected_ws = (m, topk, max(n, k))
    expected_out = (m, k)
    assert ws1_shape == expected_ws, (
        f"workspace1 shape mismatch: expected {expected_ws}, got {ws1_shape}"
    )
    assert ws2_shape == expected_ws, (
        f"workspace2 shape mismatch: expected {expected_ws}, got {ws2_shape}"
    )
    assert out_shape == expected_out, (
        f"output shape mismatch: expected {expected_out}, got {out_shape}"
    )

    def _flat_buffer(shape):
        # 1-D scratch buffer holding exactly as many elements as the 3-D
        # shape describes, matching hidden_states in dtype and device.
        dim0, dim1, dim2 = shape
        return torch.empty(
            dim0 * dim1 * dim2,
            dtype=hidden_states.dtype,
            device=hidden_states.device,
        )

    workspace1 = _flat_buffer(ws1_shape)
    workspace2 = _flat_buffer(ws2_shape)
    # Start from zeros so that the "not all zeros" check below is meaningful.
    output = hidden_states.new_zeros((m, k))

    experts.apply(
        output=output,
        hidden_states=hidden_states,
        w1=w1,
        w2=w2,
        topk_weights=topk_weights,
        topk_ids=topk_ids,
        activation=activation,
        global_num_experts=NUM_EXPERTS,
        expert_map=None,
        a1q_scale=None,
        a2_scale=None,
        workspace13=workspace1,
        workspace2=workspace2,
        expert_tokens_meta=None,
        apply_router_weight_on_input=False,
    )

    assert output.shape == expected_out, (
        f"Expected shape {expected_out}, got {output.shape}"
    )
    has_nan = torch.isnan(output).any()
    assert not has_nan, "Output contains NaN"
    has_inf = torch.isinf(output).any()
    assert not has_inf, "Output contains Inf"
    total_magnitude = output.abs().sum()
    assert total_magnitude > 0, "Output is all zeros"


@pytest.mark.skipif(
    not current_platform.has_device_capability(80),
    reason="Requires compute capability >= 8.0",
)
@torch.inference_mode()
def test_workspace_shapes_no_mul_vs_gated():
    """Test that workspace shapes differ correctly between gated and non-gated.

    Queries ``TritonExperts.workspace_shapes`` once with SILU_NO_MUL and once
    with the gated "silu" activation and checks that the last dimension of
    workspace1 is ``max(activation_out_dim, K)``, where activation_out_dim is
    N for the non-gated case and N // 2 for the gated one. The output shape
    must be (M, K) in both cases.
    """
    M, N, K, topk = 64, 256, 128, 2

    # TritonExperts is already imported at module level; no local re-import.
    experts = TritonExperts(FUSED_MOE_UNQUANTIZED_CONFIG)

    def shapes_for(activation: str):
        # Keyword arguments (as in the parametrized test of this module) keep
        # the eight-argument call readable and robust to argument reordering.
        return experts.workspace_shapes(
            M=M,
            N=N,
            K=K,
            topk=topk,
            global_num_experts=NUM_EXPERTS,
            local_num_experts=NUM_EXPERTS,
            expert_tokens_meta=None,
            activation=activation,
        )

    ws1_no_mul, _, out_no_mul = shapes_for(SILU_NO_MUL)
    ws1_gated, _, out_gated = shapes_for("silu")

    # For no_mul: activation_out_dim = N
    # For gated: activation_out_dim = N // 2
    # workspace1 should use max(activation_out_dim, K)
    activation_out_dim_no_mul = N
    activation_out_dim_gated = N // 2

    assert ws1_no_mul[2] == max(activation_out_dim_no_mul, K), (
        f"no_mul workspace1 last dim should be "
        f"max({activation_out_dim_no_mul}, {K}), got {ws1_no_mul[2]}"
    )
    assert ws1_gated[2] == max(activation_out_dim_gated, K), (
        f"gated workspace1 last dim should be "
        f"max({activation_out_dim_gated}, {K}), got {ws1_gated[2]}"
    )

    # Output shapes should be the same
    assert out_no_mul == out_gated == (M, K), (
        f"output shapes should both be {(M, K)}, "
        f"got {out_no_mul} (no_mul) and {out_gated} (gated)"
    )


@pytest.mark.skipif(
    not current_platform.has_device_capability(80),
    reason="Requires compute capability >= 8.0",
)
@torch.inference_mode()
def test_adjust_n_for_activation():
    """Test the adjust_N_for_activation method.

    Gated activations ("silu", "gelu") are expected to halve N, while every
    non-gated activation listed in NO_MUL_ACTIVATIONS must leave N unchanged.
    """
    # TritonExperts is already imported at module level; no local re-import.
    experts = TritonExperts(FUSED_MOE_UNQUANTIZED_CONFIG)

    N = 256

    # Gated activations should return N // 2
    for activation in ("silu", "gelu"):
        adjusted = experts.adjust_N_for_activation(N, activation)
        assert adjusted == N // 2, (
            f"gated activation {activation!r}: expected {N // 2}, got {adjusted}"
        )

    # Non-gated activations should return N. Iterating the shared module
    # constant keeps this test in sync with the parametrized test above.
    for activation in NO_MUL_ACTIVATIONS:
        adjusted = experts.adjust_N_for_activation(N, activation)
        assert adjusted == N, (
            f"non-gated activation {activation!r}: expected {N}, got {adjusted}"
        )
Original file line number Diff line number Diff line change
Expand Up @@ -305,15 +305,17 @@ def workspace_shapes(
global_num_experts: int,
local_num_experts: int,
expert_tokens_meta: mk.ExpertTokensMetadata | None,
activation: str,
) -> tuple[tuple[int, ...], tuple[int, ...], tuple[int, ...]]:
# FIXME (varun): We should be able to dispatch only from the leader
# DP ranks in the case of TP > 1. At the moment, all the Ranks
# end up sending their tokens. This needs to be fixed.
num_dispatchers = self.num_dispatchers
num_experts = local_num_experts
max_num_tokens = M if self.max_num_tokens is None else self.max_num_tokens
activation_out_dim = self.adjust_N_for_activation(N, activation)
workspace13 = (num_experts, max_num_tokens * num_dispatchers, max(K, N))
workspace2 = (num_experts, max_num_tokens * num_dispatchers, (N // 2))
workspace2 = (num_experts, max_num_tokens * num_dispatchers, activation_out_dim)
output = (num_experts, max_num_tokens * num_dispatchers, K)
return (workspace13, workspace2, output)

Expand Down
20 changes: 16 additions & 4 deletions vllm/model_executor/layers/fused_moe/cutlass_moe.py
Original file line number Diff line number Diff line change
Expand Up @@ -355,9 +355,11 @@ def workspace_shapes(
global_num_experts: int,
local_num_experts: int,
expert_tokens_meta: mk.ExpertTokensMetadata | None,
activation: str,
) -> tuple[tuple[int, ...], tuple[int, ...], tuple[int, ...]]:
activation_out_dim = self.adjust_N_for_activation(N, activation)
workspace1 = (M * topk, max(N, K))
workspace2 = (M * topk, max(N // 2, K))
workspace2 = (M * topk, max(activation_out_dim, K))
output = (M, K)
return (workspace1, workspace2, output)

Expand Down Expand Up @@ -402,11 +404,17 @@ def workspace_shapes(
global_num_experts: int,
local_num_experts: int,
expert_tokens_meta: mk.ExpertTokensMetadata | None,
activation: str,
) -> tuple[tuple[int, ...], tuple[int, ...], tuple[int, ...]]:
num_dp = self.num_dispatchers
assert num_dp is not None
activation_out_dim = self.adjust_N_for_activation(N, activation)
workspace1 = (self.max_experts_per_worker, M * num_dp, max(N, K))
workspace2 = (self.max_experts_per_worker, M * num_dp, max(N // 2, K))
workspace2 = (
self.max_experts_per_worker,
M * num_dp,
max(activation_out_dim, K),
)
output = (self.max_experts_per_worker, M, K)
return (workspace1, workspace2, output)

Expand Down Expand Up @@ -635,13 +643,15 @@ def workspace_shapes(
global_num_experts: int,
local_num_experts: int,
expert_tokens_meta: mk.ExpertTokensMetadata | None,
activation: str,
) -> tuple[tuple[int, ...], tuple[int, ...], tuple[int, ...]]:
activation_out_dim = self.adjust_N_for_activation(N, activation)
workspace1: tuple[int, ...] = ()
workspace2: tuple[int, ...] = ()
output: tuple[int, ...] = ()
if self.use_batched_format:
workspace1 = (self.max_experts_per_worker, M, max(N, K))
workspace2 = (self.max_experts_per_worker, M, (N // 2))
workspace2 = (self.max_experts_per_worker, M, activation_out_dim)
output = (self.max_experts_per_worker, M, K)
else:
workspace1 = (M * topk, max(2 * N, K))
Expand Down Expand Up @@ -896,9 +906,11 @@ def workspace_shapes(
global_num_experts: int,
local_num_experts: int,
expert_tokens_meta: mk.ExpertTokensMetadata | None,
activation: str,
) -> tuple[tuple[int, ...], tuple[int, ...], tuple[int, ...]]:
activation_out_dim = self.adjust_N_for_activation(N, activation)
workspace1 = (M * topk, max(N, K))
workspace2 = (M * topk, max(N // 2, K))
workspace2 = (M * topk, max(activation_out_dim, K))
output = (M, K)
return (workspace1, workspace2, output)

Expand Down
18 changes: 12 additions & 6 deletions vllm/model_executor/layers/fused_moe/deep_gemm_moe.py
Original file line number Diff line number Diff line change
Expand Up @@ -143,6 +143,7 @@ def workspace_shapes(
global_num_experts: int,
local_num_experts: int,
expert_tokens_meta: mk.ExpertTokensMetadata | None,
activation: str,
) -> tuple[tuple[int, ...], tuple[int, ...], tuple[int, ...]]:
assert self.block_shape is not None
block_m = self.block_shape[0]
Expand All @@ -151,7 +152,8 @@ def workspace_shapes(
)
assert M_sum % block_m == 0

workspace1 = (M_sum, max(N // 2, K))
activation_out_dim = self.adjust_N_for_activation(N, activation)
workspace1 = (M_sum, max(activation_out_dim, K))
workspace2 = (M_sum, max(N, K))
output = (M, K)
return (workspace1, workspace2, output)
Expand All @@ -163,11 +165,13 @@ def _act_mul_quant(
block_k = self.block_shape[1]
scale_fmt = DeepGemmQuantScaleFMT.from_oracle()

M_sum, N = input.size()
activation_out_dim = self.adjust_N_for_activation(N, activation)

# 1. DeepGemm UE8M0: use packed per-token-group quant
if scale_fmt == DeepGemmQuantScaleFMT.UE8M0:
M_sum, N = input.size()
act_out = torch.empty(
(M_sum, N // 2), dtype=input.dtype, device=input.device
(M_sum, activation_out_dim), dtype=input.dtype, device=input.device
)
self.activation(activation, act_out, input)
a2q, a2q_scale = per_token_group_quant_fp8_packed_for_deepgemm(
Expand All @@ -187,8 +191,9 @@ def _act_mul_quant(
)

# 3. fallback path for non-SiLU activations in non‑UE8M0 cases.
M_sum, N = input.size()
act_out = torch.empty((M_sum, N // 2), dtype=input.dtype, device=input.device)
act_out = torch.empty(
(M_sum, activation_out_dim), dtype=input.dtype, device=input.device
)
self.activation(activation, act_out, input)
return per_token_group_quant_fp8(
act_out, block_k, column_major_scales=True, out_q=output
Expand Down Expand Up @@ -254,8 +259,9 @@ def apply(
(a1q, a1q_scale), (w1, self.w1_scale), mm1_out, expert_ids
)

activation_out_dim = self.adjust_N_for_activation(N, activation)
quant_out = _resize_cache(
workspace13.view(dtype=torch.float8_e4m3fn), (M_sum, N // 2)
workspace13.view(dtype=torch.float8_e4m3fn), (M_sum, activation_out_dim)
)
a2q, a2q_scale = self._act_mul_quant(
input=mm1_out.view(-1, N), output=quant_out, activation=activation
Expand Down
1 change: 1 addition & 0 deletions vllm/model_executor/layers/fused_moe/fallback.py
Original file line number Diff line number Diff line change
Expand Up @@ -76,6 +76,7 @@ def workspace_shapes(
global_num_experts: int,
local_num_experts: int,
expert_tokens_meta: mk.ExpertTokensMetadata | None,
activation: str,
) -> tuple[tuple[int, ...], tuple[int, ...], tuple[int, ...]]:
raise NotImplementedError

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -91,6 +91,7 @@ def workspace_shapes(
global_num_experts: int,
local_num_experts: int,
expert_tokens_meta: mk.ExpertTokensMetadata | None,
activation: str,
) -> tuple[tuple[int, ...], tuple[int, ...], tuple[int, ...]]:
# We use global_num_experts due to how moe_align_block_size handles
# expert_maps.
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -103,6 +103,7 @@ def workspace_shapes(
global_num_experts: int,
local_num_experts: int,
expert_tokens_meta: mk.ExpertTokensMetadata | None,
activation: str,
) -> tuple[tuple[int, ...], tuple[int, ...], tuple[int, ...]]:
# We use global_num_experts due to how moe_align_block_size handles
# expert_maps.
Expand Down
Loading