Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
53 changes: 29 additions & 24 deletions tests/compile/distributed/test_async_tp.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@
ModelConfig,
PassConfig,
VllmConfig,
set_current_vllm_config,
)
from vllm.distributed import (
tensor_model_parallel_all_gather,
Expand Down Expand Up @@ -340,38 +341,42 @@ def async_tp_pass_on_test_model(
)

async_tp_pass = AsyncTPPass(vllm_config)
backend = TestBackend(async_tp_pass)

assert (
async_tp_pass.compilation_config.splitting_ops
== vllm_config.compilation_config.splitting_ops
)
assert (
async_tp_pass.compilation_config.use_inductor_graph_partition
== vllm_config.compilation_config.use_inductor_graph_partition
)
# Set the global vllm_config for TestBackend which calls
# get_current_vllm_config()
with set_current_vllm_config(vllm_config):
backend = TestBackend(async_tp_pass)

model = test_model_cls(hidden_size, dtype) # Pass dtype to model constructor
assert (
async_tp_pass.compilation_config.splitting_ops
== vllm_config.compilation_config.splitting_ops
)
assert (
async_tp_pass.compilation_config.use_inductor_graph_partition
== vllm_config.compilation_config.use_inductor_graph_partition
)

hidden_states = torch.randn(
(batch_size * seq_len, hidden_size), dtype=dtype, requires_grad=False
)
model = test_model_cls(hidden_size, dtype) # Pass dtype to model constructor

hidden_states = torch.randn(
(batch_size * seq_len, hidden_size), dtype=dtype, requires_grad=False
)

if dynamic:
torch._dynamo.mark_dynamic(hidden_states, 0)
if dynamic:
torch._dynamo.mark_dynamic(hidden_states, 0)

compiled_model = torch.compile(model, backend=backend)
compiled_model(hidden_states)
compiled_model = torch.compile(model, backend=backend)
compiled_model(hidden_states)

assert async_tp_pass.matched_count == 1
assert async_tp_pass.matched_count == 1

# In pre-nodes, all gather or reduce scatter should exist,
# fused_matmul_reduce_scatter or fused_all_gather_matmul should not
backend.check_before_ops(model.ops_in_model_before(), fully_replaced=False)
# In pre-nodes, all gather or reduce scatter should exist,
# fused_matmul_reduce_scatter or fused_all_gather_matmul should not
backend.check_before_ops(model.ops_in_model_before(), fully_replaced=False)

# In post-nodes, fused_matmul_reduce_scatter or \
# fused_all_gather_matmul should exist
backend.check_after_ops(model.ops_in_model_after())
# In post-nodes, fused_matmul_reduce_scatter or \
# fused_all_gather_matmul should exist
backend.check_after_ops(model.ops_in_model_after())


@create_new_process_for_each_test()
Expand Down
2 changes: 1 addition & 1 deletion tests/compile/test_config.py
Original file line number Diff line number Diff line change
Expand Up @@ -430,7 +430,7 @@ def test_cudagraph_sizes_post_init(
)


def test_cached_compilation_config():
def test_cached_compilation_config(default_vllm_config):
import torch
from torch._inductor.utils import run_and_get_code

Expand Down
11 changes: 11 additions & 0 deletions tests/conftest.py
Original file line number Diff line number Diff line change
Expand Up @@ -189,6 +189,17 @@ def dist_init():
cleanup_dist_env_and_memory()


@pytest.fixture
def default_vllm_config():
    """Install a default ``VllmConfig`` as the current config for one test.

    Intended for tests that directly exercise CustomOps or other code paths
    that call ``get_current_vllm_config()`` outside of a full engine context.

    The fixture yields no value; a test requests it purely for its side
    effect. The test body runs while the ``set_current_vllm_config`` context
    is active, and that context is exited during fixture teardown.
    """
    # Imported lazily so that merely collecting this conftest does not pull in
    # vllm.config for tests that never request the fixture.
    from vllm.config import VllmConfig, set_current_vllm_config

    with set_current_vllm_config(VllmConfig()):
        # Everything after the yield (here: leaving the ``with`` block) runs
        # as teardown once the requesting test has finished.
        yield


@pytest.fixture()
def should_do_global_cleanup_after_test(request) -> bool:
"""Allow subdirectories to skip global cleanup by overriding this fixture.
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -458,7 +458,7 @@ def test_flashinfer_trtllm_prefill_with_baseline(
)


def test_trtllm_attention_rejects_num_kv_heads_1() -> None:
def test_trtllm_attention_rejects_num_kv_heads_1(default_vllm_config) -> None:
"""Test that TRTLLM attention correctly rejects num_kv_heads=1.

When num_kv_heads=1 (MQA), the KV cache strides become degenerate
Expand Down
4 changes: 3 additions & 1 deletion tests/kernels/attention/test_mha_attn.py
Original file line number Diff line number Diff line change
Expand Up @@ -36,7 +36,7 @@ def clear_cache():


@pytest.mark.parametrize("device", devices)
def test_mha_attn_platform(device: str):
def test_mha_attn_platform(default_vllm_config, device: str):
"""
Test the attention selector between different platform and device.
"""
Expand Down Expand Up @@ -116,6 +116,7 @@ def ref_attention(
@pytest.mark.parametrize("dtype", DTYPES)
@pytest.mark.parametrize("device", CUDA_DEVICES)
def test_mha_attn_forward(
default_vllm_config,
batch_size: int,
seq_len: int,
num_heads: int,
Expand Down Expand Up @@ -162,6 +163,7 @@ def test_mha_attn_forward(
@pytest.mark.parametrize("dtype", DTYPES)
@pytest.mark.parametrize("device", CUDA_DEVICES)
def test_mha_attn_varlen_forward(
default_vllm_config,
var_seq_len: list[int],
num_heads: int,
num_kv_heads: int,
Expand Down
2 changes: 2 additions & 0 deletions tests/kernels/core/test_activation.py
Original file line number Diff line number Diff line change
Expand Up @@ -45,6 +45,7 @@
@pytest.mark.parametrize("device", CUDA_DEVICES)
@torch.inference_mode()
def test_act_and_mul(
default_vllm_config,
activation: str,
num_tokens: int,
d: int,
Expand Down Expand Up @@ -122,6 +123,7 @@ def _get_rtol(output) -> float:
@pytest.mark.parametrize("device", CUDA_DEVICES)
@torch.inference_mode()
def test_activation(
default_vllm_config,
activation: type[torch.nn.Module],
num_tokens: int,
d: int,
Expand Down
1 change: 1 addition & 0 deletions tests/kernels/core/test_fused_qk_norm_rope.py
Original file line number Diff line number Diff line change
Expand Up @@ -57,6 +57,7 @@ def _apply_qk_norm_rope(
@pytest.mark.parametrize("rotary_ratio", [1.0, 0.5, 0.25])
@torch.inference_mode()
def test_fused_qk_norm_rope_matches_reference(
default_vllm_config,
device: str,
dtype: torch.dtype,
is_neox: bool,
Expand Down
1 change: 1 addition & 0 deletions tests/kernels/core/test_fused_quant_layernorm.py
Original file line number Diff line number Diff line change
Expand Up @@ -147,6 +147,7 @@ def ops_impl(
@pytest.mark.parametrize("device", CUDA_DEVICES)
@torch.inference_mode()
def test_rms_norm(
default_vllm_config,
num_tokens: int,
hidden_size: int,
add_residual: bool,
Expand Down
1 change: 1 addition & 0 deletions tests/kernels/core/test_layernorm.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,7 @@
@pytest.mark.parametrize("strided_input", [False, True])
@torch.inference_mode()
def test_rms_norm(
default_vllm_config,
num_tokens: int,
hidden_size: int,
add_residual: bool,
Expand Down
2 changes: 2 additions & 0 deletions tests/kernels/core/test_mrope.py
Original file line number Diff line number Diff line change
Expand Up @@ -90,6 +90,7 @@ class MRoPETestInfo(NamedTuple):
@pytest.mark.parametrize("dtype", [torch.bfloat16])
@pytest.mark.parametrize("num_tokens", num_tokens_list)
def test_mrope(
default_vllm_config,
model_name: str,
model_info: MRoPETestInfo,
tp_size: int,
Expand Down Expand Up @@ -159,6 +160,7 @@ def test_mrope(
@pytest.mark.parametrize("dtype", [torch.bfloat16])
@pytest.mark.parametrize("num_tokens", num_tokens_list)
def test_mrope_torch_compile_tracing(
default_vllm_config,
model_name: str,
model_info: MRoPETestInfo,
tp_size: int,
Expand Down
3 changes: 2 additions & 1 deletion tests/kernels/core/test_pos_encoding.py
Original file line number Diff line number Diff line change
Expand Up @@ -62,6 +62,7 @@ def _get_batch_tensor_shape(
@pytest.mark.parametrize("use_key", USE_KEY)
@torch.inference_mode()
def test_rotary_embedding(
default_vllm_config,
is_neox_style: bool,
tensor_shape_fn: Callable[[int, int, int, int], tuple[int, ...]],
batch_size: int,
Expand Down Expand Up @@ -123,7 +124,7 @@ def test_rotary_embedding(


@torch.inference_mode()
def test_rope_module_cache():
def test_rope_module_cache(default_vllm_config):
MAX_POSITIONS = [123, 1234]
ROPE_THETAS = [10000, 1000000]
ROPE_PARAMETERS = (
Expand Down
1 change: 1 addition & 0 deletions tests/kernels/core/test_rotary_embedding.py
Original file line number Diff line number Diff line change
Expand Up @@ -36,6 +36,7 @@ def rotary_embedding_opcheck(
@pytest.mark.parametrize("use_key", [True, False])
@pytest.mark.parametrize("head_stride_is_contiguous", [True, False])
def test_rotary_embedding_opcheck(
default_vllm_config,
dist_init,
device,
max_position,
Expand Down
8 changes: 2 additions & 6 deletions tests/kernels/moe/test_cpu_fused_moe.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@

from tests.kernels.allclose_default import get_default_atol, get_default_rtol
from vllm._custom_ops import cpu_fused_moe, cpu_prepack_moe_weight
from vllm.model_executor.layers.activation import SiluAndMul, SwigluOAIAndMul
from vllm.model_executor.layers.fused_moe.cpu_fused_moe import _CPU_MOE_ACT
from vllm.platforms import current_platform
from vllm.utils.torch_utils import set_random_seed

Expand All @@ -24,11 +24,6 @@
ISA = ["amx", "vec"] if torch._C._cpu._is_amx_tile_supported() else ["vec"]
DTYPE = [torch.bfloat16]

_CPU_MOE_ACT = {
"silu": SiluAndMul(),
"swigluoai": SwigluOAIAndMul(),
}


def ref_fused_moe(
input: torch.Tensor,
Expand Down Expand Up @@ -106,6 +101,7 @@ def ref_fused_moe(
@pytest.mark.parametrize("act", ACT)
@pytest.mark.parametrize("isa", ISA)
def test_cpu_fused_moe(
default_vllm_config,
batch_size: int,
expert_num: int,
hidden_size: int,
Expand Down
7 changes: 6 additions & 1 deletion tests/kernels/moe/test_moe.py
Original file line number Diff line number Diff line change
Expand Up @@ -468,7 +468,12 @@ def test_fused_moe_wn16(
)
@torch.inference_mode()
def test_mixtral_moe(
dist_init, dtype: torch.dtype, padding: bool, use_rocm_aiter: bool, monkeypatch
default_vllm_config,
dist_init,
dtype: torch.dtype,
padding: bool,
use_rocm_aiter: bool,
monkeypatch,
):
"""Make sure our Mixtral MoE implementation agrees with the one from
huggingface."""
Expand Down
13 changes: 10 additions & 3 deletions tests/kernels/quantization/test_fp8_quant_group.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,12 @@
@pytest.mark.parametrize("use_ue8m0", [True, False])
@torch.inference_mode()
def test_quantfp8_group_functionality(
batch_size: int, hidden_dim: int, group_size: int, seed: int, use_ue8m0: bool
default_vllm_config,
batch_size: int,
hidden_dim: int,
group_size: int,
seed: int,
use_ue8m0: bool,
) -> None:
"""Test QuantFP8 group quantization with various configurations.

Expand Down Expand Up @@ -82,7 +87,9 @@ def test_quantfp8_group_functionality(
@pytest.mark.parametrize("seed", [42])
@pytest.mark.parametrize("use_ue8m0", [True, False])
@torch.inference_mode()
def test_quantfp8_group_multidimensional(seed: int, use_ue8m0: bool) -> None:
def test_quantfp8_group_multidimensional(
default_vllm_config, seed: int, use_ue8m0: bool
) -> None:
set_random_seed(seed)

group_size = 64
Expand Down Expand Up @@ -135,7 +142,7 @@ def test_quantfp8_group_multidimensional(seed: int, use_ue8m0: bool) -> None:

@pytest.mark.parametrize("seed", [42])
@torch.inference_mode()
def test_quantfp8_group_edge_cases(seed: int) -> None:
def test_quantfp8_group_edge_cases(default_vllm_config, seed: int) -> None:
set_random_seed(seed)

batch_size = 16
Expand Down
2 changes: 1 addition & 1 deletion tests/kernels/quantization/test_int8_kernel.py
Original file line number Diff line number Diff line change
Expand Up @@ -102,7 +102,7 @@ def setup_cuda():
itertools.product(M, N, K, E, TOP_KS, DTYPES, SEEDS),
)
@torch.inference_mode()
def test_w8a8_fp8_fused_moe(M, N, K, E, topk, dtype, seed):
def test_w8a8_fp8_fused_moe(default_vllm_config, M, N, K, E, topk, dtype, seed):
torch.manual_seed(seed)
# Initialize int8 quantization parameters
factor_for_scale = 1e-2
Expand Down
1 change: 1 addition & 0 deletions tests/kernels/quantization/test_silu_mul_nvfp4_quant.py
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,7 @@
@pytest.mark.parametrize("shape", SHAPES)
@torch.inference_mode()
def test_silu_mul_nvfp4_quant(
default_vllm_config,
dtype: torch.dtype,
shape: tuple[int, int],
) -> None:
Expand Down
1 change: 1 addition & 0 deletions tests/kernels/test_fused_quant_activation.py
Original file line number Diff line number Diff line change
Expand Up @@ -39,6 +39,7 @@ def ops_impl(x: torch.Tensor, scale: torch.Tensor) -> torch.Tensor:
@pytest.mark.parametrize("device", CUDA_DEVICES)
@torch.inference_mode()
def test_silu_and_mul(
default_vllm_config,
num_tokens: int,
hidden_size: int,
dtype: torch.dtype,
Expand Down
4 changes: 2 additions & 2 deletions tests/lora/conftest.py
Original file line number Diff line number Diff line change
Expand Up @@ -82,7 +82,7 @@ class DummyLoRAModel(nn.Sequential, SupportsLoRA):


@pytest.fixture
def dummy_model() -> nn.Module:
def dummy_model(default_vllm_config) -> nn.Module:
model = DummyLoRAModel(
OrderedDict(
[
Expand Down Expand Up @@ -115,7 +115,7 @@ def dummy_model() -> nn.Module:


@pytest.fixture
def dummy_model_gate_up() -> nn.Module:
def dummy_model_gate_up(default_vllm_config) -> nn.Module:
model = DummyLoRAModel(
OrderedDict(
[
Expand Down
13 changes: 8 additions & 5 deletions tests/lora/test_layers.py
Original file line number Diff line number Diff line change
Expand Up @@ -252,7 +252,9 @@ def check_punica_wrapper(punica_wrapper) -> bool:
@pytest.mark.parametrize("device", DEVICES)
@pytest.mark.parametrize("vocab_size", [512, 32000, 64000, 128000])
@pytest.mark.parametrize("stage", STAGES)
def test_embeddings(dist_init, num_loras, device, vocab_size, stage) -> None:
def test_embeddings(
default_vllm_config, dist_init, num_loras, device, vocab_size, stage
) -> None:
# For multi-GPU testing of Triton kernel, we must explicitly set the CUDA
# device, see: https://github.com/triton-lang/triton/issues/2925
# Same below.
Expand Down Expand Up @@ -353,7 +355,7 @@ def create_random_embedding_layer():
@pytest.mark.parametrize("vocab_size", [512, 32000, 64000, 256512])
@pytest.mark.parametrize("stage", STAGES)
def test_lm_head_logits_processor(
dist_init, num_loras, device, vocab_size, stage
default_vllm_config, dist_init, num_loras, device, vocab_size, stage
) -> None:
if current_platform.is_cuda_alike():
torch.cuda.set_device(device)
Expand Down Expand Up @@ -470,6 +472,7 @@ def _pretest():
@pytest.mark.parametrize("device", DEVICES)
@pytest.mark.parametrize("stage", STAGES)
def test_linear_replicated(
default_vllm_config,
dist_init,
num_loras,
device,
Expand Down Expand Up @@ -580,7 +583,7 @@ def create_random_linear_replicated_layer():
@pytest.mark.parametrize("device", DEVICES)
@pytest.mark.parametrize("stage", STAGES)
def test_linear_parallel(
dist_init, num_loras, orientation, fully_shard, device, stage
default_vllm_config, dist_init, num_loras, orientation, fully_shard, device, stage
) -> None:
if current_platform.is_cuda_alike():
torch.cuda.set_device(device)
Expand Down Expand Up @@ -705,7 +708,7 @@ def create_random_linear_parallel_layer():
@pytest.mark.parametrize("device", DEVICES)
@pytest.mark.parametrize("stage", STAGES)
def test_column_parallel_packed(
dist_init, num_loras, repeats, fully_shard, device, stage
default_vllm_config, dist_init, num_loras, repeats, fully_shard, device, stage
) -> None:
if current_platform.is_cuda_alike():
torch.cuda.set_device(device)
Expand Down Expand Up @@ -851,7 +854,7 @@ class FakeConfig:
@pytest.mark.parametrize(
"seed", list(range(VOCAB_PARALLEL_EMBEDDING_TEST_NUM_RANDOM_SEEDS))
)
def test_vocab_parallel_embedding_indices(tp_size, seed):
def test_vocab_parallel_embedding_indices(tp_size, seed, default_vllm_config):
random.seed(seed)
vocab_size = random.randint(4000, 64000)
added_vocab_size = random.randint(0, 1024)
Expand Down
Loading