From 16f8d487b74f797a1f2a2a7ea965ab1f3e04de20 Mon Sep 17 00:00:00 2001 From: Joe Runde Date: Tue, 31 Mar 2026 20:35:54 -0600 Subject: [PATCH 1/4] :alembic: add fixture to patch attention backends test Signed-off-by: Joe Runde --- .../vllm_spyre_next/testing/models.py | 3 + .../vllm_spyre_next/testing/pytest_plugin.py | 73 +++++++++++++++++++ .../testing/upstream_tests.yaml | 21 +++++- 3 files changed, 93 insertions(+), 4 deletions(-) diff --git a/vllm_spyre_next/vllm_spyre_next/testing/models.py b/vllm_spyre_next/vllm_spyre_next/testing/models.py index 07581cb23..8c154cd7f 100644 --- a/vllm_spyre_next/vllm_spyre_next/testing/models.py +++ b/vllm_spyre_next/vllm_spyre_next/testing/models.py @@ -51,6 +51,8 @@ class AllowEntry: param_allows: Parameter combinations to allow (whitelist). If specified, only these parameter values will run. param_overrides: Parameter values to replace upstream defaults with. + fixture_names: Fixture names to inject for this test (e.g. "foo" for a + custom fixture that prints "hello world"). """ test: str @@ -59,6 +61,7 @@ class AllowEntry: param_skips: tuple[ParamSkip, ...] = () param_allows: tuple[ParamAllow, ...] = () param_overrides: tuple[ParamOverride, ...] = () + fixture_names: tuple[str, ...] = () @dataclass(frozen=True) diff --git a/vllm_spyre_next/vllm_spyre_next/testing/pytest_plugin.py b/vllm_spyre_next/vllm_spyre_next/testing/pytest_plugin.py index 52c44bbf3..4a3cbbf63 100644 --- a/vllm_spyre_next/vllm_spyre_next/testing/pytest_plugin.py +++ b/vllm_spyre_next/vllm_spyre_next/testing/pytest_plugin.py @@ -34,6 +34,7 @@ from __future__ import annotations import fnmatch +import inspect import os import re import subprocess @@ -43,6 +44,7 @@ from pathlib import Path import pytest +from vllm.v1.attention.backends.registry import AttentionBackendEnum import yaml from vllm_spyre_next.testing.models import ( @@ -119,6 +121,7 @@ def _parse_config(raw_tests: dict) -> UpstreamTestConfig: param_skips=tuple(param_skips), param_allows=tuple(param_allows), param_overrides=tuple(param_overrides), + fixture_names=tuple(allow.get("fixture_names", ())), ) ) block_list = [BlockEntry(test=b["test"]) for b in file_entry.get("block_list", [])] @@ -325,6 +328,24 @@ def _prepare_upstream_tests_dir() -> Path: return tests_dir +def _spicy_code_edits(upstream_tests_dir: Path): + """Apply spicy code edits to the upstream tests directory. + + These should be _temporary_ edits to source code for vllm tests while we work to make them more + portable. This should only be used where mocking is not possible or too cumbersome. + """ + + # Mocking out torch.device seems impossible to do (at least multiple rounds of Bob and Claude + # were unsuccessful). So we patch the source code to change `torch.device("cuda:0")` to + # `torch.device("cpu")`. + hardcoded_cuda_test_path = upstream_tests_dir / "v1" / "attention" / "test_attention_backends.py" + with open(hardcoded_cuda_test_path, "r") as f: + content = f.read() + content = content.replace('torch.device("cuda:0")', 'torch.device("cpu")') + with open(hardcoded_cuda_test_path, "w") as f: + f.write(content) + + # --------------------------------------------------------------------------- # Pytest Hooks # --------------------------------------------------------------------------- @@ -365,6 +386,7 @@ def pytest_configure(config): try: # Clone vLLM to cache upstream_tests_base = _prepare_upstream_tests_dir() + _spicy_code_edits(upstream_tests_base) config._upstream_tests_base = upstream_tests_base # Determine which test paths to inject @@ -498,6 +520,10 @@ def pytest_collection_modifyitems(config: pytest.Config, items: list[pytest.Item elif allow_entry.mode == "xfail_strict": item.add_marker(pytest.mark.xfail(strict=True)) + # Inject fixtures for tests that have fixture_names defined + for fixture_name in allow_entry.fixture_names: + item.fixturenames.append(fixture_name) + # Reorder tests so that tests with "model" in the name run first _reorder_tests_by_name(items) @@ -596,6 +622,53 @@ def should_do_global_cleanup_after_test(): """Skip global cleanup for Spyre - torch.accelerator.empty_cache() doesn't work yet.""" return False +@pytest.fixture() +def patch_backend_list(request, monkeypatch): + """This fixture patches things for tests/v1/attention/test_attention_backends.py""" + + # The BACKENDS_TO_TEST list has to be patched with only our backend + our_backend_list = [ + AttentionBackendEnum.CUSTOM, + ] + test_module = request.node.module + monkeypatch.setattr(test_module, "BACKENDS_TO_TEST", our_backend_list) + + # the batch specs need to be modified to have only a single sequence, because we don't yet + # support batched attention + our_batch_specs = { + "small_decode": test_module.BatchSpec(seq_lens=[40], query_lens=[1]), + "small_prefill": test_module.BatchSpec(seq_lens=[40], query_lens=[8]), + "mixed_small": test_module.BatchSpec(seq_lens=[48], query_lens=[5]), + "medium_decode": test_module.BatchSpec( + seq_lens=[1024], + query_lens=[1], + ), + "medium_prefill": test_module.BatchSpec(seq_lens=[1024], query_lens=[16]), + "mixed_medium": test_module.BatchSpec(seq_lens=[2048], query_lens=[1]), + "large_decode": test_module.BatchSpec(seq_lens=[2048], query_lens=[1]), + "large_prefill": test_module.BatchSpec(seq_lens=[4096], query_lens=[32]), + "mixed_large": test_module.BatchSpec(seq_lens=[4096], query_lens=[32]), + "single_decode": test_module.BatchSpec(seq_lens=[1024], query_lens=[1]), + "single_prefill": test_module.BatchSpec(seq_lens=[1024], query_lens=[64]), + # encoder-only + "small_encoder_prefill": test_module.BatchSpec(seq_lens=[32], query_lens=[32]), + "medium_encoder_prefill": test_module.BatchSpec(seq_lens=[256], query_lens=[256]), + } + monkeypatch.setattr(test_module, "BATCH_SPECS", our_batch_specs) + + # _test_backend_correctness may be called with a hardcoded AttentionBackendEnum.FLASH_ATTN, + # which we want to ignore + orig_tbc = test_module._test_backend_correctness + def tbc_wrapper(batch_spec, model, backend_to_test: list[AttentionBackendEnum | str], *args, **kwargs): + if "AttentionBackendEnum.FLEX_ATTENTION" in str(backend_to_test): + print("skipping bad invocation of _test_backend_correctness for LARGE_BLOCK_BACKENDS") + return + print("running tbc for ", backend_to_test) + return orig_tbc(batch_spec, model, backend_to_test, *args, **kwargs) + monkeypatch.setattr(test_module, "_test_backend_correctness", tbc_wrapper) + + yield + @pytest.hookimpl(tryfirst=True) def pytest_fixture_setup(fixturedef, request): diff --git a/vllm_spyre_next/vllm_spyre_next/testing/upstream_tests.yaml b/vllm_spyre_next/vllm_spyre_next/testing/upstream_tests.yaml index d273502c7..3ee1f7737 100644 --- a/vllm_spyre_next/vllm_spyre_next/testing/upstream_tests.yaml +++ b/vllm_spyre_next/vllm_spyre_next/testing/upstream_tests.yaml @@ -1,7 +1,7 @@ # Upstream test filter configuration for vllm-spyre-next. # # Only tests listed here will run from upstream vLLM. All other upstream -# tests are skipped by default (opt-in / whitelist model). +# tests are skipped by default (opt-in / allowlist model). # # block_list entries take precedence over allow_list entries. # @@ -14,6 +14,9 @@ # Parameter name -> list of values to skip # allow_list[].params.override # Parameter name -> replacement values (replaces upstream defaults) +# allow_list[].fixture_names Fixture names to inject for this test. +# These fixtures are automatically added to the test's +# fixturenames during collection. # block_list[].test fnmatch glob matched against test function name tests: @@ -42,8 +45,18 @@ tests: tags: [facebook, upstream, uses_subprocess] params: allow: # skip every model except facebook/opt-125m - model: + model: - facebook/opt-125m - block_list: - - test: "test_fused_rms_norm_quant" + - rel_path: tests/v1/attention/test_attention_backends.py + allow_list: + - test: "test_causal_backend_correctness" + mode: mandatory_pass + tags: [attention, upstream] + params: + allow: # skip TP cases that we don't support + tensor_parallel_size: + - 1 + fixture_names: + - "patch_backend_list" + From 2a73e5bc71527104a65abe853bf8f838831d5e44 Mon Sep 17 00:00:00 2001 From: Joe Runde Date: Wed, 1 Apr 2026 08:31:32 -0600 Subject: [PATCH 2/4] :art: fmt Signed-off-by: Joe Runde --- .../vllm_spyre_next/testing/pytest_plugin.py | 26 +++++++++++-------- 1 file changed, 15 insertions(+), 11 deletions(-) diff --git a/vllm_spyre_next/vllm_spyre_next/testing/pytest_plugin.py b/vllm_spyre_next/vllm_spyre_next/testing/pytest_plugin.py index 4a3cbbf63..4143d3135 100644 --- a/vllm_spyre_next/vllm_spyre_next/testing/pytest_plugin.py +++ b/vllm_spyre_next/vllm_spyre_next/testing/pytest_plugin.py @@ -34,7 +34,6 @@ from __future__ import annotations import fnmatch -import inspect import os import re import subprocess @@ -330,16 +329,18 @@ def _prepare_upstream_tests_dir() -> Path: def _spicy_code_edits(upstream_tests_dir: Path): """Apply spicy code edits to the upstream tests directory. - + These should be _temporary_ edits to source code for vllm tests while we work to make them more portable. This should only be used where mocking is not possible or too cumbersome. """ # Mocking out torch.device seems impossible to do (at least multiple rounds of Bob and Claude - # were unsuccessful). So we patch the source code to change `torch.device("cuda:0")` to - # `torch.device("cpu")`. - hardcoded_cuda_test_path = upstream_tests_dir / "v1" / "attention" / "test_attention_backends.py" - with open(hardcoded_cuda_test_path, "r") as f: + # were unsuccessful). So we patch the source code to change the hardcoded + # `torch.device("cuda:0")` to `torch.device("cpu")`. + hardcoded_cuda_test_path = ( + upstream_tests_dir / "v1" / "attention" / "test_attention_backends.py" + ) + with open(hardcoded_cuda_test_path) as f: content = f.read() content = content.replace('torch.device("cuda:0")', 'torch.device("cpu")') with open(hardcoded_cuda_test_path, "w") as f: @@ -622,6 +623,7 @@ def should_do_global_cleanup_after_test(): """Skip global cleanup for Spyre - torch.accelerator.empty_cache() doesn't work yet.""" return False + @pytest.fixture() def patch_backend_list(request, monkeypatch): """This fixture patches things for tests/v1/attention/test_attention_backends.py""" @@ -656,19 +658,21 @@ def patch_backend_list(request, monkeypatch): } monkeypatch.setattr(test_module, "BATCH_SPECS", our_batch_specs) - # _test_backend_correctness may be called with a hardcoded AttentionBackendEnum.FLASH_ATTN, + # _test_backend_correctness may be called with a hardcoded AttentionBackendEnum.FLASH_ATTN, # which we want to ignore orig_tbc = test_module._test_backend_correctness - def tbc_wrapper(batch_spec, model, backend_to_test: list[AttentionBackendEnum | str], *args, **kwargs): + + def tbc_wrapper( + batch_spec, model, backend_to_test: list[AttentionBackendEnum | str], *args, **kwargs + ): if "AttentionBackendEnum.FLEX_ATTENTION" in str(backend_to_test): - print("skipping bad invocation of _test_backend_correctness for LARGE_BLOCK_BACKENDS") return - print("running tbc for ", backend_to_test) return orig_tbc(batch_spec, model, backend_to_test, *args, **kwargs) + monkeypatch.setattr(test_module, "_test_backend_correctness", tbc_wrapper) yield - + @pytest.hookimpl(tryfirst=True) def pytest_fixture_setup(fixturedef, request): From a2a3bba458130c12719fcf37a5a3fe53fbabbeb6 Mon Sep 17 00:00:00 2001 From: Joe Runde Date: Mon, 6 Apr 2026 15:18:11 -0600 Subject: [PATCH 3/4] :fire: remove batch size 1 constraint Signed-off-by: Joe Runde --- .../vllm_spyre_next/testing/pytest_plugin.py | 23 ------------------- 1 file changed, 23 deletions(-) diff --git a/vllm_spyre_next/vllm_spyre_next/testing/pytest_plugin.py b/vllm_spyre_next/vllm_spyre_next/testing/pytest_plugin.py index 4143d3135..2851dc161 100644 --- a/vllm_spyre_next/vllm_spyre_next/testing/pytest_plugin.py +++ b/vllm_spyre_next/vllm_spyre_next/testing/pytest_plugin.py @@ -635,29 +635,6 @@ def patch_backend_list(request, monkeypatch): test_module = request.node.module monkeypatch.setattr(test_module, "BACKENDS_TO_TEST", our_backend_list) - # the batch specs need to be modified to have only a single sequence, because we don't yet - # support batched attention - our_batch_specs = { - "small_decode": test_module.BatchSpec(seq_lens=[40], query_lens=[1]), - "small_prefill": test_module.BatchSpec(seq_lens=[40], query_lens=[8]), - "mixed_small": test_module.BatchSpec(seq_lens=[48], query_lens=[5]), - "medium_decode": test_module.BatchSpec( - seq_lens=[1024], - query_lens=[1], - ), - "medium_prefill": test_module.BatchSpec(seq_lens=[1024], query_lens=[16]), - "mixed_medium": test_module.BatchSpec(seq_lens=[2048], query_lens=[1]), - "large_decode": test_module.BatchSpec(seq_lens=[2048], query_lens=[1]), - "large_prefill": test_module.BatchSpec(seq_lens=[4096], query_lens=[32]), - "mixed_large": test_module.BatchSpec(seq_lens=[4096], query_lens=[32]), - "single_decode": test_module.BatchSpec(seq_lens=[1024], query_lens=[1]), - "single_prefill": test_module.BatchSpec(seq_lens=[1024], query_lens=[64]), - # encoder-only - "small_encoder_prefill": test_module.BatchSpec(seq_lens=[32], query_lens=[32]), - "medium_encoder_prefill": test_module.BatchSpec(seq_lens=[256], query_lens=[256]), - } - monkeypatch.setattr(test_module, "BATCH_SPECS", our_batch_specs) - # _test_backend_correctness may be called with a hardcoded AttentionBackendEnum.FLASH_ATTN, # which we want to ignore orig_tbc = test_module._test_backend_correctness From b96fa555bef293af0338cbfdd3dfd1806d40572b Mon Sep 17 00:00:00 2001 From: Joe Runde Date: Mon, 6 Apr 2026 15:25:45 -0600 Subject: [PATCH 4/4] :recycle: spicy -> temp Signed-off-by: Joe Runde --- vllm_spyre_next/vllm_spyre_next/testing/pytest_plugin.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/vllm_spyre_next/vllm_spyre_next/testing/pytest_plugin.py b/vllm_spyre_next/vllm_spyre_next/testing/pytest_plugin.py index 2851dc161..8cb0f8140 100644 --- a/vllm_spyre_next/vllm_spyre_next/testing/pytest_plugin.py +++ b/vllm_spyre_next/vllm_spyre_next/testing/pytest_plugin.py @@ -327,8 +327,8 @@ def _prepare_upstream_tests_dir() -> Path: return tests_dir -def _spicy_code_edits(upstream_tests_dir: Path): - """Apply spicy code edits to the upstream tests directory. +def _temp_upstream_code_edits(upstream_tests_dir: Path): + """Apply small code edits to the upstream tests directory before importing. These should be _temporary_ edits to source code for vllm tests while we work to make them more portable. This should only be used where mocking is not possible or too cumbersome. @@ -387,7 +387,7 @@ def pytest_configure(config): try: # Clone vLLM to cache upstream_tests_base = _prepare_upstream_tests_dir() - _spicy_code_edits(upstream_tests_base) + _temp_upstream_code_edits(upstream_tests_base) config._upstream_tests_base = upstream_tests_base # Determine which test paths to inject