From 16f8d487b74f797a1f2a2a7ea965ab1f3e04de20 Mon Sep 17 00:00:00 2001
From: Joe Runde <joe@joerun.de>
Date: Tue, 31 Mar 2026 20:35:54 -0600
Subject: [PATCH 1/4] :alembic: add fixture to patch attention backends test

Signed-off-by: Joe Runde <joe@joerun.de>
---
 .../vllm_spyre_next/testing/models.py         |  3 +
 .../vllm_spyre_next/testing/pytest_plugin.py  | 73 +++++++++++++++++++
 .../testing/upstream_tests.yaml               | 21 +++++-
 3 files changed, 93 insertions(+), 4 deletions(-)

diff --git a/vllm_spyre_next/vllm_spyre_next/testing/models.py b/vllm_spyre_next/vllm_spyre_next/testing/models.py
index 07581cb23..8c154cd7f 100644
--- a/vllm_spyre_next/vllm_spyre_next/testing/models.py
+++ b/vllm_spyre_next/vllm_spyre_next/testing/models.py
@@ -51,6 +51,8 @@ class AllowEntry:
         param_allows:    Parameter combinations to allow (whitelist). If specified,
                          only these parameter values will run.
         param_overrides: Parameter values to replace upstream defaults with.
+        fixture_names:   Names of extra pytest fixtures to request for this test
+                         (e.g. "patch_backend_list"), appended during collection.
     """
 
     test: str
@@ -59,6 +61,7 @@ class AllowEntry:
     param_skips: tuple[ParamSkip, ...] = ()
     param_allows: tuple[ParamAllow, ...] = ()
     param_overrides: tuple[ParamOverride, ...] = ()
+    fixture_names: tuple[str, ...] = ()
 
 
 @dataclass(frozen=True)
diff --git a/vllm_spyre_next/vllm_spyre_next/testing/pytest_plugin.py b/vllm_spyre_next/vllm_spyre_next/testing/pytest_plugin.py
index 52c44bbf3..4a3cbbf63 100644
--- a/vllm_spyre_next/vllm_spyre_next/testing/pytest_plugin.py
+++ b/vllm_spyre_next/vllm_spyre_next/testing/pytest_plugin.py
@@ -34,6 +34,7 @@
 from __future__ import annotations
 
 import fnmatch
+import inspect
 import os
 import re
 import subprocess
@@ -43,6 +44,7 @@
 from pathlib import Path
 
 import pytest
+from vllm.v1.attention.backends.registry import AttentionBackendEnum
 import yaml
 
 from vllm_spyre_next.testing.models import (
@@ -119,6 +121,7 @@ def _parse_config(raw_tests: dict) -> UpstreamTestConfig:
                     param_skips=tuple(param_skips),
                     param_allows=tuple(param_allows),
                     param_overrides=tuple(param_overrides),
+                    fixture_names=tuple(allow.get("fixture_names", ())),
                 )
             )
         block_list = [BlockEntry(test=b["test"]) for b in file_entry.get("block_list", [])]
@@ -325,6 +328,24 @@ def _prepare_upstream_tests_dir() -> Path:
     return tests_dir
 
 
+def _spicy_code_edits(upstream_tests_dir: Path):
+    """Apply spicy code edits to the upstream tests directory.
+    
+    These should be _temporary_ edits to source code for vllm tests while we work to make them more
+    portable. This should only be used where mocking is not possible or too cumbersome.
+    """
+
+    # Mocking out torch.device seems impossible to do (at least multiple rounds of Bob and Claude
+    # were unsuccessful). So we patch the source code to change `torch.device("cuda:0")` to 
+    # `torch.device("cpu")`.
+    hardcoded_cuda_test_path = upstream_tests_dir / "v1" / "attention" / "test_attention_backends.py"
+    with open(hardcoded_cuda_test_path, "r") as f:
+        content = f.read()
+    content = content.replace('torch.device("cuda:0")', 'torch.device("cpu")')
+    with open(hardcoded_cuda_test_path, "w") as f:
+        f.write(content)
+
+
 # ---------------------------------------------------------------------------
 # Pytest Hooks
 # ---------------------------------------------------------------------------
@@ -365,6 +386,7 @@ def pytest_configure(config):
         try:
             # Clone vLLM to cache
             upstream_tests_base = _prepare_upstream_tests_dir()
+            _spicy_code_edits(upstream_tests_base)
             config._upstream_tests_base = upstream_tests_base
 
             # Determine which test paths to inject
@@ -498,6 +520,10 @@ def pytest_collection_modifyitems(config: pytest.Config, items: list[pytest.Item
         elif allow_entry.mode == "xfail_strict":
             item.add_marker(pytest.mark.xfail(strict=True))
 
+        # Inject fixtures for tests that have fixture_names defined
+        for fixture_name in allow_entry.fixture_names:
+            item.fixturenames.append(fixture_name)
+
     # Reorder tests so that tests with "model" in the name run first
     _reorder_tests_by_name(items)
 
@@ -596,6 +622,53 @@ def should_do_global_cleanup_after_test():
     """Skip global cleanup for Spyre - torch.accelerator.empty_cache() doesn't work yet."""
     return False
 
+@pytest.fixture()
+def patch_backend_list(request, monkeypatch):
+    """This fixture patches things for tests/v1/attention/test_attention_backends.py"""
+
+    # The BACKENDS_TO_TEST list has to be patched with only our backend
+    our_backend_list = [
+        AttentionBackendEnum.CUSTOM,
+    ]
+    test_module = request.node.module
+    monkeypatch.setattr(test_module, "BACKENDS_TO_TEST", our_backend_list)
+
+    # the batch specs need to be modified to have only a single sequence, because we don't yet
+    # support batched attention
+    our_batch_specs = {
+        "small_decode": test_module.BatchSpec(seq_lens=[40], query_lens=[1]),
+        "small_prefill": test_module.BatchSpec(seq_lens=[40], query_lens=[8]),
+        "mixed_small": test_module.BatchSpec(seq_lens=[48], query_lens=[5]),
+        "medium_decode": test_module.BatchSpec(
+            seq_lens=[1024],
+            query_lens=[1],
+        ),
+        "medium_prefill": test_module.BatchSpec(seq_lens=[1024], query_lens=[16]),
+        "mixed_medium": test_module.BatchSpec(seq_lens=[2048], query_lens=[1]),
+        "large_decode": test_module.BatchSpec(seq_lens=[2048], query_lens=[1]),
+        "large_prefill": test_module.BatchSpec(seq_lens=[4096], query_lens=[32]),
+        "mixed_large": test_module.BatchSpec(seq_lens=[4096], query_lens=[32]),
+        "single_decode": test_module.BatchSpec(seq_lens=[1024], query_lens=[1]),
+        "single_prefill": test_module.BatchSpec(seq_lens=[1024], query_lens=[64]),
+        # encoder-only
+        "small_encoder_prefill": test_module.BatchSpec(seq_lens=[32], query_lens=[32]),
+        "medium_encoder_prefill": test_module.BatchSpec(seq_lens=[256], query_lens=[256]),
+    }
+    monkeypatch.setattr(test_module, "BATCH_SPECS", our_batch_specs)
+
+    # _test_backend_correctness may be called with a hardcoded AttentionBackendEnum.FLASH_ATTN, 
+    # which we want to ignore
+    orig_tbc = test_module._test_backend_correctness
+    def tbc_wrapper(batch_spec, model, backend_to_test: list[AttentionBackendEnum | str], *args, **kwargs):
+        if "AttentionBackendEnum.FLEX_ATTENTION" in str(backend_to_test):
+            print("skipping bad invocation of _test_backend_correctness for LARGE_BLOCK_BACKENDS")
+            return
+        print("running tbc for ", backend_to_test)
+        return orig_tbc(batch_spec, model, backend_to_test, *args, **kwargs)
+    monkeypatch.setattr(test_module, "_test_backend_correctness", tbc_wrapper)
+
+    yield
+    
 
 @pytest.hookimpl(tryfirst=True)
 def pytest_fixture_setup(fixturedef, request):
diff --git a/vllm_spyre_next/vllm_spyre_next/testing/upstream_tests.yaml b/vllm_spyre_next/vllm_spyre_next/testing/upstream_tests.yaml
index d273502c7..3ee1f7737 100644
--- a/vllm_spyre_next/vllm_spyre_next/testing/upstream_tests.yaml
+++ b/vllm_spyre_next/vllm_spyre_next/testing/upstream_tests.yaml
@@ -1,7 +1,7 @@
 # Upstream test filter configuration for vllm-spyre-next.
 #
 # Only tests listed here will run from upstream vLLM. All other upstream
-# tests are skipped by default (opt-in / whitelist model).
+# tests are skipped by default (opt-in / allowlist model).
 #
 # block_list entries take precedence over allow_list entries.
 #
@@ -14,6 +14,9 @@
 #                     Parameter name -> list of values to skip
 #   allow_list[].params.override
 #                     Parameter name -> replacement values (replaces upstream defaults)
+#   allow_list[].fixture_names
+#                     Names of extra pytest fixtures to request for this test;
+#                     appended to the test's fixturenames during collection.
 #   block_list[].test fnmatch glob matched against test function name
 
 tests:
@@ -42,8 +45,18 @@ tests:
           tags: [facebook, upstream, uses_subprocess]
           params:
             allow:  # skip every model except facebook/opt-125m
-              model: 
+              model:
               - facebook/opt-125m
-      block_list:
-        - test: "test_fused_rms_norm_quant"
+    - rel_path: tests/v1/attention/test_attention_backends.py
+      allow_list:
+        - test: "test_causal_backend_correctness"
+          mode: mandatory_pass
+          tags: [attention, upstream]
+          params:
+            allow:  # skip TP cases that we don't support
+              tensor_parallel_size:
+              - 1
+          fixture_names:
+          - "patch_backend_list"
+
 

From 2a73e5bc71527104a65abe853bf8f838831d5e44 Mon Sep 17 00:00:00 2001
From: Joe Runde <joe@joerun.de>
Date: Wed, 1 Apr 2026 08:31:32 -0600
Subject: [PATCH 2/4] :art: fmt; drop unused import and debug prints

Signed-off-by: Joe Runde <joe@joerun.de>
---
 .../vllm_spyre_next/testing/pytest_plugin.py  | 26 +++++++++++--------
 1 file changed, 15 insertions(+), 11 deletions(-)

diff --git a/vllm_spyre_next/vllm_spyre_next/testing/pytest_plugin.py b/vllm_spyre_next/vllm_spyre_next/testing/pytest_plugin.py
index 4a3cbbf63..4143d3135 100644
--- a/vllm_spyre_next/vllm_spyre_next/testing/pytest_plugin.py
+++ b/vllm_spyre_next/vllm_spyre_next/testing/pytest_plugin.py
@@ -34,7 +34,6 @@
 from __future__ import annotations
 
 import fnmatch
-import inspect
 import os
 import re
 import subprocess
@@ -330,16 +329,18 @@ def _prepare_upstream_tests_dir() -> Path:
 
 def _spicy_code_edits(upstream_tests_dir: Path):
     """Apply spicy code edits to the upstream tests directory.
-    
+
     These should be _temporary_ edits to source code for vllm tests while we work to make them more
     portable. This should only be used where mocking is not possible or too cumbersome.
     """
 
     # Mocking out torch.device seems impossible to do (at least multiple rounds of Bob and Claude
-    # were unsuccessful). So we patch the source code to change `torch.device("cuda:0")` to 
-    # `torch.device("cpu")`.
-    hardcoded_cuda_test_path = upstream_tests_dir / "v1" / "attention" / "test_attention_backends.py"
-    with open(hardcoded_cuda_test_path, "r") as f:
+    # were unsuccessful). So we patch the source code to change the hardcoded
+    # `torch.device("cuda:0")` to `torch.device("cpu")`.
+    hardcoded_cuda_test_path = (
+        upstream_tests_dir / "v1" / "attention" / "test_attention_backends.py"
+    )
+    with open(hardcoded_cuda_test_path) as f:
         content = f.read()
     content = content.replace('torch.device("cuda:0")', 'torch.device("cpu")')
     with open(hardcoded_cuda_test_path, "w") as f:
@@ -622,6 +623,7 @@ def should_do_global_cleanup_after_test():
     """Skip global cleanup for Spyre - torch.accelerator.empty_cache() doesn't work yet."""
     return False
 
+
 @pytest.fixture()
 def patch_backend_list(request, monkeypatch):
     """This fixture patches things for tests/v1/attention/test_attention_backends.py"""
@@ -656,19 +658,21 @@ def patch_backend_list(request, monkeypatch):
     }
     monkeypatch.setattr(test_module, "BATCH_SPECS", our_batch_specs)
 
-    # _test_backend_correctness may be called with a hardcoded AttentionBackendEnum.FLASH_ATTN, 
+    # _test_backend_correctness may be called with a hardcoded AttentionBackendEnum.FLASH_ATTN,
     # which we want to ignore
     orig_tbc = test_module._test_backend_correctness
-    def tbc_wrapper(batch_spec, model, backend_to_test: list[AttentionBackendEnum | str], *args, **kwargs):
+
+    def tbc_wrapper(
+        batch_spec, model, backend_to_test: list[AttentionBackendEnum | str], *args, **kwargs
+    ):
         if "AttentionBackendEnum.FLEX_ATTENTION" in str(backend_to_test):
-            print("skipping bad invocation of _test_backend_correctness for LARGE_BLOCK_BACKENDS")
             return
-        print("running tbc for ", backend_to_test)
         return orig_tbc(batch_spec, model, backend_to_test, *args, **kwargs)
+
     monkeypatch.setattr(test_module, "_test_backend_correctness", tbc_wrapper)
 
     yield
-    
+
 
 @pytest.hookimpl(tryfirst=True)
 def pytest_fixture_setup(fixturedef, request):

From a2a3bba458130c12719fcf37a5a3fe53fbabbeb6 Mon Sep 17 00:00:00 2001
From: Joe Runde <joe@joerun.de>
Date: Mon, 6 Apr 2026 15:18:11 -0600
Subject: [PATCH 3/4] :fire: remove batch size 1 constraint

Signed-off-by: Joe Runde <joe@joerun.de>
---
 .../vllm_spyre_next/testing/pytest_plugin.py  | 23 -------------------
 1 file changed, 23 deletions(-)

diff --git a/vllm_spyre_next/vllm_spyre_next/testing/pytest_plugin.py b/vllm_spyre_next/vllm_spyre_next/testing/pytest_plugin.py
index 4143d3135..2851dc161 100644
--- a/vllm_spyre_next/vllm_spyre_next/testing/pytest_plugin.py
+++ b/vllm_spyre_next/vllm_spyre_next/testing/pytest_plugin.py
@@ -635,29 +635,6 @@ def patch_backend_list(request, monkeypatch):
     test_module = request.node.module
     monkeypatch.setattr(test_module, "BACKENDS_TO_TEST", our_backend_list)
 
-    # the batch specs need to be modified to have only a single sequence, because we don't yet
-    # support batched attention
-    our_batch_specs = {
-        "small_decode": test_module.BatchSpec(seq_lens=[40], query_lens=[1]),
-        "small_prefill": test_module.BatchSpec(seq_lens=[40], query_lens=[8]),
-        "mixed_small": test_module.BatchSpec(seq_lens=[48], query_lens=[5]),
-        "medium_decode": test_module.BatchSpec(
-            seq_lens=[1024],
-            query_lens=[1],
-        ),
-        "medium_prefill": test_module.BatchSpec(seq_lens=[1024], query_lens=[16]),
-        "mixed_medium": test_module.BatchSpec(seq_lens=[2048], query_lens=[1]),
-        "large_decode": test_module.BatchSpec(seq_lens=[2048], query_lens=[1]),
-        "large_prefill": test_module.BatchSpec(seq_lens=[4096], query_lens=[32]),
-        "mixed_large": test_module.BatchSpec(seq_lens=[4096], query_lens=[32]),
-        "single_decode": test_module.BatchSpec(seq_lens=[1024], query_lens=[1]),
-        "single_prefill": test_module.BatchSpec(seq_lens=[1024], query_lens=[64]),
-        # encoder-only
-        "small_encoder_prefill": test_module.BatchSpec(seq_lens=[32], query_lens=[32]),
-        "medium_encoder_prefill": test_module.BatchSpec(seq_lens=[256], query_lens=[256]),
-    }
-    monkeypatch.setattr(test_module, "BATCH_SPECS", our_batch_specs)
-
     # _test_backend_correctness may be called with a hardcoded AttentionBackendEnum.FLASH_ATTN,
     # which we want to ignore
     orig_tbc = test_module._test_backend_correctness

From b96fa555bef293af0338cbfdd3dfd1806d40572b Mon Sep 17 00:00:00 2001
From: Joe Runde <joe@joerun.de>
Date: Mon, 6 Apr 2026 15:25:45 -0600
Subject: [PATCH 4/4] :recycle: rename _spicy_code_edits -> _temp_upstream_code_edits

Signed-off-by: Joe Runde <joe@joerun.de>
---
 vllm_spyre_next/vllm_spyre_next/testing/pytest_plugin.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/vllm_spyre_next/vllm_spyre_next/testing/pytest_plugin.py b/vllm_spyre_next/vllm_spyre_next/testing/pytest_plugin.py
index 2851dc161..8cb0f8140 100644
--- a/vllm_spyre_next/vllm_spyre_next/testing/pytest_plugin.py
+++ b/vllm_spyre_next/vllm_spyre_next/testing/pytest_plugin.py
@@ -327,8 +327,8 @@ def _prepare_upstream_tests_dir() -> Path:
     return tests_dir
 
 
-def _spicy_code_edits(upstream_tests_dir: Path):
-    """Apply spicy code edits to the upstream tests directory.
+def _temp_upstream_code_edits(upstream_tests_dir: Path):
+    """Apply small code edits to the upstream tests directory before importing.
 
     These should be _temporary_ edits to source code for vllm tests while we work to make them more
     portable. This should only be used where mocking is not possible or too cumbersome.
@@ -387,7 +387,7 @@ def pytest_configure(config):
         try:
             # Clone vLLM to cache
             upstream_tests_base = _prepare_upstream_tests_dir()
-            _spicy_code_edits(upstream_tests_base)
+            _temp_upstream_code_edits(upstream_tests_base)
             config._upstream_tests_base = upstream_tests_base
 
             # Determine which test paths to inject