From 5207f29d43a43acdd3766704a05046b469d5eb67 Mon Sep 17 00:00:00 2001
From: Andreas Karatzas
Date: Wed, 25 Feb 2026 00:18:13 -0600
Subject: [PATCH 1/4] [ROCm][CI] Extending attention backend coverage for
 Eagle spec decode tests

Signed-off-by: Andreas Karatzas
---
 tests/v1/e2e/test_spec_decode.py | 21 ++++++++++++++-------
 1 file changed, 14 insertions(+), 7 deletions(-)

diff --git a/tests/v1/e2e/test_spec_decode.py b/tests/v1/e2e/test_spec_decode.py
index a141e9da08a1..52a02beb6363 100644
--- a/tests/v1/e2e/test_spec_decode.py
+++ b/tests/v1/e2e/test_spec_decode.py
@@ -469,12 +469,17 @@ def test_eagle_correctness(
     # Determine attention config
     # Scout requires default backend selection because vision encoder has
     # head_dim 88 being incompatible with FLASH_ATTN and needs to fall back
-    # to Flex Attn
-    if "Llama-4-Scout" in model_setup[1] and attn_backend == "FLASH_ATTN":
-        if current_platform.is_rocm():
-            # TODO: Enable Flex Attn for spec_decode on ROCm
-            pytest.skip("Flex Attn for spec_decode not supported on ROCm currently")
-        attention_config = None  # Let it fall back to default
+    # to FLEX_ATTENTION
+    if (
+        "Llama-4-Scout" in model_setup[1]
+        and attn_backend == "FLASH_ATTN"
+        and current_platform.is_rocm()
+    ):
+        print(
+            "FLASH_ATTN for spec_decode not supported on "
+            "ROCm currently. Changing to FLEX_ATTENTION backend."
+        )
+        attention_config = {"backend": "FLEX_ATTENTION"}
     else:
         attention_config = {"backend": attn_backend}
 
@@ -489,7 +494,9 @@ def test_eagle_correctness(
 
         if attn_backend == "ROCM_AITER_FA" and current_platform.is_rocm():
             if "deepseek" in model_setup[1].lower():
-                pytest.skip("ROCM_AITER_FA for deepseek not supported on ROCm platform")
+                m.setenv("VLLM_ROCM_USE_AITER", "1")
+                m.delenv("VLLM_MLA_DISABLE", raising=False)
+                attention_config = {"backend": "TRITON_MLA"}
             else:
                 m.setenv("VLLM_ROCM_USE_AITER", "1")
 

From 050936bdf56d473c659b104a4e7caba524cdaa41 Mon Sep 17 00:00:00 2001
From: Andreas Karatzas
Date: Wed, 25 Feb 2026 00:25:41 -0600
Subject: [PATCH 2/4] Fix Llama-4-Scout FLASH_ATTN handling: use
 FLEX_ATTENTION on ROCm, preserve None fallback on NVIDIA

Signed-off-by: Andreas Karatzas
---
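Notes: a condensed sketch of the selection logic this patch leaves in place
(not the literal hunk; `model_name` here stands in for `model_setup[1]`):

    if "Llama-4-Scout" in model_name and attn_backend == "FLASH_ATTN":
        if current_platform.is_rocm():
            # ROCm: fall back explicitly, since FLASH_ATTN cannot serve the
            # vision encoder's head_dim of 88
            attention_config = {"backend": "FLEX_ATTENTION"}
        else:
            # NVIDIA: leave the config unset so default backend selection
            # can pick something compatible (e.g. FLEX_ATTENTION)
            attention_config = None
    else:
        attention_config = {"backend": attn_backend}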
 tests/v1/e2e/test_spec_decode.py | 19 +++++++++----------
 1 file changed, 9 insertions(+), 10 deletions(-)

diff --git a/tests/v1/e2e/test_spec_decode.py b/tests/v1/e2e/test_spec_decode.py
index 52a02beb6363..68e3b2a64eb2 100644
--- a/tests/v1/e2e/test_spec_decode.py
+++ b/tests/v1/e2e/test_spec_decode.py
@@ -470,16 +470,15 @@ def test_eagle_correctness(
     # Scout requires default backend selection because vision encoder has
     # head_dim 88 being incompatible with FLASH_ATTN and needs to fall back
     # to FLEX_ATTENTION
-    if (
-        "Llama-4-Scout" in model_setup[1]
-        and attn_backend == "FLASH_ATTN"
-        and current_platform.is_rocm()
-    ):
-        print(
-            "FLASH_ATTN for spec_decode not supported on "
-            "ROCm currently. Changing to FLEX_ATTENTION backend."
-        )
-        attention_config = {"backend": "FLEX_ATTENTION"}
+    if "Llama-4-Scout" in model_setup[1] and attn_backend == "FLASH_ATTN":
+        if current_platform.is_rocm():
+            print(
+                "FLASH_ATTN for spec_decode not supported on "
+                "ROCm currently. Changing to FLEX_ATTENTION backend."
+            )
+            attention_config = {"backend": "FLEX_ATTENTION"}
+        else:
+            attention_config = None
     else:
         attention_config = {"backend": attn_backend}
 

From c7d160c9787b22d98e00e7de28bce97473da9b30 Mon Sep 17 00:00:00 2001
From: Andreas Karatzas
Date: Wed, 25 Feb 2026 11:59:05 -0600
Subject: [PATCH 3/4] [ROCm][CI] Resolving extended v1 e2e test queue time

Signed-off-by: Andreas Karatzas
---
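Notes: this patch routes the v1 engine suite to a single-GPU queue (mi325_1)
and adds GPU-tier marks so each workload is only scheduled where it fits. A
minimal usage sketch of the new helpers (hypothetical test names):

    from tests.utils import multi_gpu_only, single_gpu_only

    @single_gpu_only             # skipped when more than one GPU is visible
    def test_light_workload():
        ...

    @multi_gpu_only(num_gpus=2)  # skipped when fewer than two GPUs are visible
    def test_tp2_workload():
        ...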
 .buildkite/test_areas/engine.yaml     |   2 +-
 tests/utils.py                        |  51 ++++++
 tests/v1/e2e/test_async_scheduling.py |   4 +
 tests/v1/e2e/test_spec_decode.py      | 223 +++++++++++++++++++-------
 4 files changed, 219 insertions(+), 61 deletions(-)

diff --git a/.buildkite/test_areas/engine.yaml b/.buildkite/test_areas/engine.yaml
index 4f2380592d9e..19cd91370e64 100644
--- a/.buildkite/test_areas/engine.yaml
+++ b/.buildkite/test_areas/engine.yaml
@@ -30,7 +30,7 @@ steps:
     - pytest -v -s v1/engine --ignore v1/engine/test_preprocess_error_handling.py
   mirror:
     amd:
-      device: mi325_8
+      device: mi325_1
       depends_on:
       - image-build-amd
       commands:

diff --git a/tests/utils.py b/tests/utils.py
index 75d33e509528..4041c261788c 100644
--- a/tests/utils.py
+++ b/tests/utils.py
@@ -1327,6 +1327,57 @@ def wrapper(f: Callable[_P, None]) -> Callable[_P, None]:
     return wrapper
 
 
+def gpu_tier_mark(*, min_gpus: int = 1, max_gpus: int | None = None):
+    """
+    Mark a test to only run when the GPU count falls within [min_gpus, max_gpus].
+
+    Examples:
+        @gpu_tier_mark(min_gpus=2)              # only on multi-GPU
+        @gpu_tier_mark(max_gpus=1)              # only on single-GPU
+        @gpu_tier_mark(min_gpus=2, max_gpus=4)  # 2-4 GPUs only
+    """
+    gpu_count = cuda_device_count_stateless()
+    marks = []
+
+    if min_gpus > 1:
+        marks.append(pytest.mark.distributed(num_gpus=min_gpus))
+
+    reasons = []
+    if gpu_count < min_gpus:
+        reasons.append(f"Need at least {min_gpus} GPUs (have {gpu_count})")
+    if max_gpus is not None and gpu_count > max_gpus:
+        reasons.append(f"Need at most {max_gpus} GPUs (have {gpu_count})")
+
+    if reasons:
+        marks.append(pytest.mark.skipif(True, reason="; ".join(reasons)))
+
+    return marks
+
+
+def single_gpu_only(f=None):
+    """Skip this test when running in a multi-GPU environment."""
+    marks = gpu_tier_mark(max_gpus=1)
+
+    def wrapper(func):
+        for mark in reversed(marks):
+            func = mark(func)
+        return func
+
+    return wrapper(f) if f is not None else wrapper
+
+
+def multi_gpu_only(*, num_gpus: int = 2):
+    """Skip this test when running on fewer than num_gpus GPUs."""
+    marks = gpu_tier_mark(min_gpus=num_gpus)
+
+    def wrapper(f):
+        for mark in reversed(marks):
+            f = mark(f)
+        return f
+
+    return wrapper
+
+
 async def completions_with_server_args(
     prompts: list[str],
     model_name: str,

diff --git a/tests/v1/e2e/test_async_scheduling.py b/tests/v1/e2e/test_async_scheduling.py
index b85f8880cf8e..393c8dbeecfe 100644
--- a/tests/v1/e2e/test_async_scheduling.py
+++ b/tests/v1/e2e/test_async_scheduling.py
@@ -6,6 +6,7 @@
 import pytest
 import torch._dynamo.config as dynamo_config
 
+from tests.utils import large_gpu_mark, single_gpu_only
 from vllm import SamplingParams
 from vllm.logprobs import Logprob
 from vllm.platforms import current_platform
@@ -36,6 +37,7 @@
 )
 
 
+@single_gpu_only
 def test_without_spec_decoding(
     sample_json_schema,
     monkeypatch: pytest.MonkeyPatch,
@@ -95,6 +97,8 @@ def test_without_spec_decoding(
     run_tests(monkeypatch, MODEL, test_configs, test_sampling_params)
 
 
+@single_gpu_only
+@large_gpu_mark(min_gb=16)
 def test_with_spec_decoding(sample_json_schema, monkeypatch: pytest.MonkeyPatch):
     """Test consistency and acceptance rates with some different combos of preemption,
     executor, async scheduling, prefill chunking,

diff --git a/tests/v1/e2e/test_spec_decode.py b/tests/v1/e2e/test_spec_decode.py
index bf6638539fa3..c6327e603aef 100644
--- a/tests/v1/e2e/test_spec_decode.py
+++ b/tests/v1/e2e/test_spec_decode.py
@@ -9,7 +9,13 @@
 import torch
 
 from tests.evals.gsm8k.gsm8k_eval import _build_gsm8k_prompts, evaluate_gsm8k_offline
-from tests.utils import get_attn_backend_list_based_on_platform, large_gpu_mark
+from tests.utils import (
+    get_attn_backend_list_based_on_platform,
+    large_gpu_mark,
+    multi_gpu_marks,
+    multi_gpu_only,
+    single_gpu_only,
+)
 from vllm import LLM, SamplingParams
 from vllm.assets.base import VLLM_S3_BUCKET_URL
 from vllm.assets.image import VLM_IMAGES_DIR
@@ -160,6 +166,8 @@ def reset_torch_dynamo():
         },
     ],
 )
+@single_gpu_only
+@large_gpu_mark(min_gb=20)
 def test_ngram_and_suffix_correctness(
     speculative_config: dict,
     model_name: str,
@@ -175,6 +183,8 @@ def test_ngram_and_suffix_correctness(
     cleanup_dist_env_and_memory()
 
 
+@single_gpu_only
+@large_gpu_mark(min_gb=20)
 def test_suffix_decoding_acceptance(
     monkeypatch: pytest.MonkeyPatch,
     sampling_config: SamplingParams,
@@ -242,6 +252,8 @@
     ],
     ids=["llama3_eagle3_speculator", "qwen3_eagle3_speculator"],
 )
+@single_gpu_only
+@large_gpu_mark(min_gb=24)
 def test_speculators_model_integration(
     monkeypatch: pytest.MonkeyPatch,
     sampling_config: SamplingParams,
@@ -319,6 +331,56 @@ def test_speculators_model_integration(
     )
 
 
+@single_gpu_only
+@pytest.mark.parametrize(
+    [
+        "model_setup",
+        "mm_enabled",
+        "enable_chunked_prefill",
+        "model_impl",
+        "expected_accuracy_threshold",
+    ],
+    [
+        (
+            (
+                "eagle",
+                "eagle618/deepseek-v3-random",
+                "eagle618/eagle-deepseek-v3-random",
+                1,
+            ),
+            False,
+            False,
+            "auto",
+            0.0,
+        ),
+    ],
+    ids=["deepseek_eagle"],
+)
+@pytest.mark.parametrize("attn_backend", get_attn_backend_list_based_on_platform())
+def test_eagle_correctness_light(
+    monkeypatch: pytest.MonkeyPatch,
+    sampling_config: SamplingParams,
+    model_setup: tuple[str, str, str, int],
+    mm_enabled: bool,
+    expected_accuracy_threshold: float,
+    enable_chunked_prefill: bool,
+    model_impl: str,
+    attn_backend: str,
+):
+    _run_eagle_correctness(
+        monkeypatch,
+        sampling_config,
+        model_setup,
+        mm_enabled,
+        expected_accuracy_threshold,
+        enable_chunked_prefill,
+        model_impl,
+        attn_backend,
+    )
+
+
+@single_gpu_only
+@large_gpu_mark(min_gb=24)
 @pytest.mark.parametrize(
     [
         "model_setup",
@@ -333,14 +395,14 @@ def test_speculators_model_integration(
             False,
             False,
             "auto",
-            0.8,  # ref: 90%
+            0.8,
         ),
         (
             ("eagle3", "Qwen/Qwen3-8B", "AngelSlim/Qwen3-8B_eagle3", 1),
             False,
             False,
             "transformers",
-            0.8,  # ref: 90%
+            0.8,
         ),
         pytest.param(
             (
@@ -367,35 +429,77 @@ def test_speculators_model_integration(
             False,
             False,
             "auto",
-            0.8,  # ref: 90%
+            0.8,
             marks=pytest.mark.skip(
                 reason="architecture of its eagle3 is LlamaForCausalLMEagle3"
             ),
         ),
         pytest.param(
             (
                 "eagle3",
                 "Qwen/Qwen2.5-VL-7B-Instruct",
                 "Rayzl/qwen2.5-vl-7b-eagle3-sgl",
                 1,
             ),
             False,
             False,
             "auto",
-            0.7,  # TODO, update this with a reference value when re-enabling this case
+            0.7,
             marks=pytest.mark.skip(
-                reason="Skipping due to its head_dim not being a a multiple of 32"
+                reason="Skipping due to its head_dim not being a multiple of 32"
             ),
         ),
-        pytest.param(
-            (
-                "eagle",
-                "meta-llama/Llama-3.1-8B-Instruct",
-                "yuhuili/EAGLE-LLaMA3.1-Instruct-8B",
-                1,
-            ),
-            False,
-            True,
-            "auto",
-            0.7,  # ref: 75%-80%
-            marks=large_gpu_mark(min_gb=40),
-        ),  # works on 4x H100
         (
             (
                 "eagle3",
                 "meta-llama/Llama-3.1-8B-Instruct",
                 "yuhuili/EAGLE3-LLaMA3.1-Instruct-8B",
                 1,
             ),
             False,
             False,
             "auto",
-            0.7,  # ref: 75%-80%
+            0.7,
         ),
+    ],
+    ids=[
+        "qwen3_eagle3",
+        "qwen3_eagle3-transformers",
+        "qwen3_vl_eagle3",
+        "qwen2_5_vl_eagle3",
+        "llama3_eagle3",
+    ],
+)
+@pytest.mark.parametrize("attn_backend", get_attn_backend_list_based_on_platform())
+def test_eagle_correctness_medium(
+    monkeypatch: pytest.MonkeyPatch,
+    sampling_config: SamplingParams,
+    model_setup: tuple[str, str, str, int],
+    mm_enabled: bool,
+    expected_accuracy_threshold: float,
+    enable_chunked_prefill: bool,
+    model_impl: str,
+    attn_backend: str,
+):
+    _run_eagle_correctness(
+        monkeypatch,
+        sampling_config,
+        model_setup,
+        mm_enabled,
+        expected_accuracy_threshold,
+        enable_chunked_prefill,
+        model_impl,
+        attn_backend,
+    )
+
+
+@pytest.mark.parametrize(
+    [
+        "model_setup",
+        "mm_enabled",
+        "enable_chunked_prefill",
+        "model_impl",
+        "expected_accuracy_threshold",
+    ],
+    [
+        pytest.param(
+            (
+                "eagle",
+                "meta-llama/Llama-3.1-8B-Instruct",
+                "yuhuili/EAGLE-LLaMA3.1-Instruct-8B",
+                1,
+            ),
+            False,
+            True,
+            "auto",
+            0.7,
+            marks=large_gpu_mark(min_gb=40),
+            id="llama3_eagle",
+        ),
         pytest.param(
             (
@@ -407,9 +511,10 @@
             False,
             False,
             "auto",
-            0.8,  # ref: 90%
-            # marks=large_gpu_mark(min_gb=80),
-        ),  # works on 4x H100
+            0.8,
+            marks=multi_gpu_marks(num_gpus=4),
+            id="llama4_eagle",
+        ),
         pytest.param(
             (
                 "eagle",
@@ -420,36 +525,36 @@
             True,
             True,
             "auto",
-            0.8,  # ref: 90%
-            marks=large_gpu_mark(min_gb=80),
-        ),  # works on 4x H100
-        (
-            (
-                "eagle",
-                "eagle618/deepseek-v3-random",
-                "eagle618/eagle-deepseek-v3-random",
-                1,
-            ),
-            False,
-            False,
-            "auto",
-            0.0,  # dummy model, skip gsm8k check
+            0.8,
+            marks=[*multi_gpu_marks(num_gpus=4), large_gpu_mark(min_gb=80)],
+            id="llama4_eagle_mm",
         ),
     ],
-    ids=[
-        "qwen3_eagle3",
-        "qwen3_eagle3-transformers",
-        "qwen3_vl_eagle3",
-        "qwen2_5_vl_eagle3",
-        "llama3_eagle",
-        "llama3_eagle3",
-        "llama4_eagle",
-        "llama4_eagle_mm",
-        "deepseek_eagle",
-    ],
 )
 @pytest.mark.parametrize("attn_backend", get_attn_backend_list_based_on_platform())
-def test_eagle_correctness(
+def test_eagle_correctness_heavy(
+    monkeypatch: pytest.MonkeyPatch,
+    sampling_config: SamplingParams,
+    model_setup: tuple[str, str, str, int],
+    mm_enabled: bool,
+    expected_accuracy_threshold: float,
+    enable_chunked_prefill: bool,
+    model_impl: str,
+    attn_backend: str,
+):
+    _run_eagle_correctness(
+        monkeypatch,
+        sampling_config,
+        model_setup,
+        mm_enabled,
+        expected_accuracy_threshold,
+        enable_chunked_prefill,
+        model_impl,
+        attn_backend,
+    )
+
+
+def _run_eagle_correctness(
     monkeypatch: pytest.MonkeyPatch,
     sampling_config: SamplingParams,
@@ -460,14 +565,10 @@
 ):
     """
-    Compare the outputs of a original LLM and a speculative LLM
-    which should be the same when using eagle speculative decoding. Due to some variance
-    in the engine, it is possible for some outputs to differ, so we expect that at least
-    6/10 output tokens match exactly, and that the GSM8k accuracy is above
-    a precomputed reference threshold for each model.
+    Compare the outputs of an original LLM and a speculative LLM
+    which should be the same when using eagle speculative decoding.
     """
     if attn_backend == "TREE_ATTN":
-        # TODO: Fix this flaky test
         pytest.skip(
             "TREE_ATTN is flaky in the test disable for now until it can be "
             "resolved (see https://github.com/vllm-project/vllm/issues/22922)"
         )
@@ -484,12 +585,8 @@
             f"transformers>={required}, but got {installed}"
         )
 
-    # Generate test prompts inside the function instead of using fixture
     test_prompts = get_test_prompts(mm_enabled)
-    # Determine attention config
-    # Scout requires default backend selection because vision encoder has
-    # head_dim 88 being incompatible with FLASH_ATTN and needs to fall back
-    # to FLEX_ATTENTION
+
     if "Llama-4-Scout" in model_setup[1] and attn_backend == "FLASH_ATTN":
         if current_platform.is_rocm():
             print(
@@ -569,8 +666,6 @@
             print(f"ref_output: {ref_output.outputs[0].text}")
             print(f"spec_output: {spec_output.outputs[0].text}")
 
-        # Heuristic: expect at least 60% of the prompts to match exactly
-        # Upon failure, inspect the outputs to check for inaccuracy.
         assert matches > int(0.6 * len(ref_outputs))
         del spec_llm
         torch.cuda.empty_cache()
@@ -585,6 +680,8 @@
     ],
     ids=["mimo", "deepseek"],
 )
+@single_gpu_only
+@large_gpu_mark(min_gb=20)
 def test_mtp_correctness(
     monkeypatch: pytest.MonkeyPatch,
     sampling_config: SamplingParams,
@@ -700,11 +797,13 @@ class ArgsTest:
 
 @pytest.mark.parametrize("args", cases)
 @pytest.mark.parametrize("enforce_eager", [True, False])
+@single_gpu_only
 def test_draft_model_correctness(args: ArgsTest, enforce_eager: bool):
     args.enforce_eager = enforce_eager
     assert_draft_model_correctness(args)
 
 
+@single_gpu_only
 def test_draft_model_realistic_example():
     args = ArgsTest(
         target_model="Qwen/Qwen3-1.7B",
@@ -719,6 +818,7 @@ def test_draft_model_realistic_example():
     assert_draft_model_correctness(args)
 
 
+@single_gpu_only
 def test_draft_model_parallel_drafting():
     args = ArgsTest(
         target_model="Qwen/Qwen3-1.7B",
@@ -744,6 +844,7 @@ def test_draft_model_parallel_drafting():
     ids=["target_quantized", "draft_quantized"],
 )
 @pytest.mark.parametrize("enforce_eager", [True, False])
+@single_gpu_only
 def test_draft_model_quantization(models: tuple[str, str], enforce_eager: bool):
     tgt_model, draft_model = models
     sd_case = ArgsTest(
@@ -755,6 +856,7 @@ def test_draft_model_quantization(models: tuple[str, str], enforce_eager: bool):
     assert_draft_model_correctness(sd_case)
 
 
+@multi_gpu_only(num_gpus=2)
 def test_draft_model_tensor_parallelism():
     """Ensure spec decode works when running with TP > 1."""
     _skip_if_insufficient_gpus_for_tp(2)
@@ -770,6 +872,7 @@ def test_draft_model_tensor_parallelism():
     assert_draft_model_correctness(sd_case)
 
 
+@multi_gpu_only(num_gpus=2)
 def test_draft_model_engine_args_tensor_parallelism():
     """Ensure the vllm_config for the draft model is created correctly,
     and independently of the target model (quantization, TP, etc.)"""

From 757cebb3fdaa5a3b2e367472c88b3f49c53b423f Mon Sep 17 00:00:00 2001
From: Andreas Karatzas
Date: Wed, 25 Feb 2026 12:11:24 -0600
Subject: [PATCH 4/4] Moved the runner before the workload definitions

Signed-off-by: Andreas Karatzas
---
 tests/v1/e2e/test_spec_decode.py | 236 +++++++++++++++++----------------
 1 file changed, 118 insertions(+), 118 deletions(-)

diff --git a/tests/v1/e2e/test_spec_decode.py b/tests/v1/e2e/test_spec_decode.py
index c6327e603aef..7f2db19a0750 100644
--- a/tests/v1/e2e/test_spec_decode.py
+++ b/tests/v1/e2e/test_spec_decode.py
@@ -331,6 +331,124 @@ def test_speculators_model_integration(
     )
 
 
+def _run_eagle_correctness(
+    monkeypatch: pytest.MonkeyPatch,
+    sampling_config: SamplingParams,
+    model_setup: tuple[str, str, str, int],
+    mm_enabled: bool,
+    expected_accuracy_threshold: float,
+    enable_chunked_prefill: bool,
+    model_impl: str,
+    attn_backend: str,
+):
+    """
+    Compare the outputs of an original LLM and a speculative LLM
+    which should be the same when using eagle speculative decoding.
+    """
+    if attn_backend == "TREE_ATTN":
+        pytest.skip(
+            "TREE_ATTN is flaky in the test disable for now until it can be "
+            "resolved (see https://github.com/vllm-project/vllm/issues/22922)"
+        )
+    if model_impl == "transformers":
+        import transformers
+        from packaging.version import Version
+
+        installed = Version(transformers.__version__)
+        required = Version("5.0.0")
+        if installed < required:
+            pytest.skip(
+                "Eagle3 with the Transformers modeling backend requires "
+                f"transformers>={required}, but got {installed}"
+            )
+
+    test_prompts = get_test_prompts(mm_enabled)
+
+    if "Llama-4-Scout" in model_setup[1] and attn_backend == "FLASH_ATTN":
+        if current_platform.is_rocm():
+            print(
+                "FLASH_ATTN for spec_decode not supported on "
+                "ROCm currently. Changing to FLEX_ATTENTION backend."
+            )
+            attention_config = {"backend": "FLEX_ATTENTION"}
+        else:
+            attention_config = None
+    else:
+        attention_config = {"backend": attn_backend}
+
+    if attn_backend == "TRITON_ATTN" and not current_platform.is_rocm():
+        pytest.skip(
+            "TRITON_ATTN does not support "
+            "multi-token eagle spec decode on current platform"
+        )
+
+    with monkeypatch.context() as m:
+        m.setenv("VLLM_MLA_DISABLE", "1")
+
+        if attn_backend == "ROCM_AITER_FA" and current_platform.is_rocm():
+            if "deepseek" in model_setup[1].lower():
+                m.setenv("VLLM_ROCM_USE_AITER", "1")
+                m.delenv("VLLM_MLA_DISABLE", raising=False)
+                attention_config = {"backend": "TRITON_MLA"}
+            else:
+                m.setenv("VLLM_ROCM_USE_AITER", "1")
+
+        method, model_name, spec_model_name, tp_size = model_setup
+        _skip_if_insufficient_gpus_for_tp(tp_size)
+
+        max_model_len = 2048
+        max_num_batched_tokens = 128 if enable_chunked_prefill else max_model_len
+
+        ref_llm = LLM(
+            model=model_name,
+            max_model_len=max_model_len,
+            tensor_parallel_size=tp_size,
+            attention_config=attention_config,
+        )
+        evaluate_llm_for_gsm8k(
+            ref_llm, expected_accuracy_threshold=expected_accuracy_threshold
+        )
+        ref_outputs = ref_llm.chat(test_prompts, sampling_config)
+        del ref_llm
+        torch.cuda.empty_cache()
+        cleanup_dist_env_and_memory()
+
+        spec_llm = LLM(
+            model=model_name,
+            trust_remote_code=True,
+            tensor_parallel_size=tp_size,
+            speculative_config={
+                "method": method,
+                "model": spec_model_name,
+                "num_speculative_tokens": 3,
+                "max_model_len": max_model_len,
+            },
+            max_model_len=max_model_len,
+            max_num_batched_tokens=max_num_batched_tokens,
+            enable_chunked_prefill=enable_chunked_prefill,
+            model_impl=model_impl,
+            attention_config=attention_config,
+        )
+        evaluate_llm_for_gsm8k(
+            spec_llm, expected_accuracy_threshold=expected_accuracy_threshold
+        )
+        spec_outputs = spec_llm.chat(test_prompts, sampling_config)
+        matches = 0
+        misses = 0
+        for ref_output, spec_output in zip(ref_outputs, spec_outputs):
+            if ref_output.outputs[0].text == spec_output.outputs[0].text:
+                matches += 1
+            else:
+                misses += 1
+                print(f"ref_output: {ref_output.outputs[0].text}")
+                print(f"spec_output: {spec_output.outputs[0].text}")
+
+        assert matches > int(0.6 * len(ref_outputs))
+        del spec_llm
+        torch.cuda.empty_cache()
+        cleanup_dist_env_and_memory()
+
+
 @single_gpu_only
 @pytest.mark.parametrize(
     [
@@ -554,124 +672,6 @@ def test_eagle_correctness_heavy(
     )
 
 
-def _run_eagle_correctness(
-    monkeypatch: pytest.MonkeyPatch,
-    sampling_config: SamplingParams,
-    model_setup: tuple[str, str, str, int],
-    mm_enabled: bool,
-    expected_accuracy_threshold: float,
-    enable_chunked_prefill: bool,
-    model_impl: str,
-    attn_backend: str,
-):
-    """
-    Compare the outputs of an original LLM and a speculative LLM
-    which should be the same when using eagle speculative decoding.
-    """
-    if attn_backend == "TREE_ATTN":
-        pytest.skip(
-            "TREE_ATTN is flaky in the test disable for now until it can be "
-            "resolved (see https://github.com/vllm-project/vllm/issues/22922)"
-        )
-    if model_impl == "transformers":
-        import transformers
-        from packaging.version import Version
-
-        installed = Version(transformers.__version__)
-        required = Version("5.0.0")
-        if installed < required:
-            pytest.skip(
-                "Eagle3 with the Transformers modeling backend requires "
-                f"transformers>={required}, but got {installed}"
-            )
-
-    test_prompts = get_test_prompts(mm_enabled)
-
-    if "Llama-4-Scout" in model_setup[1] and attn_backend == "FLASH_ATTN":
-        if current_platform.is_rocm():
-            print(
-                "FLASH_ATTN for spec_decode not supported on "
-                "ROCm currently. Changing to FLEX_ATTENTION backend."
-            )
-            attention_config = {"backend": "FLEX_ATTENTION"}
-        else:
-            attention_config = None
-    else:
-        attention_config = {"backend": attn_backend}
-
-    if attn_backend == "TRITON_ATTN" and not current_platform.is_rocm():
-        pytest.skip(
-            "TRITON_ATTN does not support "
-            "multi-token eagle spec decode on current platform"
-        )
-
-    with monkeypatch.context() as m:
-        m.setenv("VLLM_MLA_DISABLE", "1")
-
-        if attn_backend == "ROCM_AITER_FA" and current_platform.is_rocm():
-            if "deepseek" in model_setup[1].lower():
-                m.setenv("VLLM_ROCM_USE_AITER", "1")
-                m.delenv("VLLM_MLA_DISABLE", raising=False)
-                attention_config = {"backend": "TRITON_MLA"}
-            else:
-                m.setenv("VLLM_ROCM_USE_AITER", "1")
-
-        method, model_name, spec_model_name, tp_size = model_setup
-        _skip_if_insufficient_gpus_for_tp(tp_size)
-
-        max_model_len = 2048
-        max_num_batched_tokens = 128 if enable_chunked_prefill else max_model_len
-
-        ref_llm = LLM(
-            model=model_name,
-            max_model_len=max_model_len,
-            tensor_parallel_size=tp_size,
-            attention_config=attention_config,
-        )
-        evaluate_llm_for_gsm8k(
-            ref_llm, expected_accuracy_threshold=expected_accuracy_threshold
-        )
-        ref_outputs = ref_llm.chat(test_prompts, sampling_config)
-        del ref_llm
-        torch.cuda.empty_cache()
-        cleanup_dist_env_and_memory()
-
-        spec_llm = LLM(
-            model=model_name,
-            trust_remote_code=True,
-            tensor_parallel_size=tp_size,
-            speculative_config={
-                "method": method,
-                "model": spec_model_name,
-                "num_speculative_tokens": 3,
-                "max_model_len": max_model_len,
-            },
-            max_model_len=max_model_len,
-            max_num_batched_tokens=max_num_batched_tokens,
-            enable_chunked_prefill=enable_chunked_prefill,
-            model_impl=model_impl,
-            attention_config=attention_config,
-        )
-        evaluate_llm_for_gsm8k(
-            spec_llm, expected_accuracy_threshold=expected_accuracy_threshold
-        )
-        spec_outputs = spec_llm.chat(test_prompts, sampling_config)
-        matches = 0
-        misses = 0
-        for ref_output, spec_output in zip(ref_outputs, spec_outputs):
-            if ref_output.outputs[0].text == spec_output.outputs[0].text:
-                matches += 1
-            else:
-                misses += 1
-                print(f"ref_output: {ref_output.outputs[0].text}")
-                print(f"spec_output: {spec_output.outputs[0].text}")
-
-        assert matches > int(0.6 * len(ref_outputs))
-        del spec_llm
-        torch.cuda.empty_cache()
-        cleanup_dist_env_and_memory()
-
-
 @pytest.mark.parametrize(
     ["model_setup", "mm_enabled", "expected_accuracy_threshold"],
     [