From 5207f29d43a43acdd3766704a05046b469d5eb67 Mon Sep 17 00:00:00 2001
From: Andreas Karatzas
Date: Wed, 25 Feb 2026 00:18:13 -0600
Subject: [PATCH 1/4] [ROCm][CI] Extending attention backend coverage for
 Eagle spec decode tests

Signed-off-by: Andreas Karatzas
---
 tests/v1/e2e/test_spec_decode.py | 21 ++++++++++++++-------
 1 file changed, 14 insertions(+), 7 deletions(-)

diff --git a/tests/v1/e2e/test_spec_decode.py b/tests/v1/e2e/test_spec_decode.py
index a141e9da08a1..52a02beb6363 100644
--- a/tests/v1/e2e/test_spec_decode.py
+++ b/tests/v1/e2e/test_spec_decode.py
@@ -469,12 +469,17 @@ def test_eagle_correctness(
     # Determine attention config
     # Scout requires default backend selection because vision encoder has
     # head_dim 88 being incompatible with FLASH_ATTN and needs to fall back
-    # to Flex Attn
-    if "Llama-4-Scout" in model_setup[1] and attn_backend == "FLASH_ATTN":
-        if current_platform.is_rocm():
-            # TODO: Enable Flex Attn for spec_decode on ROCm
-            pytest.skip("Flex Attn for spec_decode not supported on ROCm currently")
-        attention_config = None  # Let it fall back to default
+    # to FLEX_ATTENTION
+    if (
+        "Llama-4-Scout" in model_setup[1]
+        and attn_backend == "FLASH_ATTN"
+        and current_platform.is_rocm()
+    ):
+        print(
+            "FLASH_ATTN for spec_decode not supported on "
+            "ROCm currently. Changing to FLEX_ATTENTION backend."
+        )
+        attention_config = {"backend": "FLEX_ATTENTION"}
     else:
         attention_config = {"backend": attn_backend}
 
@@ -489,7 +494,9 @@ def test_eagle_correctness(
 
         if attn_backend == "ROCM_AITER_FA" and current_platform.is_rocm():
             if "deepseek" in model_setup[1].lower():
-                pytest.skip("ROCM_AITER_FA for deepseek not supported on ROCm platform")
+                m.setenv("VLLM_ROCM_USE_AITER", "1")
+                m.delenv("VLLM_MLA_DISABLE", raising=False)
+                attention_config = {"backend": "TRITON_MLA"}
             else:
                 m.setenv("VLLM_ROCM_USE_AITER", "1")
 

From 050936bdf56d473c659b104a4e7caba524cdaa41 Mon Sep 17 00:00:00 2001
From: Andreas Karatzas
Date: Wed, 25 Feb 2026 00:25:41 -0600
Subject: [PATCH 2/4] Fix Llama-4-Scout FLASH_ATTN handling: use
 FLEX_ATTENTION on ROCm, preserve None fallback on NVIDIA

Signed-off-by: Andreas Karatzas
---
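Notes: a condensed sketch of the selection logic this patch leaves in place
(not the literal hunk; `model_name` here stands in for `model_setup[1]`):

    if "Llama-4-Scout" in model_name and attn_backend == "FLASH_ATTN":
        if current_platform.is_rocm():
            # ROCm: fall back explicitly, since FLASH_ATTN cannot serve the
            # vision encoder's head_dim of 88
            attention_config = {"backend": "FLEX_ATTENTION"}
        else:
            # NVIDIA: leave the config unset so default backend selection
            # can pick something compatible (e.g. FLEX_ATTENTION)
            attention_config = None
    else:
        attention_config = {"backend": attn_backend}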
 tests/v1/e2e/test_spec_decode.py | 19 +++++++++----------
 1 file changed, 9 insertions(+), 10 deletions(-)

diff --git a/tests/v1/e2e/test_spec_decode.py b/tests/v1/e2e/test_spec_decode.py
index 52a02beb6363..68e3b2a64eb2 100644
--- a/tests/v1/e2e/test_spec_decode.py
+++ b/tests/v1/e2e/test_spec_decode.py
@@ -470,16 +470,15 @@ def test_eagle_correctness(
     # Scout requires default backend selection because vision encoder has
     # head_dim 88 being incompatible with FLASH_ATTN and needs to fall back
     # to FLEX_ATTENTION
-    if (
-        "Llama-4-Scout" in model_setup[1]
-        and attn_backend == "FLASH_ATTN"
-        and current_platform.is_rocm()
-    ):
-        print(
-            "FLASH_ATTN for spec_decode not supported on "
-            "ROCm currently. Changing to FLEX_ATTENTION backend."
-        )
-        attention_config = {"backend": "FLEX_ATTENTION"}
+    if "Llama-4-Scout" in model_setup[1] and attn_backend == "FLASH_ATTN":
+        if current_platform.is_rocm():
+            print(
+                "FLASH_ATTN for spec_decode not supported on "
+                "ROCm currently. Changing to FLEX_ATTENTION backend."
+            )
+            attention_config = {"backend": "FLEX_ATTENTION"}
+        else:
+            attention_config = None
     else:
         attention_config = {"backend": attn_backend}
 

From c7d160c9787b22d98e00e7de28bce97473da9b30 Mon Sep 17 00:00:00 2001
From: Andreas Karatzas
Date: Wed, 25 Feb 2026 11:59:05 -0600
Subject: [PATCH 3/4] [ROCm][CI] Resolving extended v1 e2e test queue time

Signed-off-by: Andreas Karatzas
---
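Notes: this patch routes the v1 engine suite to a single-GPU queue (mi325_1)
and adds GPU-tier marks so each workload is only scheduled where it fits. A
minimal usage sketch of the new helpers (hypothetical test names):

    from tests.utils import multi_gpu_only, single_gpu_only

    @single_gpu_only             # skipped when more than one GPU is visible
    def test_light_workload():
        ...

    @multi_gpu_only(num_gpus=2)  # skipped when fewer than two GPUs are visible
    def test_tp2_workload():
        ...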
 .buildkite/test_areas/engine.yaml     |   2 +-
 tests/utils.py                        |  51 ++++++
 tests/v1/e2e/test_async_scheduling.py |   4 +
 tests/v1/e2e/test_spec_decode.py      | 223 +++++++++++++++++++-------
 4 files changed, 219 insertions(+), 61 deletions(-)

diff --git a/.buildkite/test_areas/engine.yaml b/.buildkite/test_areas/engine.yaml
index 4f2380592d9e..19cd91370e64 100644
--- a/.buildkite/test_areas/engine.yaml
+++ b/.buildkite/test_areas/engine.yaml
@@ -30,7 +30,7 @@ steps:
     - pytest -v -s v1/engine --ignore v1/engine/test_preprocess_error_handling.py
   mirror:
     amd:
-      device: mi325_8
+      device: mi325_1
       depends_on:
       - image-build-amd
       commands:

diff --git a/tests/utils.py b/tests/utils.py
index 75d33e509528..4041c261788c 100644
--- a/tests/utils.py
+++ b/tests/utils.py
@@ -1327,6 +1327,57 @@ def wrapper(f: Callable[_P, None]) -> Callable[_P, None]:
     return wrapper
 
 
+def gpu_tier_mark(*, min_gpus: int = 1, max_gpus: int | None = None):
+    """
+    Mark a test to only run when the GPU count falls within [min_gpus, max_gpus].
+
+    Examples:
+        @gpu_tier_mark(min_gpus=2)              # only on multi-GPU
+        @gpu_tier_mark(max_gpus=1)              # only on single-GPU
+        @gpu_tier_mark(min_gpus=2, max_gpus=4)  # 2-4 GPUs only
+    """
+    gpu_count = cuda_device_count_stateless()
+    marks = []
+
+    if min_gpus > 1:
+        marks.append(pytest.mark.distributed(num_gpus=min_gpus))
+
+    reasons = []
+    if gpu_count < min_gpus:
+        reasons.append(f"Need at least {min_gpus} GPUs (have {gpu_count})")
+    if max_gpus is not None and gpu_count > max_gpus:
+        reasons.append(f"Need at most {max_gpus} GPUs (have {gpu_count})")
+
+    if reasons:
+        marks.append(pytest.mark.skipif(True, reason="; ".join(reasons)))
+
+    return marks
+
+
+def single_gpu_only(f=None):
+    """Skip this test when running in a multi-GPU environment."""
+    marks = gpu_tier_mark(max_gpus=1)
+
+    def wrapper(func):
+        for mark in reversed(marks):
+            func = mark(func)
+        return func
+
+    return wrapper(f) if f is not None else wrapper
+
+
+def multi_gpu_only(*, num_gpus: int = 2):
+    """Skip this test when running on fewer than num_gpus GPUs."""
+    marks = gpu_tier_mark(min_gpus=num_gpus)
+
+    def wrapper(f):
+        for mark in reversed(marks):
+            f = mark(f)
+        return f
+
+    return wrapper
+
+
 async def completions_with_server_args(
     prompts: list[str],
     model_name: str,

diff --git a/tests/v1/e2e/test_async_scheduling.py b/tests/v1/e2e/test_async_scheduling.py
index b85f8880cf8e..393c8dbeecfe 100644
--- a/tests/v1/e2e/test_async_scheduling.py
+++ b/tests/v1/e2e/test_async_scheduling.py
@@ -6,6 +6,7 @@
 import pytest
 import torch._dynamo.config as dynamo_config
 
+from tests.utils import large_gpu_mark, single_gpu_only
 from vllm import SamplingParams
 from vllm.logprobs import Logprob
 from vllm.platforms import current_platform
@@ -36,6 +37,7 @@
 )
 
 
+@single_gpu_only
 def test_without_spec_decoding(
     sample_json_schema,
     monkeypatch: pytest.MonkeyPatch,
@@ -95,6 +97,8 @@ def test_without_spec_decoding(
     run_tests(monkeypatch, MODEL, test_configs, test_sampling_params)
 
 
+@single_gpu_only
+@large_gpu_mark(min_gb=16)
 def test_with_spec_decoding(sample_json_schema, monkeypatch: pytest.MonkeyPatch):
     """Test consistency and acceptance rates with some different combos of preemption,
     executor, async scheduling, prefill chunking,

diff --git a/tests/v1/e2e/test_spec_decode.py b/tests/v1/e2e/test_spec_decode.py
index bf6638539fa3..c6327e603aef 100644
--- a/tests/v1/e2e/test_spec_decode.py
+++ b/tests/v1/e2e/test_spec_decode.py
@@ -9,7 +9,13 @@
 import torch
 
 from tests.evals.gsm8k.gsm8k_eval import _build_gsm8k_prompts, evaluate_gsm8k_offline
-from tests.utils import get_attn_backend_list_based_on_platform, large_gpu_mark
+from tests.utils import (
+    get_attn_backend_list_based_on_platform,
+    large_gpu_mark,
+    multi_gpu_marks,
+    multi_gpu_only,
+    single_gpu_only,
+)
 from vllm import LLM, SamplingParams
 from vllm.assets.base import VLLM_S3_BUCKET_URL
 from vllm.assets.image import VLM_IMAGES_DIR
@@ -160,6 +166,8 @@ def reset_torch_dynamo():
         },
     ],
 )
+@single_gpu_only
+@large_gpu_mark(min_gb=20)
 def test_ngram_and_suffix_correctness(
     speculative_config: dict,
     model_name: str,
@@ -175,6 +183,8 @@ def test_ngram_and_suffix_correctness(
     cleanup_dist_env_and_memory()
 
 
+@single_gpu_only
+@large_gpu_mark(min_gb=20)
 def test_suffix_decoding_acceptance(
     monkeypatch: pytest.MonkeyPatch,
     sampling_config: SamplingParams,
@@ -242,6 +252,8 @@
     ],
     ids=["llama3_eagle3_speculator", "qwen3_eagle3_speculator"],
 )
+@single_gpu_only
+@large_gpu_mark(min_gb=24)
 def test_speculators_model_integration(
     monkeypatch: pytest.MonkeyPatch,
     sampling_config: SamplingParams,
@@ -319,6 +331,56 @@ def test_speculators_model_integration(
     )
 
 
+@single_gpu_only
+@pytest.mark.parametrize(
+    [
+        "model_setup",
+        "mm_enabled",
+        "enable_chunked_prefill",
+        "model_impl",
+        "expected_accuracy_threshold",
+    ],
+    [
+        (
+            (
+                "eagle",
+                "eagle618/deepseek-v3-random",
+                "eagle618/eagle-deepseek-v3-random",
+                1,
+            ),
+            False,
+            False,
+            "auto",
+            0.0,
+        ),
+    ],
+    ids=["deepseek_eagle"],
+)
+@pytest.mark.parametrize("attn_backend", get_attn_backend_list_based_on_platform())
+def test_eagle_correctness_light(
+    monkeypatch: pytest.MonkeyPatch,
+    sampling_config: SamplingParams,
+    model_setup: tuple[str, str, str, int],
+    mm_enabled: bool,
+    expected_accuracy_threshold: float,
+    enable_chunked_prefill: bool,
+    model_impl: str,
+    attn_backend: str,
+):
+    _run_eagle_correctness(
+        monkeypatch,
+        sampling_config,
+        model_setup,
+        mm_enabled,
+        expected_accuracy_threshold,
+        enable_chunked_prefill,
+        model_impl,
+        attn_backend,
+    )
+
+
+@single_gpu_only
+@large_gpu_mark(min_gb=24)
 @pytest.mark.parametrize(
     [
         "model_setup",
@@ -333,14 +395,14 @@ def test_speculators_model_integration(
             False,
             False,
             "auto",
-            0.8,  # ref: 90%
+            0.8,
         ),
         (
             ("eagle3", "Qwen/Qwen3-8B", "AngelSlim/Qwen3-8B_eagle3", 1),
             False,
             False,
             "transformers",
-            0.8,  # ref: 90%
+            0.8,
         ),
         pytest.param(
             (
@@ -367,35 +429,77 @@ def test_speculators_model_integration(
             False,
             False,
             "auto",
-            0.8,  # ref: 90%
+            0.8,
             marks=pytest.mark.skip(
                 reason="architecture of its eagle3 is LlamaForCausalLMEagle3"
             ),
         ),
         pytest.param(
             (
                 "eagle3",
                 "Qwen/Qwen2.5-VL-7B-Instruct",
                 "Rayzl/qwen2.5-vl-7b-eagle3-sgl",
                 1,
             ),
             False,
             False,
             "auto",
-            0.7,  # TODO, update this with a reference value when re-enabling this case
+            0.7,
             marks=pytest.mark.skip(
-                reason="Skipping due to its head_dim not being a a multiple of 32"
+                reason="Skipping due to its head_dim not being a multiple of 32"
             ),
         ),
-        pytest.param(
-            (
-                "eagle",
-                "meta-llama/Llama-3.1-8B-Instruct",
-                "yuhuili/EAGLE-LLaMA3.1-Instruct-8B",
-                1,
-            ),
-            False,
-            True,
-            "auto",
-            0.7,  # ref: 75%-80%
-            marks=large_gpu_mark(min_gb=40),
-        ),  # works on 4x H100
         (
             (
                 "eagle3",
                 "meta-llama/Llama-3.1-8B-Instruct",
                 "yuhuili/EAGLE3-LLaMA3.1-Instruct-8B",
                 1,
             ),
             False,
             False,
             "auto",
-            0.7,  # ref: 75%-80%
+            0.7,
         ),
+    ],
+    ids=[
+        "qwen3_eagle3",
+        "qwen3_eagle3-transformers",
+        "qwen3_vl_eagle3",
+        "qwen2_5_vl_eagle3",
+        "llama3_eagle3",
+    ],
+)
+@pytest.mark.parametrize("attn_backend", get_attn_backend_list_based_on_platform())
+def test_eagle_correctness_medium(
+    monkeypatch: pytest.MonkeyPatch,
+    sampling_config: SamplingParams,
+    model_setup: tuple[str, str, str, int],
+    mm_enabled: bool,
+    expected_accuracy_threshold: float,
+    enable_chunked_prefill: bool,
+    model_impl: str,
+    attn_backend: str,
+):
+    _run_eagle_correctness(
+        monkeypatch,
+        sampling_config,
+        model_setup,
+        mm_enabled,
+        expected_accuracy_threshold,
+        enable_chunked_prefill,
+        model_impl,
+        attn_backend,
+    )
+
+
+@pytest.mark.parametrize(
+    [
+        "model_setup",
+        "mm_enabled",
+        "enable_chunked_prefill",
+        "model_impl",
+        "expected_accuracy_threshold",
+    ],
+    [
+        pytest.param(
+            (
+                "eagle",
+                "meta-llama/Llama-3.1-8B-Instruct",
+                "yuhuili/EAGLE-LLaMA3.1-Instruct-8B",
+                1,
+            ),
+            False,
+            True,
+            "auto",
+            0.7,
+            marks=large_gpu_mark(min_gb=40),
+            id="llama3_eagle",
+        ),
         pytest.param(
             (
@@ -407,9 +511,10 @@
             False,
             False,
             "auto",
-            0.8,  # ref: 90%
-            # marks=large_gpu_mark(min_gb=80),
-        ),  # works on 4x H100
+            0.8,
+            marks=multi_gpu_marks(num_gpus=4),
+            id="llama4_eagle",
+        ),
         pytest.param(
             (
                 "eagle",
@@ -420,36 +525,36 @@
             True,
             True,
             "auto",
-            0.8,  # ref: 90%
-            marks=large_gpu_mark(min_gb=80),
-        ),  # works on 4x H100
-        (
-            (
-                "eagle",
-                "eagle618/deepseek-v3-random",
-                "eagle618/eagle-deepseek-v3-random",
-                1,
-            ),
-            False,
-            False,
-            "auto",
-            0.0,  # dummy model, skip gsm8k check
+            0.8,
+            marks=[*multi_gpu_marks(num_gpus=4), large_gpu_mark(min_gb=80)],
+            id="llama4_eagle_mm",
         ),
     ],
-    ids=[
-        "qwen3_eagle3",
-        "qwen3_eagle3-transformers",
-        "qwen3_vl_eagle3",
-        "qwen2_5_vl_eagle3",
-        "llama3_eagle",
-        "llama3_eagle3",
-        "llama4_eagle",
-        "llama4_eagle_mm",
-        "deepseek_eagle",
-    ],
 )
 @pytest.mark.parametrize("attn_backend", get_attn_backend_list_based_on_platform())
-def test_eagle_correctness(
+def test_eagle_correctness_heavy(
+    monkeypatch: pytest.MonkeyPatch,
+    sampling_config: SamplingParams,
+    model_setup: tuple[str, str, str, int],
+    mm_enabled: bool,
+    expected_accuracy_threshold: float,
+    enable_chunked_prefill: bool,
+    model_impl: str,
+    attn_backend: str,
+):
+    _run_eagle_correctness(
+        monkeypatch,
+        sampling_config,
+        model_setup,
+        mm_enabled,
+        expected_accuracy_threshold,
+        enable_chunked_prefill,
+        model_impl,
+        attn_backend,
+    )
+
+
+def _run_eagle_correctness(
     monkeypatch: pytest.MonkeyPatch,
     sampling_config: SamplingParams,
@@ -460,14 +565,10 @@
 ):
     """
-    Compare the outputs of a original LLM and a speculative LLM
-    which should be the same when using eagle speculative decoding. Due to some variance
-    in the engine, it is possible for some outputs to differ, so we expect that at least
-    6/10 output tokens match exactly, and that the GSM8k accuracy is above
-    a precomputed reference threshold for each model.
+    Compare the outputs of an original LLM and a speculative LLM
+    which should be the same when using eagle speculative decoding.
     """
     if attn_backend == "TREE_ATTN":
-        # TODO: Fix this flaky test
         pytest.skip(
             "TREE_ATTN is flaky in the test disable for now until it can be "
             "resolved (see https://github.com/vllm-project/vllm/issues/22922)"
         )
@@ -484,12 +585,8 @@
             f"transformers>={required}, but got {installed}"
         )
 
-    # Generate test prompts inside the function instead of using fixture
     test_prompts = get_test_prompts(mm_enabled)
-    # Determine attention config
-    # Scout requires default backend selection because vision encoder has
-    # head_dim 88 being incompatible with FLASH_ATTN and needs to fall back
-    # to FLEX_ATTENTION
+
     if "Llama-4-Scout" in model_setup[1] and attn_backend == "FLASH_ATTN":
         if current_platform.is_rocm():
             print(
@@ -569,8 +666,6 @@
             print(f"ref_output: {ref_output.outputs[0].text}")
             print(f"spec_output: {spec_output.outputs[0].text}")
 
-        # Heuristic: expect at least 60% of the prompts to match exactly
-        # Upon failure, inspect the outputs to check for inaccuracy.
         assert matches > int(0.6 * len(ref_outputs))
         del spec_llm
         torch.cuda.empty_cache()
@@ -585,6 +680,8 @@
     ],
     ids=["mimo", "deepseek"],
 )
+@single_gpu_only
+@large_gpu_mark(min_gb=20)
 def test_mtp_correctness(
     monkeypatch: pytest.MonkeyPatch,
     sampling_config: SamplingParams,
@@ -700,11 +797,13 @@ class ArgsTest:
 
 @pytest.mark.parametrize("args", cases)
 @pytest.mark.parametrize("enforce_eager", [True, False])
+@single_gpu_only
 def test_draft_model_correctness(args: ArgsTest, enforce_eager: bool):
     args.enforce_eager = enforce_eager
     assert_draft_model_correctness(args)
 
 
+@single_gpu_only
 def test_draft_model_realistic_example():
     args = ArgsTest(
         target_model="Qwen/Qwen3-1.7B",
@@ -719,6 +818,7 @@ def test_draft_model_realistic_example():
     assert_draft_model_correctness(args)
 
 
+@single_gpu_only
 def test_draft_model_parallel_drafting():
     args = ArgsTest(
         target_model="Qwen/Qwen3-1.7B",
@@ -744,6 +844,7 @@ def test_draft_model_parallel_drafting():
     ids=["target_quantized", "draft_quantized"],
 )
 @pytest.mark.parametrize("enforce_eager", [True, False])
+@single_gpu_only
 def test_draft_model_quantization(models: tuple[str, str], enforce_eager: bool):
     tgt_model, draft_model = models
     sd_case = ArgsTest(
@@ -755,6 +856,7 @@ def test_draft_model_quantization(models: tuple[str, str], enforce_eager: bool):
     assert_draft_model_correctness(sd_case)
 
 
+@multi_gpu_only(num_gpus=2)
 def test_draft_model_tensor_parallelism():
     """Ensure spec decode works when running with TP > 1."""
     _skip_if_insufficient_gpus_for_tp(2)
@@ -770,6 +872,7 @@ def test_draft_model_tensor_parallelism():
     assert_draft_model_correctness(sd_case)
 
 
+@multi_gpu_only(num_gpus=2)
 def test_draft_model_engine_args_tensor_parallelism():
     """Ensure the vllm_config for the draft model is created correctly,
     and independently of the target model (quantization, TP, etc.)"""

From 757cebb3fdaa5a3b2e367472c88b3f49c53b423f Mon Sep 17 00:00:00 2001
From: Andreas Karatzas
Date: Wed, 25 Feb 2026 12:11:24 -0600
Subject: [PATCH 4/4] Moved the runner before the workload definitions

Signed-off-by: Andreas Karatzas
---
 tests/v1/e2e/test_spec_decode.py | 236 +++++++++++++++++----------------
 1 file changed, 118 insertions(+), 118 deletions(-)

diff --git a/tests/v1/e2e/test_spec_decode.py b/tests/v1/e2e/test_spec_decode.py
index c6327e603aef..7f2db19a0750 100644
--- a/tests/v1/e2e/test_spec_decode.py
+++ b/tests/v1/e2e/test_spec_decode.py
@@ -331,6 +331,124 @@ def test_speculators_model_integration(
     )
 
 
+def _run_eagle_correctness(
+    monkeypatch: pytest.MonkeyPatch,
+    sampling_config: SamplingParams,
+    model_setup: tuple[str, str, str, int],
+    mm_enabled: bool,
+    expected_accuracy_threshold: float,
+    enable_chunked_prefill: bool,
+    model_impl: str,
+    attn_backend: str,
+):
+    """
+    Compare the outputs of an original LLM and a speculative LLM
+    which should be the same when using eagle speculative decoding.
+    """
+    if attn_backend == "TREE_ATTN":
+        pytest.skip(
+            "TREE_ATTN is flaky in the test disable for now until it can be "
+            "resolved (see https://github.com/vllm-project/vllm/issues/22922)"
+        )
+    if model_impl == "transformers":
+        import transformers
+        from packaging.version import Version
+
+        installed = Version(transformers.__version__)
+        required = Version("5.0.0")
+        if installed < required:
+            pytest.skip(
+                "Eagle3 with the Transformers modeling backend requires "
+                f"transformers>={required}, but got {installed}"
+            )
+
+    test_prompts = get_test_prompts(mm_enabled)
+
+    if "Llama-4-Scout" in model_setup[1] and attn_backend == "FLASH_ATTN":
+        if current_platform.is_rocm():
+            print(
+                "FLASH_ATTN for spec_decode not supported on "
+                "ROCm currently. Changing to FLEX_ATTENTION backend."
+            )
+            attention_config = {"backend": "FLEX_ATTENTION"}
+        else:
+            attention_config = None
+    else:
+        attention_config = {"backend": attn_backend}
+
+    if attn_backend == "TRITON_ATTN" and not current_platform.is_rocm():
+        pytest.skip(
+            "TRITON_ATTN does not support "
+            "multi-token eagle spec decode on current platform"
+        )
+
+    with monkeypatch.context() as m:
+        m.setenv("VLLM_MLA_DISABLE", "1")
+
+        if attn_backend == "ROCM_AITER_FA" and current_platform.is_rocm():
+            if "deepseek" in model_setup[1].lower():
+                m.setenv("VLLM_ROCM_USE_AITER", "1")
+                m.delenv("VLLM_MLA_DISABLE", raising=False)
+                attention_config = {"backend": "TRITON_MLA"}
+            else:
+                m.setenv("VLLM_ROCM_USE_AITER", "1")
+
+        method, model_name, spec_model_name, tp_size = model_setup
+        _skip_if_insufficient_gpus_for_tp(tp_size)
+
+        max_model_len = 2048
+        max_num_batched_tokens = 128 if enable_chunked_prefill else max_model_len
+
+        ref_llm = LLM(
+            model=model_name,
+            max_model_len=max_model_len,
+            tensor_parallel_size=tp_size,
+            attention_config=attention_config,
+        )
+        evaluate_llm_for_gsm8k(
+            ref_llm, expected_accuracy_threshold=expected_accuracy_threshold
+        )
+        ref_outputs = ref_llm.chat(test_prompts, sampling_config)
+        del ref_llm
+        torch.cuda.empty_cache()
+        cleanup_dist_env_and_memory()
+
+        spec_llm = LLM(
+            model=model_name,
+            trust_remote_code=True,
+            tensor_parallel_size=tp_size,
+            speculative_config={
+                "method": method,
+                "model": spec_model_name,
+                "num_speculative_tokens": 3,
+                "max_model_len": max_model_len,
+            },
+            max_model_len=max_model_len,
+            max_num_batched_tokens=max_num_batched_tokens,
+            enable_chunked_prefill=enable_chunked_prefill,
+            model_impl=model_impl,
+            attention_config=attention_config,
+        )
+        evaluate_llm_for_gsm8k(
+            spec_llm, expected_accuracy_threshold=expected_accuracy_threshold
+        )
+        spec_outputs = spec_llm.chat(test_prompts, sampling_config)
+        matches = 0
+        misses = 0
+        for ref_output, spec_output in zip(ref_outputs, spec_outputs):
+            if ref_output.outputs[0].text == spec_output.outputs[0].text:
+                matches += 1
+            else:
+                misses += 1
+                print(f"ref_output: {ref_output.outputs[0].text}")
+                print(f"spec_output: {spec_output.outputs[0].text}")
+
+        assert matches > int(0.6 * len(ref_outputs))
+        del spec_llm
+        torch.cuda.empty_cache()
+        cleanup_dist_env_and_memory()
+
+
 @single_gpu_only
 @pytest.mark.parametrize(
     [
@@ -554,124 +672,6 @@ def test_eagle_correctness_heavy(
     )
 
 
-def _run_eagle_correctness(
-    monkeypatch: pytest.MonkeyPatch,
-    sampling_config: SamplingParams,
-    model_setup: tuple[str, str, str, int],
-    mm_enabled: bool,
-    expected_accuracy_threshold: float,
-    enable_chunked_prefill: bool,
-    model_impl: str,
-    attn_backend: str,
-):
-    """
-    Compare the outputs of an original LLM and a speculative LLM
-    which should be the same when using eagle speculative decoding.
-    """
-    if attn_backend == "TREE_ATTN":
-        pytest.skip(
-            "TREE_ATTN is flaky in the test disable for now until it can be "
-            "resolved (see https://github.com/vllm-project/vllm/issues/22922)"
-        )
-    if model_impl == "transformers":
-        import transformers
-        from packaging.version import Version
-
-        installed = Version(transformers.__version__)
-        required = Version("5.0.0")
-        if installed < required:
-            pytest.skip(
-                "Eagle3 with the Transformers modeling backend requires "
-                f"transformers>={required}, but got {installed}"
-            )
-
-    test_prompts = get_test_prompts(mm_enabled)
-
-    if "Llama-4-Scout" in model_setup[1] and attn_backend == "FLASH_ATTN":
-        if current_platform.is_rocm():
-            print(
-                "FLASH_ATTN for spec_decode not supported on "
-                "ROCm currently. Changing to FLEX_ATTENTION backend."
-            )
-            attention_config = {"backend": "FLEX_ATTENTION"}
-        else:
-            attention_config = None
-    else:
-        attention_config = {"backend": attn_backend}
-
-    if attn_backend == "TRITON_ATTN" and not current_platform.is_rocm():
-        pytest.skip(
-            "TRITON_ATTN does not support "
-            "multi-token eagle spec decode on current platform"
-        )
-
-    with monkeypatch.context() as m:
-        m.setenv("VLLM_MLA_DISABLE", "1")
-
-        if attn_backend == "ROCM_AITER_FA" and current_platform.is_rocm():
-            if "deepseek" in model_setup[1].lower():
-                m.setenv("VLLM_ROCM_USE_AITER", "1")
-                m.delenv("VLLM_MLA_DISABLE", raising=False)
-                attention_config = {"backend": "TRITON_MLA"}
-            else:
-                m.setenv("VLLM_ROCM_USE_AITER", "1")
-
-        method, model_name, spec_model_name, tp_size = model_setup
-        _skip_if_insufficient_gpus_for_tp(tp_size)
-
-        max_model_len = 2048
-        max_num_batched_tokens = 128 if enable_chunked_prefill else max_model_len
-
-        ref_llm = LLM(
-            model=model_name,
-            max_model_len=max_model_len,
-            tensor_parallel_size=tp_size,
-            attention_config=attention_config,
-        )
-        evaluate_llm_for_gsm8k(
-            ref_llm, expected_accuracy_threshold=expected_accuracy_threshold
-        )
-        ref_outputs = ref_llm.chat(test_prompts, sampling_config)
-        del ref_llm
-        torch.cuda.empty_cache()
-        cleanup_dist_env_and_memory()
-
-        spec_llm = LLM(
-            model=model_name,
-            trust_remote_code=True,
-            tensor_parallel_size=tp_size,
-            speculative_config={
-                "method": method,
-                "model": spec_model_name,
-                "num_speculative_tokens": 3,
-                "max_model_len": max_model_len,
-            },
-            max_model_len=max_model_len,
-            max_num_batched_tokens=max_num_batched_tokens,
-            enable_chunked_prefill=enable_chunked_prefill,
-            model_impl=model_impl,
-            attention_config=attention_config,
-        )
-        evaluate_llm_for_gsm8k(
-            spec_llm, expected_accuracy_threshold=expected_accuracy_threshold
-        )
-        spec_outputs = spec_llm.chat(test_prompts, sampling_config)
-        matches = 0
-        misses = 0
-        for ref_output, spec_output in zip(ref_outputs, spec_outputs):
-            if ref_output.outputs[0].text == spec_output.outputs[0].text:
-                matches += 1
-            else:
-                misses += 1
-                print(f"ref_output: {ref_output.outputs[0].text}")
-                print(f"spec_output: {spec_output.outputs[0].text}")
-
-        assert matches > int(0.6 * len(ref_outputs))
-        del spec_llm
-        torch.cuda.empty_cache()
-        cleanup_dist_env_and_memory()
-
-
 @pytest.mark.parametrize(
     ["model_setup", "mm_enabled", "expected_accuracy_threshold"],
     [