From 64c9111b72938a99dcacd69dc04fe40dc1f3fc0a Mon Sep 17 00:00:00 2001 From: Andreas Karatzas Date: Sun, 8 Mar 2026 22:54:48 -0500 Subject: [PATCH 1/2] [ROCm][CI] Retrying in case of batch variance effects and reducing flakiness Signed-off-by: Andreas Karatzas --- tests/v1/e2e/test_async_scheduling.py | 14 ++++++++++++-- 1 file changed, 12 insertions(+), 2 deletions(-) diff --git a/tests/v1/e2e/test_async_scheduling.py b/tests/v1/e2e/test_async_scheduling.py index c703d6aae9f9..0afd9c904fdd 100644 --- a/tests/v1/e2e/test_async_scheduling.py +++ b/tests/v1/e2e/test_async_scheduling.py @@ -6,7 +6,11 @@ import pytest import torch._dynamo.config as dynamo_config -from tests.utils import large_gpu_mark, single_gpu_only +from tests.utils import ( + ROCM_ENV_OVERRIDES, + large_gpu_mark, + single_gpu_only, +) from vllm import SamplingParams from vllm.logprobs import Logprob from vllm.platforms import current_platform @@ -154,6 +158,7 @@ def test_with_eagle3_spec_decoding(sample_json_schema, monkeypatch: pytest.Monke ) +@pytest.mark.flaky(reruns=2, only_on=current_platform.is_rocm()) def test_with_ngram_gpu_spec_decoding(monkeypatch: pytest.MonkeyPatch): """Test ngram_gpu speculative decoding with different configurations. @@ -207,7 +212,11 @@ def run_tests( with monkeypatch.context() as m: # lock matmul precision to full FP32 (IEEE) m.setenv("VLLM_FLOAT32_MATMUL_PRECISION", "highest") - # m.setenv("VLLM_BATCH_INVARIANT", "1") + # GFX950: On ROCm, disable skinny GEMM to avoid non-deterministic + # results from atomic reductions in wvSplitKrc kernel. + for key, value in ROCM_ENV_OVERRIDES.items(): + m.setenv(key, value) + outputs: list[tuple[str, list, list]] = [] for n, ( test_preemption, @@ -342,6 +351,7 @@ def run_test( speculative_config=spec_config, disable_log_stats=False, attention_config=attention_config, + enable_prefix_caching=False if current_platform.is_rocm() else None, **cache_arg, ) as vllm_model: results = [] From cd8f72922ea2867ce4cb94c7c3a7b3d38e455b8b Mon Sep 17 00:00:00 2001 From: Andreas Karatzas Date: Sun, 15 Mar 2026 14:53:50 -0500 Subject: [PATCH 2/2] Removed env override for ROCm Signed-off-by: Andreas Karatzas --- tests/v1/e2e/general/test_async_scheduling.py | 6 ------ 1 file changed, 6 deletions(-) diff --git a/tests/v1/e2e/general/test_async_scheduling.py b/tests/v1/e2e/general/test_async_scheduling.py index 9c93a4872c84..8e1eddb0f64e 100644 --- a/tests/v1/e2e/general/test_async_scheduling.py +++ b/tests/v1/e2e/general/test_async_scheduling.py @@ -8,7 +8,6 @@ import torch._dynamo.config as dynamo_config from tests.utils import ( - ROCM_ENV_OVERRIDES, large_gpu_mark, single_gpu_only, ) @@ -207,11 +206,6 @@ def run_tests( with monkeypatch.context() as m: # lock matmul precision to full FP32 (IEEE) m.setenv("VLLM_FLOAT32_MATMUL_PRECISION", "highest") - # GFX950: On ROCm, disable skinny GEMM to avoid non-deterministic - # results from atomic reductions in wvSplitKrc kernel. - for key, value in ROCM_ENV_OVERRIDES.items(): - m.setenv(key, value) - outputs: list[tuple[str, list, list]] = [] for n, ( test_preemption,