From 64c9111b72938a99dcacd69dc04fe40dc1f3fc0a Mon Sep 17 00:00:00 2001
From: Andreas Karatzas <akaratza@amd.com>
Date: Sun, 8 Mar 2026 22:54:48 -0500
Subject: [PATCH 1/2] [ROCm][CI] Retry on batch-variance effects to reduce
 flakiness

Signed-off-by: Andreas Karatzas <akaratza@amd.com>
---
 tests/v1/e2e/test_async_scheduling.py | 14 ++++++++++++--
 1 file changed, 12 insertions(+), 2 deletions(-)

diff --git a/tests/v1/e2e/test_async_scheduling.py b/tests/v1/e2e/test_async_scheduling.py
index c703d6aae9f9..0afd9c904fdd 100644
--- a/tests/v1/e2e/test_async_scheduling.py
+++ b/tests/v1/e2e/test_async_scheduling.py
@@ -6,7 +6,11 @@
 import pytest
 import torch._dynamo.config as dynamo_config
 
-from tests.utils import large_gpu_mark, single_gpu_only
+from tests.utils import (
+    ROCM_ENV_OVERRIDES,
+    large_gpu_mark,
+    single_gpu_only,
+)
 from vllm import SamplingParams
 from vllm.logprobs import Logprob
 from vllm.platforms import current_platform
@@ -154,6 +158,7 @@ def test_with_eagle3_spec_decoding(sample_json_schema, monkeypatch: pytest.Monke
     )
 
 
+@pytest.mark.flaky(reruns=2, only_on=current_platform.is_rocm())
 def test_with_ngram_gpu_spec_decoding(monkeypatch: pytest.MonkeyPatch):
     """Test ngram_gpu speculative decoding with different configurations.
 
@@ -207,7 +212,11 @@ def run_tests(
     with monkeypatch.context() as m:
         # lock matmul precision to full FP32 (IEEE)
         m.setenv("VLLM_FLOAT32_MATMUL_PRECISION", "highest")
-        # m.setenv("VLLM_BATCH_INVARIANT", "1")
+        # GFX950: On ROCm, disable skinny GEMM to avoid non-deterministic
+        # results from atomic reductions in wvSplitKrc kernel.
+        for key, value in ROCM_ENV_OVERRIDES.items():
+            m.setenv(key, value)
+
         outputs: list[tuple[str, list, list]] = []
         for n, (
             test_preemption,
@@ -342,6 +351,7 @@ def run_test(
         speculative_config=spec_config,
         disable_log_stats=False,
         attention_config=attention_config,
+        enable_prefix_caching=False if current_platform.is_rocm() else None,
         **cache_arg,
     ) as vllm_model:
         results = []

From cd8f72922ea2867ce4cb94c7c3a7b3d38e455b8b Mon Sep 17 00:00:00 2001
From: Andreas Karatzas <akaratza@amd.com>
Date: Sun, 15 Mar 2026 14:53:50 -0500
Subject: [PATCH 2/2] Removed env override for ROCm

Signed-off-by: Andreas Karatzas <akaratza@amd.com>
---
 tests/v1/e2e/general/test_async_scheduling.py | 6 ------
 1 file changed, 6 deletions(-)

diff --git a/tests/v1/e2e/general/test_async_scheduling.py b/tests/v1/e2e/general/test_async_scheduling.py
index 9c93a4872c84..8e1eddb0f64e 100644
--- a/tests/v1/e2e/general/test_async_scheduling.py
+++ b/tests/v1/e2e/general/test_async_scheduling.py
@@ -8,7 +8,6 @@
 import torch._dynamo.config as dynamo_config
 
 from tests.utils import (
-    ROCM_ENV_OVERRIDES,
     large_gpu_mark,
     single_gpu_only,
 )
@@ -207,11 +206,6 @@ def run_tests(
     with monkeypatch.context() as m:
         # lock matmul precision to full FP32 (IEEE)
         m.setenv("VLLM_FLOAT32_MATMUL_PRECISION", "highest")
-        # GFX950: On ROCm, disable skinny GEMM to avoid non-deterministic
-        # results from atomic reductions in wvSplitKrc kernel.
-        for key, value in ROCM_ENV_OVERRIDES.items():
-            m.setenv(key, value)
-
         outputs: list[tuple[str, list, list]] = []
         for n, (
             test_preemption,