vllm-project · divakar-amd · Nov 23, 2025
@@ -16,3 +16,4 @@ runai-model-streamer[s3,gcs]==0.15.0
 conch-triton-kernels==1.2.1
 timm>=1.0.17
 fastsafetensors @ git+https://github.com/foundation-model-stack/fastsafetensors.git@d6f998a03432b2452f8de2bb5cefb5af9795d459
+xgrammar @ git+https://github.com/mlc-ai/xgrammar.git@eafd4db51b78acc64b3f0764ef27dfd206c28628
diff --git a/tests/v1/e2e/test_async_scheduling.py b/tests/v1/e2e/test_async_scheduling.py
@@ -8,6 +8,7 @@
 
 from vllm import SamplingParams
 from vllm.logprobs import Logprob
+from vllm.platforms import current_platform
 from vllm.sampling_params import StructuredOutputsParams
 from vllm.v1.metrics.reader import Metric
 
@@ -117,8 +118,10 @@ def run_tests(
     uni/multiproc executor with spec decoding."""
 
     with monkeypatch.context() as m:
-        # avoid precision errors
-        m.setenv("VLLM_ATTENTION_BACKEND", "FLEX_ATTENTION")
+        # Currently ROCm doesn't support FLEX_ATTENTION for spec decoding
+        if not current_platform.is_rocm():
+            # avoid precision errors
+            m.setenv("VLLM_ATTENTION_BACKEND", "FLEX_ATTENTION")
         # m.setenv("VLLM_BATCH_INVARIANT", "1")
         outputs: list[tuple[str, list, list]] = []
         for n, (