vllm-project · tzhouam · May 2, 2026 · May 2, 2026 · May 3, 2026 · May 3, 2026
@@ -58,10 +58,12 @@ steps:
             - "/fsx/hf_cache:/fsx/hf_cache"
 
   - label: "Diffusion Model CPU offloading Test"
-    timeout_in_minutes: 20
+    timeout_in_minutes: 30
     depends_on: upload-merge-pipeline
     commands:
       # Single pytest session for one combined summary at end of log.
+      # Timeout 30 min: the two files (cpu_offload + layerwise_offload) take
+      # ~10-12 min each with model download; main consistently needs ~18-20 min.
       - pytest -s -v tests/e2e/offline_inference/test_diffusion_cpu_offload.py tests/e2e/offline_inference/test_diffusion_layerwise_offload.py
     agents:
       queue: "gpu_1_queue" # g6.4xlarge instance on AWS, has 1 L4 GPU

@@ -1,5 +1,5 @@
 ARG VLLM_BASE_IMAGE=vllm/vllm-openai
-ARG VLLM_BASE_TAG=v0.20.0
+ARG VLLM_BASE_TAG=v0.20.0-cu130
 FROM ${VLLM_BASE_IMAGE}:${VLLM_BASE_TAG}
 ARG APP_DIR=/workspace/vllm-omni
 WORKDIR ${APP_DIR}
@@ -11,8 +11,35 @@ RUN apt-get update && \
     apt-get clean && \
     rm -rf /var/lib/apt/lists/*
 
+
+# Install vLLM (CUDA 13.0 build) from the precompiled wheel for the pinned commit.
+# A direct URL is required: the wheel carries a PEP 440 local version identifier
+# (e.g. +gaeee7ef93.cu130), and pip/uv refuse to install such wheels from a PEP 503 index.
+# The cu130 index page lives at /<commit>/cu130/vllm/, while the wheel files themselves
+# sit at the top level /<commit>/<wheel>, so the href is resolved against the index URL.
+ENV VLLM_PRECOMPILED_WHEEL_COMMIT=1acd67a795ebccdf9b9db7697ae9082058301657
+RUN VLLM_WHEEL_URL=$(python3 -c "import urllib.request,re; \
+    from urllib.parse import urljoin, quote; \
+    index_url='https://wheels.vllm.ai/${VLLM_PRECOMPILED_WHEEL_COMMIT}/cu130/vllm/'; \
+    html=urllib.request.urlopen(index_url).read().decode(); \
+    m=re.search(r'href=\"([^\"]*x86_64\.whl)\"',html); \
+    assert m, f'No x86_64 vLLM wheel found at {index_url}'; \
+    print(quote(urljoin(index_url,m.group(1)), safe=':/+%'))") && \
+    echo "Installing vLLM from: ${VLLM_WHEEL_URL}" && \
+    uv pip install --system --force-reinstall "${VLLM_WHEEL_URL}"
+
+
+
 RUN uv pip install --system ".[dev]"
 
+RUN uv pip install --system --upgrade \
+        "flashinfer-python==0.6.7" \
+        "flashinfer-cubin==0.6.7" \
+        "numpy==2.2.6"
+
+RUN uv pip install --system --upgrade \
+    "flashinfer-jit-cache==0.6.7" \
+    --index-url https://flashinfer.ai/whl/cu130
 RUN ln -sf /usr/bin/python3 /usr/bin/python
 
 ENTRYPOINT []
@@ -1,6 +1,7 @@
 import json
 import os
 import subprocess
+import sys
 import textwrap
 from pathlib import Path
 
@@ -88,7 +89,9 @@ def _write_calls():
     env["VLLM_OMNI_TEST_POST_CALLS_FILE"] = str(calls_path)
 
     cmd = [
-        "vllm",
+        sys.executable,
+        "-m",
+        "vllm_omni.entrypoints.cli.main",
         "bench",
         "serve",
         "--omni",

@@ -485,6 +485,7 @@ def test_generation_scheduler_calls_cleanup_on_finished(monkeypatch, mocker: Moc
         kv_connector_output=None,
         cudagraph_stats=None,
         req_id_to_index={"req-s1": 0},
+        routed_experts_dict=None,
     )
 
     OmniGenerationScheduler.update_from_output(scheduler, scheduler_output, model_runner_output)
@@ -570,6 +571,7 @@ def test_ar_scheduler_defers_cleanup_and_queues_save_on_finished(mocker: MockerF
         cudagraph_stats=None,
         req_id_to_index={"req-ar": 0},
         kv_extracted_req_ids=None,
+        routed_experts_dict=None,
     )
 
     OmniARScheduler.update_from_output(scheduler, scheduler_output, model_runner_output)

@@ -25,7 +25,11 @@
 )
 
 # All tests in this file require Mooncake TransferEngine and an RDMA environment.
-pytestmark = [pytest.mark.parallel, pytest.mark.core_model]
+pytestmark = [
+    pytest.mark.parallel,
+    pytest.mark.core_model,
+    pytest.mark.skipif(TransferEngine is None, reason="Mooncake TransferEngine not installed"),
+]
 
 # ---------------------------------------------------------------------------
 # Shared helpers

@@ -21,6 +21,36 @@
 
 MODELS = {**AUDIO_MODEL, **IMAGE_VIDEO_MODELS}
 
+_GATED_MODELS = {"stabilityai/stable-audio-open-1.0"}
+
+
+def _skip_if_gated_repo_inaccessible(repo_id: str) -> None:
+    """Skip the calling test if HF repo ``repo_id`` is gated or missing.
+
+    Probes real file access by downloading ``config.json`` with
+    ``hf_hub_download`` (unlike ``HfApi().model_info()``, which only checks
+    metadata): ``GatedRepoError`` and ``RepositoryNotFoundError`` become
+    ``pytest.skip``.  Best-effort: if ``huggingface_hub`` fails to import or
+    the probe raises anything else, return silently and let the test run.
+    """
+    try:
+        from huggingface_hub import hf_hub_download
+        from huggingface_hub.errors import GatedRepoError, RepositoryNotFoundError
+    except Exception:  # hub client unavailable: nothing to probe with
+        return
+    try:
+        hf_hub_download(repo_id=repo_id, filename="config.json")
+    except GatedRepoError as exc:
+        pytest.skip(
+            f"Skipping: gated HF repo {repo_id!r} inaccessible to the current "
+            f"HF_TOKEN ({exc}). See docs/contributing/ci/hf_credentials.md."
+        )
+    except RepositoryNotFoundError as exc:
+        pytest.skip(f"Skipping: HF repo {repo_id!r} not found ({exc}).")
+    except Exception:  # any other failure is left for the test itself to surface
+        return
+
+
 AUDIO_MODEL_PARAMS = {
     "runner_params": {},
     "sampler_params": {},
@@ -94,10 +124,23 @@ def check_audio_determinism(audio1, audio2, atol=1e-2):
 @hardware_test(res={"cuda": "L4", "rocm": "MI325"})
 @pytest.mark.parametrize("model_name", list(MODELS.keys()))
 def test_cpu_offload_diffusion_model(model_name: str):
+    if model_name in _GATED_MODELS:
+        _skip_if_gated_repo_inaccessible(model_name)
     try:
         offload_peak_memory, output_offload = inference(model_name, offload=True)
         cleanup_dist_env_and_memory()
         no_offload_peak_memory, output_no_offload = inference(model_name, offload=False)
+    except ValueError as exc:
+        # omni_snapshot_download wraps GatedRepoError in a ValueError.
+        # If the pre-flight guard above did not catch it (e.g. partial
+        # HF_TOKEN where config.json is accessible but weight shards are
+        # blocked), skip instead of failing.
+        if "Access to model" in str(exc) and "is restricted" in str(exc):
+            pytest.skip(
+                f"Skipping: gated HF repo {model_name!r} inaccessible "
+                f"({exc}). See docs/contributing/ci/hf_credentials.md."
+            )
+        pytest.fail(f"Inference failed: {exc}")
     except Exception:
         pytest.fail("Inference failed")
     print(f"Offload peak memory: {offload_peak_memory} MB")

@@ -2,7 +2,11 @@
 import torch
 from vllm.distributed.parallel_state import cleanup_dist_env_and_memory
 
-from tests.e2e.offline_inference.test_diffusion_cpu_offload import check_audio_determinism
+from tests.e2e.offline_inference.test_diffusion_cpu_offload import (
+    _GATED_MODELS,
+    _skip_if_gated_repo_inaccessible,
+    check_audio_determinism,
+)
 from tests.helpers.env import DeviceMemoryMonitor
 from tests.helpers.runtime import OmniRunner
 from vllm_omni.inputs.data import OmniDiffusionSamplingParams
@@ -81,6 +85,8 @@ def test_layerwise_offload_diffusion_model(model_name: str):
     offloader keeps only a single transformer block on GPU at a time, with
     prefetching for compute-memory overlap.
     """
+    if model_name in _GATED_MODELS:
+        _skip_if_gated_repo_inaccessible(model_name)
     try:
         # Run without layerwise offloading (baseline)
         no_offload_peak_memory, output_no_offload = run_inference(model_name, layerwise_offload=False)
@@ -89,6 +95,17 @@ def test_layerwise_offload_diffusion_model(model_name: str):
         # Run with layerwise offloading (1 layer on device)
         layerwise_offload_peak_memory, output_offload = run_inference(model_name, layerwise_offload=True)
         cleanup_dist_env_and_memory()
+    except ValueError as exc:
+        # omni_snapshot_download wraps GatedRepoError in a ValueError.
+        # If the pre-flight guard above did not catch it (e.g. partial
+        # HF_TOKEN where config.json is accessible but weight shards are
+        # blocked), skip instead of failing.
+        if "Access to model" in str(exc) and "is restricted" in str(exc):
+            pytest.skip(
+                f"Skipping: gated HF repo {model_name!r} inaccessible "
+                f"({exc}). See docs/contributing/ci/hf_credentials.md."
+            )
+        pytest.fail(f"Inference failed: {exc}")
     except Exception:
         pytest.fail("Inference failed")
 

@@ -91,7 +91,8 @@ def _build_request(
 def _collect_audio(omni: Omni, request: dict) -> tuple[torch.Tensor, int]:
     """Run a single request and return (waveform, sample_rate)."""
     for stage_outputs in omni.generate(request, DEFAULT_SAMPLING):
-        for req_output in stage_outputs.request_output:
+        req_output = stage_outputs.request_output
+        if req_output is not None:
             mm = req_output.outputs[0].multimodal_output
             assert mm is not None, "Expected multimodal_output to be non-None"
             audio = mm.get("audio")
@@ -108,6 +109,7 @@ def omni_engine():
     return Omni(model=MODEL_NAME, stage_init_timeout=180)
 
 
+@pytest.mark.advanced_model
 @pytest.mark.omni
 @hardware_test(res={"cuda": "L4"})
 def test_moss_tts_nano_english(omni_engine, ref_audio_path):
@@ -120,6 +122,7 @@ def test_moss_tts_nano_english(omni_engine, ref_audio_path):
     assert not torch.all(audio == 0), "Audio should not be all-zeros (silence)"
 
 
+@pytest.mark.advanced_model
 @pytest.mark.omni
 @hardware_test(res={"cuda": "L4"})
 def test_moss_tts_nano_chinese(omni_engine, ref_audio_path):
@@ -132,6 +135,7 @@ def test_moss_tts_nano_chinese(omni_engine, ref_audio_path):
     assert not torch.all(audio == 0)
 
 
+@pytest.mark.advanced_model
 @pytest.mark.omni
 @hardware_test(res={"cuda": "L4"})
 def test_moss_tts_nano_deterministic(omni_engine, ref_audio_path):
@@ -144,6 +148,7 @@ def test_moss_tts_nano_deterministic(omni_engine, ref_audio_path):
     assert torch.allclose(audio1, audio2, atol=1e-4), "Waveforms should match with same seed"
 
 
+@pytest.mark.advanced_model
 @pytest.mark.omni
 @hardware_test(res={"cuda": "L4"})
 def test_moss_tts_nano_batch(omni_engine, ref_audio_path):
@@ -153,8 +158,10 @@ def test_moss_tts_nano_batch(omni_engine, ref_audio_path):
         _build_request("Second request.", ref_audio_path),
     ]
     results = []
-    for stage_outputs in omni_engine.generate(requests, [DEFAULT_SAMPLING] * 2):
-        for req_output in stage_outputs.request_output:
+    # Single-stage model (num_stages=1): one sampling param for all requests.
+    for stage_outputs in omni_engine.generate(requests, [DEFAULT_SAMPLING]):
+        req_output = stage_outputs.request_output
+        if req_output is not None:
             mm = req_output.outputs[0].multimodal_output
             assert mm is not None
             results.append(mm["audio"].cpu())

@@ -24,6 +24,36 @@
 
 pytestmark = [pytest.mark.full_model, pytest.mark.diffusion]
 
+_MODEL_REPO = "stabilityai/stable-audio-open-1.0"
+
+
+def _skip_if_gated_repo_inaccessible(repo_id: str) -> None:
+    """Skip the calling test if HF repo ``repo_id`` is gated or missing.
+
+    Probes real file access by downloading ``config.json`` with
+    ``hf_hub_download`` (unlike ``HfApi().model_info()``, which only checks
+    metadata): ``GatedRepoError`` and ``RepositoryNotFoundError`` become
+    ``pytest.skip``.  Best-effort: if ``huggingface_hub`` fails to import or
+    the probe raises anything else, return silently and let the test run.
+    """
+    try:
+        from huggingface_hub import hf_hub_download
+        from huggingface_hub.errors import GatedRepoError, RepositoryNotFoundError
+    except Exception:  # hub client unavailable: nothing to probe with
+        return
+    try:
+        hf_hub_download(repo_id=repo_id, filename="config.json")
+    except GatedRepoError as exc:
+        pytest.skip(
+            f"Skipping: gated HF repo {repo_id!r} inaccessible to the current "
+            f"HF_TOKEN ({exc}). See docs/contributing/ci/hf_credentials.md."
+        )
+    except RepositoryNotFoundError as exc:
+        pytest.skip(f"Skipping: HF repo {repo_id!r} not found ({exc}).")
+    except Exception:  # any other failure is left for the test itself to surface
+        return
+
+
 _SAMPLE_RATE = 44100
 _CLIP_DURATION_S = 2.0
 
@@ -77,6 +107,7 @@ def test_stable_audio_quantization_and_teacache() -> None:
 
     CI should provide ``HF_TOKEN`` if the checkpoint is gated.
     """
+    _skip_if_gated_repo_inaccessible(_MODEL_REPO)
     m = Omni(
         model="stabilityai/stable-audio-open-1.0",
         quantization="fp8",

@@ -42,24 +42,13 @@ def get_prompt(prompt_type="zh"):
 
 
 tts_server_params = [
-    pytest.param(
-        OmniServerParams(
-            model=MODEL,
-            stage_config_path=get_stage_config(),
-            server_args=["--trust-remote-code", "--disable-log-stats", "--no-async-chunk"],
-        ),
-        id="cosyvoice3",
-    )
-]
-
-tts_async_chunk_server_params = [
     pytest.param(
         OmniServerParams(
             model=MODEL,
             stage_config_path=get_stage_config(),
             server_args=["--trust-remote-code", "--disable-log-stats"],
         ),
-        id="cosyvoice3_async_chunk",
+        id="cosyvoice3",
     )
 ]
 
@@ -91,7 +80,7 @@ def test_voice_clone_zh_001(omni_server, openai_client) -> None:
 @pytest.mark.core_model
 @pytest.mark.omni
 @hardware_test(res={"cuda": "H100"}, num_cards=1)
-@pytest.mark.parametrize("omni_server", tts_async_chunk_server_params, indirect=True)
+@pytest.mark.parametrize("omni_server", tts_server_params, indirect=True)
 def test_voice_clone_zh_002(omni_server, openai_client) -> None:
     """
     Test voice cloning TTS with Chinese text via async_chunk streaming.

@@ -401,7 +401,7 @@ def children(self, recursive=True):
             assert recursive
             return [fake_child]
 
-    def fail_base_shutdown(self):
+    def fail_base_shutdown(self, **_kwargs):
         raise RuntimeError("base shutdown failed")
 
     monkeypatch.setattr(psutil, "Process", FakePsutilProcess)
@@ -433,7 +433,7 @@ def children(self, recursive=True):
 
     monkeypatch.setattr(psutil, "Process", FakePsutilProcess)
     monkeypatch.setattr(psutil, "wait_procs", lambda procs, timeout: ([], list(procs)))
-    monkeypatch.setattr(AsyncMPClient, "shutdown", lambda self: None)
+    monkeypatch.setattr(AsyncMPClient, "shutdown", lambda self, **kwargs: None)
 
     client = object.__new__(StageEngineCoreClient)
     client._proc = fake_proc

@@ -389,7 +389,7 @@ async def test_no_output_returns_error(self):
 
         async def empty_gen(*args, **kwargs):
             return
-            yield  # noqa: unreachable – makes this an async generator
+            yield  # unreachable – makes this an async generator
 
         engine.generate = MagicMock(side_effect=empty_gen)
         server = _make_server(engine_client=engine)
@@ -448,7 +448,7 @@ async def test_value_error_returns_error_response(self):
 
         async def gen_value_error(*args, **kwargs):
             raise ValueError("bad value")
-            yield  # noqa: unreachable
+            yield  # unreachable – makes this an async generator
 
         engine.generate = MagicMock(side_effect=gen_value_error)
         server = _make_server(engine_client=engine)
@@ -463,7 +463,7 @@ async def test_generic_exception_returns_error_response(self):
 
         async def gen_runtime_error(*args, **kwargs):
             raise RuntimeError("something went wrong")
-            yield  # noqa: unreachable
+            yield  # unreachable – makes this an async generator
 
         engine.generate = MagicMock(side_effect=gen_runtime_error)
         server = _make_server(engine_client=engine)

@@ -35,6 +35,7 @@ def _make_runner(multimodal_outputs):
     runner.device = torch.device("cpu")
     runner.supports_mm_inputs = False
     runner.speculative_config = None
+    runner.routed_experts_initialized = False
     return runner