vllm-project · Gaohan123 · Mar 21, 2026 · Mar 10, 2026 · Mar 10, 2026 · Mar 10, 2026
@@ -1,7 +1,7 @@
 steps:
 
 - label: "Diffusion Model Test"
-  timeout_in_minutes: 20
+  timeout_in_minutes: 30
   agent_pool: mi325_2
   depends_on: amd-build
   mirror_hardwares: [amdproduction]
@@ -11,7 +11,7 @@ steps:
     - pytest -s -v tests/e2e/offline_inference/test_t2i_model.py
 
 - label: "Diffusion Images API LoRA E2E"
-  timeout_in_minutes: 20
+  timeout_in_minutes: 30
   agent_pool: mi325_1
   depends_on: amd-build
   mirror_hardwares: [amdproduction]

@@ -17,7 +17,7 @@ steps:
             - "/fsx/hf_cache:/fsx/hf_cache"
 
   - label: "Diffusion Model Test"
-    timeout_in_minutes: 20
+    timeout_in_minutes: 30
     depends_on: upload-merge-pipeline
     commands:
       - pytest -s -v tests/e2e/offline_inference/test_t2i_model.py -m "advanced_model and diffusion" --run-level "advanced_model"
@@ -35,7 +35,7 @@ steps:
             - "/fsx/hf_cache:/fsx/hf_cache"
 
   - label: "Diffusion Images API LoRA E2E"
-    timeout_in_minutes: 20
+    timeout_in_minutes: 30
     depends_on: upload-merge-pipeline
     commands:
       - pytest -s -v tests/e2e/online_serving/test_images_generations_lora.py

@@ -36,7 +36,7 @@ steps:
   - label: "Diffusion Model Test"
     depends_on: upload-ready-pipeline
     commands:
-      - timeout 20m pytest -s -v tests/e2e/offline_inference/test_t2i_model.py -m "core_model and diffusion" --run-level "core_model"
+      - timeout 30m pytest -s -v tests/e2e/offline_inference/test_t2i_model.py -m "core_model and diffusion" --run-level "core_model"
     agents:
       queue: "gpu_1_queue"
     plugins:

@@ -11,10 +11,29 @@ RUN apt-get update && \
     apt-get clean && \
     rm -rf /var/lib/apt/lists/*
 
-# Install vllm-omni into the same uv-managed Python environment used by the base image.
-# Use bash -c so that $(python3 -c ...) is expanded inside the container.
-RUN uv pip install --system --no-cache-dir ".[dev]"
+RUN uv pip uninstall --system -y vllm || true
 
+# Install vLLM from precompiled wheel at the selected commit.
+# Must use direct URL because the wheel has a PEP 440 local version identifier
+# (e.g. +g0a0a1a198) which pip/uv refuse to install from a PEP 503 package index.
+ENV VLLM_PRECOMPILED_WHEEL_COMMIT=89138b21cc246ae944c741d5c399c148e2b770ab
+RUN VLLM_WHEEL_URL=$(python3 -c "import urllib.request,re; \
+    html=urllib.request.urlopen('https://wheels.vllm.ai/${VLLM_PRECOMPILED_WHEEL_COMMIT}/vllm/').read().decode(); \
+    m=re.search(r'>(\S+x86_64\.whl)<',html); \
+    print('https://wheels.vllm.ai/${VLLM_PRECOMPILED_WHEEL_COMMIT}/'+m.group(1).replace('+','%2B'))") && \
+    echo "Installing vLLM from: ${VLLM_WHEEL_URL}" && \
+    uv pip install --system --force-reinstall "${VLLM_WHEEL_URL}"
+
+RUN uv pip install --system ".[dev]"
+
+RUN uv pip install --system --upgrade \
+        "flashinfer-cubin==0.6.6" \
+        "nvidia-cublas-cu12==12.9.1.4" \
+        "numpy==2.2.6"
+
+RUN uv pip install --system --upgrade \
+    "flashinfer-jit-cache==0.6.6" \
+    --index-url https://flashinfer.ai/whl/cu129
 RUN ln -sf /usr/bin/python3 /usr/bin/python
 
 ENTRYPOINT []
@@ -68,18 +68,18 @@ We are keeping [issue #886](https://github.com/vllm-project/vllm-omni/issues/886
 You can also build vLLM-Omni from the latest main branch if you want to use the latest features or bug fixes. (But sometimes it will break for a while. You can check [issue #886](https://github.com/vllm-project/vllm-omni/issues/886) for the status of the latest commit of vLLM-Omni main branch on NPU.)
 
 ```bash
-# Pin vLLM version to 0.17.0
+# Pin vLLM version to 0.18.0
 cd /vllm-workspace/vllm
 git pull origin main
 git fetch origin --tags
-git checkout v0.17.0
+git checkout v0.18.0
 VLLM_TARGET_DEVICE=empty pip install -v -e .
 
 # Because vllm-ascend has not yet entered continuous development and has not been officially released, we need to pin it to a specific commit. Please note that this commit may change over time.
 cd /vllm-workspace/vllm-ascend
 git pull origin main
 git fetch origin --tags
-git checkout v0.17.0
+git checkout 1e05c4908f31737bc4eef865a9f351d030a77c9d
 pip install -v -e .
 
 # Install vLLM-Omni from the latest main branch

@@ -1910,6 +1910,7 @@ def generate_multimodal(
     def _cleanup_process(self):
         try:
             keywords = ["enginecore"]
+            matched = []
 
             for proc in psutil.process_iter(["pid", "name", "cmdline", "username"]):
                 try:
@@ -1922,16 +1923,32 @@ def _cleanup_process(self):
 
                     if is_process:
                         print(f"Found vllm process: PID={proc.pid}, cmd={cmdline[:100]}")
+                        matched.append(proc)
+                except (psutil.NoSuchProcess, psutil.AccessDenied):
+                    pass
 
-                        try:
-                            proc.terminate()
-                            time.sleep(2)
-                        except Exception:
-                            proc.kill()
+            for proc in matched:
+                try:
+                    proc.terminate()
+                except (psutil.NoSuchProcess, psutil.AccessDenied):
+                    pass
 
+            _, still_alive = psutil.wait_procs(matched, timeout=5)
+            for proc in still_alive:
+                try:
+                    proc.kill()
                 except (psutil.NoSuchProcess, psutil.AccessDenied):
                     pass
 
+            if still_alive:
+                _, stubborn = psutil.wait_procs(still_alive, timeout=3)
+                if stubborn:
+                    print(f"Warning: failed to kill residual vllm pids: {[p.pid for p in stubborn]}")
+                else:
+                    print(f"Force-killed residual vllm pids: {[p.pid for p in still_alive]}")
+            elif matched:
+                print(f"Terminated vllm pids: {[p.pid for p in matched]}")
+
         except Exception as e:
             print(f"Error in psutil vllm cleanup: {e}")
 

@@ -24,6 +24,7 @@
 # This test is specific to Z-Image LoRA behavior. Keep it focused on a single
 # model to reduce runtime and avoid extra downloads.
 models = ["Tongyi-MAI/Z-Image-Turbo"]
+DIFFUSION_INIT_TIMEOUT_S = 600
 
 
 @pytest.mark.parametrize("model_name", models)
@@ -76,7 +77,11 @@ def _write_zimage_lora(adapter_dir: Path) -> str:
         )
         return str(adapter_dir)
 
-    m = Omni(model=model_name)
+    m = Omni(
+        model=model_name,
+        stage_init_timeout=DIFFUSION_INIT_TIMEOUT_S,
+        init_timeout=DIFFUSION_INIT_TIMEOUT_S,
+    )
     try:
         # high resolution may cause OOM on L4
         height = 256

@@ -28,6 +28,7 @@
 os.environ["VLLM_WORKER_MULTIPROC_METHOD"] = "spawn"
 
 MODEL = "Tongyi-MAI/Z-Image-Turbo"
+DIFFUSION_INIT_TIMEOUT_S = 600
 
 
 PROMPT = "a photo of a cat sitting on a laptop keyboard"
@@ -37,7 +38,17 @@
 
 @pytest.fixture(scope="module")
 def omni_server():
-    with OmniServer(MODEL, ["--num-gpus", "1"]) as server:
+    with OmniServer(
+        MODEL,
+        [
+            "--num-gpus",
+            "1",
+            "--stage-init-timeout",
+            str(DIFFUSION_INIT_TIMEOUT_S),
+            "--init-timeout",
+            str(DIFFUSION_INIT_TIMEOUT_S),
+        ],
+    ) as server:
         yield server
 
 

@@ -45,7 +45,7 @@ stage_args:
       max_model_len: 16384
       max_num_batched_tokens: 16384
       max_num_seqs: 1
-      gpu_memory_utilization: 0.9
+      gpu_memory_utilization: 0.4
       skip_mm_profiling: true
       enforce_eager: true
       trust_remote_code: true
@@ -72,7 +72,7 @@ stage_args:
       model_arch: Qwen2_5OmniForConditionalGeneration
       worker_type: generation
       scheduler_cls: vllm_omni.core.sched.omni_generation_scheduler.OmniGenerationScheduler
-      gpu_memory_utilization: 0.9 #increase the gpu memory utilization to enable the test on H800
+      gpu_memory_utilization: 0.5  # lowered from 0.9; NOTE(review): confirm the test still fits on H800
       enforce_eager: true
       trust_remote_code: true
       enable_prefix_caching: false

@@ -123,6 +123,7 @@ def make_mock_model(hidden: int = 8):
     cfg.video_token_index = VIDEO_TOKEN_ID
     cfg.audio_token_index = AUDIO_TOKEN_ID
     model.config = cfg
+    model._has_oov_mm_tokens = False
 
     def fake_lm_embed(ids: torch.Tensor) -> torch.Tensor:
         # Use .clone() so the tensor is contiguous (expand() creates a strided
@@ -137,13 +138,12 @@ def fake_lm_embed(ids: torch.Tensor) -> torch.Tensor:
 
     model._embed_text_input_ids = lambda *a, **kw: SupportsMultiModal._embed_text_input_ids(model, *a, **kw)
 
-    def fake_super_embed(ids, mm_embs=None, *, is_multimodal=None, handle_oov_mm_token=False):
+    def fake_super_embed(ids, mm_embs=None, *, is_multimodal=None):
         return SupportsMultiModal.embed_input_ids(
             model,
             ids,
             mm_embs,
             is_multimodal=is_multimodal,
-            handle_oov_mm_token=handle_oov_mm_token,
         )
 
     model.embed_input_ids = lambda *a, **kw: Qwen2_5OmniThinkerForConditionalGeneration.embed_input_ids(model, *a, **kw)

@@ -364,6 +364,7 @@ def update_from_output(
         if stopped_preempted_reqs:
             # This is a rare case and unlikely to impact performance.
             self.waiting.remove_requests(stopped_preempted_reqs)
+            self.skipped_waiting.remove_requests(stopped_preempted_reqs)
 
         # [Main] Handle failed KV load requests
         if failed_kv_load_req_ids and not self.recompute_kv_load_failures:

@@ -247,6 +247,15 @@ def schedule(self) -> SchedulerOutput:
         )
 
         total_num_scheduled_tokens = sum(num_scheduled_tokens.values())
+
+        # Record the request ids scheduled in this step (v0.14.0 behavior).
+        self.prev_step_scheduled_req_ids.clear()
+        self.prev_step_scheduled_req_ids.update(num_scheduled_tokens.keys())
+
+        new_block_ids_to_zero = (
+            (self.kv_cache_manager.take_new_block_ids() or None) if self.needs_kv_cache_zeroing else None
+        )
+
         scheduler_output = SchedulerOutput(
             scheduled_new_reqs=new_reqs_data,
             scheduled_cached_reqs=cached_reqs_data,
@@ -258,12 +267,9 @@ def schedule(self) -> SchedulerOutput:
             finished_req_ids=self.finished_req_ids,
             free_encoder_mm_hashes=self.encoder_cache_manager.get_freed_mm_hashes(),
             preempted_req_ids=set(),
+            new_block_ids_to_zero=new_block_ids_to_zero,
         )
 
-        # Record the request ids scheduled in this step (v0.14.0 behavior).
-        self.prev_step_scheduled_req_ids.clear()
-        self.prev_step_scheduled_req_ids.update(num_scheduled_tokens.keys())
-
         # KVTransfer: package metadata
         if self.connector is not None:
             meta = self.connector.build_connector_meta(scheduler_output)
@@ -496,6 +502,7 @@ def update_from_output(
         if stopped_preempted_reqs:
             # This is a rare case and unlikely to impact performance.
             self.waiting.remove_requests(stopped_preempted_reqs)
+            self.skipped_waiting.remove_requests(stopped_preempted_reqs)
 
         # Handle failed KV load requests
         if failed_kv_load_req_ids and not self.recompute_kv_load_failures:

@@ -31,14 +31,15 @@ class DiffusionQuantizationConfig(ABC):
     # The underlying vLLM config instance
     _vllm_config: "QuantizationConfig | None" = None
 
-    def get_name(self) -> str:
+    @classmethod
+    def get_name(cls) -> str:
         """Return the quantization method name (e.g., 'fp8', 'int8').
 
-        By default, delegates to the underlying vLLM config instance.
+        Delegates to the underlying vLLM config class's get_name().
         """
-        if self._vllm_config is not None:
-            return self._vllm_config.get_name()
-        raise NotImplementedError("Subclass must initialize _vllm_config or override get_name().")
+        if cls.quant_config_cls is not None:
+            return cls.quant_config_cls.get_name()
+        raise NotImplementedError("Subclass must set quant_config_cls or override get_name().")
 
     def get_vllm_quant_config(self) -> "QuantizationConfig | None":
         """Return the underlying vLLM QuantizationConfig for linear layers."""

@@ -8,6 +8,7 @@
 
 from __future__ import annotations
 
+import importlib
 import logging
 import sys
 from dataclasses import dataclass
@@ -30,30 +31,23 @@ class PatchedRecvReqMeta:
 
 def _import_mooncake_module():
     """Import MooncakeConnector module, supporting both vLLM >=0.16 and older."""
-    try:
-        from vllm.distributed.kv_transfer.kv_connector.v1.mooncake import mooncake_connector
-
-        return mooncake_connector
-    except ImportError:
-        pass
-    try:
-        from vllm.distributed.kv_transfer.kv_connector.v1 import mooncake_connector
-
-        return mooncake_connector
-    except ImportError:
-        return None
+    for mod_path in (
+        "vllm.distributed.kv_transfer.kv_connector.v1.mooncake.mooncake_connector",
+        "vllm.distributed.kv_transfer.kv_connector.v1.mooncake_connector",
+    ):
+        try:
+            return importlib.import_module(mod_path)
+        except (ImportError, ModuleNotFoundError):
+            continue
+    return None
 
 
 def _create_patched_mooncake_connector():
     """Return a subclass of MooncakeConnector with remote_request_id support."""
-    try:
-        from vllm.distributed.kv_transfer.kv_connector.v1.mooncake.mooncake_connector import (
-            MooncakeConnector as _OriginalMooncakeConnector,
-        )
-    except (ImportError, AttributeError):
-        from vllm.distributed.kv_transfer.kv_connector.v1.mooncake_connector import (
-            MooncakeConnector as _OriginalMooncakeConnector,
-        )
+    _mc_mod = _import_mooncake_module()
+    if _mc_mod is None:
+        raise ImportError("Cannot import MooncakeConnector from upstream vLLM")
+    _OriginalMooncakeConnector = _mc_mod.MooncakeConnector
 
     class PatchedMooncakeConnector(_OriginalMooncakeConnector):
         """Fixes request-ID mismatch in PD disaggregation by injecting