vllm-project · hsliuustc0106 · Apr 25, 2026 · Apr 22, 2026 · Apr 25, 2026 · hsliuustc0106
@@ -19,14 +19,17 @@ The new deploy schema lives under `vllm_omni/deploy/` and is paired with a froze
 | `platforms` | dict | optional | `null` | Keyed by `npu` / `rocm` / `xpu`, each contains a `stages:` list with per-platform overrides applied on top of the CUDA defaults. |
 | `pipeline` | str | optional | `null` | Override the auto-detected pipeline registry key (used for structural variants like `qwen2_5_omni_thinker_only`). |
 | `trust_remote_code` | bool | optional | `true` | **Pipeline-wide.** Trust HF remote code on model load; applies to every stage. |
-| `distributed_executor_backend` | str | optional | `"mp"` | **Pipeline-wide.** Executor backend (`"mp"` or `"ray"`). |
+| `distributed_executor_backend` | str \| null | optional | `null` | **Pipeline-wide.** Distributed executor backend forwarded to vLLM (`"mp"`, `"ray"`, or `"external_launcher"`). If omitted (`null`), vLLM auto-selects the backend based on the runtime topology. |
 | `dtype` | str \| null | optional | `null` | **Pipeline-wide.** Model dtype for every stage. |
 | `quantization` | str \| null | optional | `null` | **Pipeline-wide.** Quantization method for every stage. |
 | `enable_prefix_caching` | bool | optional | `false` | **Pipeline-wide.** Prefix cache toggle applied to every stage. |
 | `enable_chunked_prefill` | bool \| null | optional | `null` | **Pipeline-wide.** Chunked prefill toggle applied to every stage. |
 | `data_parallel_size` | int | optional | `1` | **Pipeline-wide.** DP degree for every stage. |
 | `pipeline_parallel_size` | int | optional | `1` | **Pipeline-wide.** PP degree for every stage. |
 
+Note: for the diffusion path, `distributed_executor_backend` currently falls back to
+`mp` when omitted, and `ray` / `external_launcher` are not fully supported yet.
+
 ### Stage fields
 
 Each entry under `stages:` accepts any `StageDeployConfig` field directly (no nested `engine_args:`). Only fields whose value legitimately varies across stages live here; pipeline-wide settings (trust_remote_code, distributed_executor_backend, dtype, quantization, prefix/chunked prefill, DP/PP sizes) are declared at the top level and applied to every stage. Unknown keys fall through to `engine_extras:` and are forwarded to the engine.

@@ -340,7 +340,7 @@ def build_arg_parser() -> argparse.ArgumentParser:
     p.add_argument(
         "--max-seed-tts-mean-wer",
         type=float,
-        default=0.02,
+        default=0.5,
         help="If set, fail when seed_tts_content_error_mean is strictly above this value.",
     )
     p.add_argument(

@@ -439,7 +439,7 @@ class DeployConfig:
 
     # === Pipeline-wide engine settings (applied uniformly to every stage) ===
     trust_remote_code: bool = True
-    distributed_executor_backend: str = "mp"
+    distributed_executor_backend: str | None = None
     dtype: str | None = None
     quantization: str | None = None
     enable_prefix_caching: bool = False

@@ -22,6 +22,10 @@ class DiffusionExecutor(ABC):
     def get_class(od_config: OmniDiffusionConfig) -> type[DiffusionExecutor]:
         executor_class: type[DiffusionExecutor]
         distributed_executor_backend = od_config.distributed_executor_backend
+        # Keep backward-compatible behavior for callers/configs that omit this
+        # field and rely on the historical diffusion default backend.
+        if distributed_executor_backend is None:
+            distributed_executor_backend = "mp"
 
         if isinstance(distributed_executor_backend, type):
             if not issubclass(distributed_executor_backend, DiffusionExecutor):

@@ -7,7 +7,6 @@
 import types
 import weakref
 from collections.abc import Sequence
-from pprint import pformat
 from typing import TYPE_CHECKING, Any, Literal
 
 import huggingface_hub
@@ -250,8 +249,6 @@ def _log_summary_and_cleanup(self, request_id: str) -> None:
         try:
             if req_state is None or req_state.metrics is None:
                 return
-            summary = req_state.metrics.build_and_log_summary()
-            logger.info("[Summary] %s", pformat(summary, sort_dicts=False))
         except Exception:
             logger.exception(
                 "[%s] Failed to build/log summary for req=%s",